From 3de1415b1d1aaf1adae5b188c7db0156284b872f Mon Sep 17 00:00:00 2001
From: schickling-assistant <261620128+schickling-assistant@users.noreply.github.com>
Date: Thu, 14 May 2026 10:23:12 +0200
Subject: [PATCH 01/81] Harden CI measurement regression gates
---
.github/workflows/ci.yml | 360 ++++++++-------
.github/workflows/ci.yml.genie.ts | 12 +-
.../ci-measurement-comparison.test.sh | 105 +++++
genie/ci-workflow/measurements.ts | 417 ++++++++++--------
.../ci-workflow-helpers.unit.test.ts | 26 +-
5 files changed, 547 insertions(+), 373 deletions(-)
create mode 100755 genie/ci-scripts/ci-measurement-comparison.test.sh
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 059f88ce0..d3ace3a1d 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2110,6 +2110,7 @@ jobs:
- name: pnpm regression suite
run: |
bash genie/ci-scripts/nix-gc-race-retry.test.sh
+ bash genie/ci-scripts/ci-measurement-comparison.test.sh
bash nix/workspace-tools/lib/mk-pnpm-cli/tests/run.sh --skip-genie --skip-megarepo --skip-devenv-shell --skip-downstream-megarepo
- name: Save pnpm state
if: ${{ success() && steps.restore-pnpm-state.outputs.cache-hit != 'true' }}
@@ -2185,7 +2186,6 @@ jobs:
GITHUB_TOKEN: ${{ github.token }}
ARTIFACT_DIR: tmp/devenv-perf-ci
OTEL_SERVICE_NAME: devenv-perf-ci
- DEVENV_PERF_REGRESSION_MODE: warn
RUNNER_CLASS: 'namespace-profile-linux-x86-64,namespace-features:github.run-id=${{ github.run_id }}'
steps:
- uses: actions/checkout@v6
@@ -2359,7 +2359,7 @@ jobs:
BASELINE_WORKFLOW_NAME: ${{ github.workflow }}
BASELINE_BRANCH: ${{ github.base_ref || github.ref_name }}
BASELINE_SEED_RUN_IDS: '25710204667'
- BASELINE_MAX_RUNS: '5'
+ BASELINE_MAX_RUNS: '20'
run: |
set -euo pipefail
@@ -2521,6 +2521,7 @@ jobs:
local stdout="$7"
local stderr="$8"
local trace="$9"
+ local gate_policy="${10}"
local samples_file="$ARTIFACT_DIR/$id.samples.json"
if [ "$first" -eq 0 ]; then
@@ -2535,13 +2536,15 @@ jobs:
--arg group "$group" \
--arg description "$description" \
--argjson status "$status" \
- --argjson durationMs "$duration_ms" \
- --arg stdout "$stdout" \
- --arg stderr "$stderr" \
- --arg trace "$trace" \
- '($samples[0] // []) as $sampleList
- | ($sampleList | map(select(.status == 0) | .durationMs)) as $successfulDurations
- | {
+ --argjson durationMs "$duration_ms" \
+ --arg stdout "$stdout" \
+ --arg stderr "$stderr" \
+ --arg trace "$trace" \
+ --argjson gatePolicy "$gate_policy" \
+ '($samples[0] // []) as $sampleList
+ | ($sampleList | map(select(.phase != "warmup" and .status == 0) | .durationMs)) as $successfulDurations
+ | ($sampleList | map(select(.phase == "warmup"))) as $warmupSamples
+ | {
id:$id,
name:$id,
label:$label,
@@ -2551,9 +2554,12 @@ jobs:
durationMs:$durationMs,
stdout:$stdout,
stderr:$stderr,
- trace:(if $trace == "" then null else $trace end),
- statistics: {
+ trace:(if $trace == "" then null else $trace end),
+ gatePolicy:$gatePolicy,
+ statistics: {
sampleCount: ($sampleList | length),
+ warmupCount: ($warmupSamples | length),
+ measuredSampleCount: (($sampleList | length) - ($warmupSamples | length)),
successfulSampleCount: ($successfulDurations | length),
minDurationMs: ($successfulDurations | min),
maxDurationMs: ($successfulDurations | max),
@@ -2568,10 +2574,12 @@ jobs:
local id="$1"
local label="$2"
local group="$3"
- local description="$4"
- local trace_file="$5"
- local repetitions="$6"
- shift 6
+ local description="$4"
+ local trace_file="$5"
+ local warmup_repetitions="$6"
+ local repetitions="$7"
+ local gate_policy="$8"
+ shift 8
case "$trace_file" in
'$ARTIFACT_DIR'*) trace_file="${ARTIFACT_DIR}${trace_file#'$ARTIFACT_DIR'}" ;;
esac
@@ -2584,11 +2592,22 @@ jobs:
if ! [[ "$repetitions" =~ ^[0-9]+$ ]] || [ "$repetitions" -lt 1 ]; then
repetitions=1
fi
+ if ! [[ "$warmup_repetitions" =~ ^[0-9]+$ ]] || [ "$warmup_repetitions" -lt 0 ]; then
+ warmup_repetitions=0
+ fi
printf '[' >"$samples_file"
local sample_first=1
- local sample_index sample_stdout sample_stderr sample_trace expanded
- for sample_index in $(seq 1 "$repetitions"); do
+ local sample_index measured_index total_repetitions phase sample_stdout sample_stderr sample_trace expanded
+ total_repetitions=$((warmup_repetitions + repetitions))
+ for sample_index in $(seq 1 "$total_repetitions"); do
+ if [ "$sample_index" -le "$warmup_repetitions" ]; then
+ phase="warmup"
+ measured_index=""
+ else
+ phase="measured"
+ measured_index=$((sample_index - warmup_repetitions))
+ fi
sample_stdout="$ARTIFACT_DIR/$id.$sample_index.stdout"
sample_stderr="$ARTIFACT_DIR/$id.$sample_index.stderr"
sample_trace=""
@@ -2623,12 +2642,14 @@ jobs:
sample_first=0
jq -cn \
--argjson index "$sample_index" \
+ --arg measuredIndex "$measured_index" \
+ --arg phase "$phase" \
--argjson status "$status" \
--argjson durationMs "$duration_ms" \
--arg stdout "$sample_stdout" \
--arg stderr "$sample_stderr" \
--arg trace "$sample_trace" \
- '{index:$index,status:$status,durationMs:$durationMs,stdout:$stdout,stderr:$stderr,trace:(if $trace == "" then null else $trace end)}' \
+ '{index:$index,measuredIndex:(if $measuredIndex == "" then null else ($measuredIndex | tonumber) end),phase:$phase,status:$status,durationMs:$durationMs,stdout:$stdout,stderr:$stderr,trace:(if $trace == "" then null else $trace end)}' \
>>"$samples_file"
stdout="$sample_stdout"
@@ -2642,12 +2663,12 @@ jobs:
printf ']\n' >>"$samples_file"
status="$(jq -r 'map(.status) | max // 0' "$samples_file")"
- duration_ms="$(jq -r 'map(select(.status == 0) | .durationMs) as $values | if ($values | length) == 0 then (map(.durationMs) | max // 0) else ($values | sort | .[(length - 1) / 2 | floor]) end' "$samples_file")"
+ duration_ms="$(jq -r 'map(select(.phase != "warmup" and .status == 0) | .durationMs) as $values | if ($values | length) == 0 then (map(.durationMs) | max // 0) else ($values | sort | .[(length - 1) / 2 | floor]) end' "$samples_file")"
cp "$stdout" "$ARTIFACT_DIR/$id.stdout" 2>/dev/null || true
cp "$stderr" "$ARTIFACT_DIR/$id.stderr" 2>/dev/null || true
- json_append_timing "$id" "$label" "$group" "$description" "$status" "$duration_ms" "$ARTIFACT_DIR/$id.stdout" "$ARTIFACT_DIR/$id.stderr" "$trace_file"
+ json_append_timing "$id" "$label" "$group" "$description" "$status" "$duration_ms" "$ARTIFACT_DIR/$id.stdout" "$ARTIFACT_DIR/$id.stderr" "$trace_file" "$gate_policy"
if [ "$status" -ne 0 ]; then
echo "::error::$id failed after ${duration_ms}ms; stderr tail follows"
@@ -2656,14 +2677,14 @@ jobs:
fi
}
- measure 'shell_eval_traced' 'Shell eval with OTEL trace' 'devenv shell' 'Evaluates the dev shell with native devenv JSON tracing enabled.' '$ARTIFACT_DIR/traces/shell_eval_traced.json' '1' '$DEVENV_BIN' '--trace-to' 'json:file:$trace_file' 'shell' '--no-reload' '--' 'true'
- measure 'shell_eval_warm' 'Warm shell eval' 'devenv shell' 'Evaluates a warm dev shell without reloading direnv state.' '' '3' '$DEVENV_BIN' 'shell' '--no-reload' '--' 'true'
- measure 'tasks_list' 'devenv tasks list' 'devenv cli' 'Lists devenv tasks to measure task graph loading overhead.' '' '5' '$DEVENV_BIN' 'tasks' 'list'
- measure 'processes_help' 'devenv processes --help' 'devenv cli' 'Loads the devenv processes command help path.' '' '5' '$DEVENV_BIN' 'processes' '--help'
- measure 'task_pnpm_install' 'pnpm install task' 'workspace setup' 'Runs the cached pnpm install devenv task.' '' '1' '$DEVENV_BIN' 'tasks' 'run' 'pnpm:install' '--mode' 'before' '--no-tui' '--show-output'
- measure 'task_genie_run' 'Genie run task' 'genie' 'Runs the normal devenv genie:run task including its declared dependencies.' '' '1' '$DEVENV_BIN' 'tasks' 'run' 'genie:run' '--mode' 'before' '--no-tui' '--show-output'
- measure 'task_check_quick' 'Quick check task' 'quality gates' 'Runs the fast local quality gate through devenv.' '' '1' '$DEVENV_BIN' 'tasks' 'run' 'check:quick' '--mode' 'before' '--no-tui' '--show-output'
- measure 'genie_check_direct' 'Genie check direct' 'genie' 'Runs Genie directly in check mode to isolate generator runtime from devenv task dependency overhead.' '' '3' '$DEVENV_BIN' 'shell' '--no-reload' '--' 'bun' 'packages/@overeng/genie/bin/genie.tsx' '--output' 'ci-plain' '--check'
+ measure 'shell_eval_traced' 'Shell eval with OTEL trace' 'devenv shell' 'Evaluates the dev shell with native devenv JSON tracing enabled.' '$ARTIFACT_DIR/traces/shell_eval_traced.json' '0' '1' '{"enabled":false,"minBaselineSources":10,"minCurrentSamples":3,"warnRatio":1.25,"failRatio":1.5,"warnAbs":1.5,"failAbs":3,"noiseFloor":0.5}' '$DEVENV_BIN' '--trace-to' 'json:file:$trace_file' 'shell' '--no-reload' '--' 'true'
+ measure 'shell_eval_warm' 'Warm shell eval' 'devenv shell' 'Evaluates a warm dev shell without reloading direnv state.' '' '1' '5' '{"enabled":true,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":0.5,"noiseFloor":0.1}' '$DEVENV_BIN' 'shell' '--no-reload' '--' 'true'
+ measure 'tasks_list' 'devenv tasks list' 'devenv cli' 'Lists devenv tasks to measure task graph loading overhead.' '' '1' '9' '{"enabled":true,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":2,"failRatio":3,"warnAbs":0.25,"failAbs":1,"noiseFloor":0.1}' '$DEVENV_BIN' 'tasks' 'list'
+ measure 'processes_help' 'devenv processes --help' 'devenv cli' 'Loads the devenv processes command help path.' '' '1' '9' '{"enabled":true,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":2,"failRatio":3,"warnAbs":0.25,"failAbs":1,"noiseFloor":0.1}' '$DEVENV_BIN' 'processes' '--help'
+ measure 'task_pnpm_install' 'pnpm install task' 'workspace setup' 'Runs the cached pnpm install devenv task.' '' '1' '5' '{"enabled":true,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":0.5,"noiseFloor":0.1}' '$DEVENV_BIN' 'tasks' 'run' 'pnpm:install' '--mode' 'before' '--no-tui' '--show-output'
+ measure 'task_genie_run' 'Genie run task' 'genie' 'Runs the normal devenv genie:run task including its declared dependencies.' '' '1' '5' '{"enabled":true,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":0.5,"noiseFloor":0.1}' '$DEVENV_BIN' 'tasks' 'run' 'genie:run' '--mode' 'before' '--no-tui' '--show-output'
+ measure 'task_check_quick' 'Quick check task' 'quality gates' 'Runs the fast local quality gate through devenv.' '' '1' '5' '{"enabled":true,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":0.5,"noiseFloor":0.1}' '$DEVENV_BIN' 'tasks' 'run' 'check:quick' '--mode' 'before' '--no-tui' '--show-output'
+ measure 'genie_check_direct' 'Genie check direct' 'genie' 'Runs Genie directly in check mode to isolate generator runtime from devenv task dependency overhead.' '' '1' '5' '{"enabled":true,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":0.5,"noiseFloor":0.1}' '$DEVENV_BIN' 'shell' '--no-reload' '--' 'bun' 'packages/@overeng/genie/bin/genie.tsx' '--output' 'ci-plain' '--check'
printf ']\n' >>"$ARTIFACT_DIR/timings.json"
@@ -2717,7 +2738,11 @@ jobs:
'{
schemaVersion: $schemaVersion,
generatedAt: $generatedAt,
- producer: { name: "effect-utils-ci-measurement", version: 1 },
+ producer: {
+ name: "effect-utils-ci-measurement",
+ version: 2,
+ measurementProtocol: "devenv-perf-warm-median-v2"
+ },
subject: {
repo: $repository,
branchKind: (if $branchKind == "" then "unknown" else $branchKind end),
@@ -2745,9 +2770,12 @@ jobs:
group: .group,
name: ("devenv." + .id + ".duration"),
unit: "seconds",
- value: (.durationMs / 1000),
- statistics: {
+ value: (.durationMs / 1000),
+ policy: .gatePolicy,
+ statistics: {
sampleCount: (.statistics.sampleCount // 1),
+ warmupCount: (.statistics.warmupCount // 0),
+ measuredSampleCount: (.statistics.measuredSampleCount // (.statistics.sampleCount // 1)),
successfulSampleCount: (.statistics.successfulSampleCount // (if .status == 0 then 1 else 0 end)),
min: ((.statistics.minDurationMs // .durationMs) / 1000),
max: ((.statistics.maxDurationMs // .durationMs) / 1000),
@@ -2758,6 +2786,11 @@ jobs:
probeLabel: .label,
status: .status,
sampleCount: (.statistics.sampleCount // 1),
+ warmupCount: (.statistics.warmupCount // 0),
+ measuredSampleCount: (.statistics.measuredSampleCount // (.statistics.sampleCount // 1)),
+ measurementProtocol: "devenv-perf-warm-median-v2",
+ aggregation: "median",
+ phase: "warm",
devenvRev: $devenvRev,
otelServiceName: $otelServiceName
}
@@ -2778,109 +2811,6 @@ jobs:
}
}' >"$ARTIFACT_DIR/measurements.json"
- compare_baseline() {
- local baseline_path="${DEVENV_PERF_BASELINE_SUMMARY:-$ARTIFACT_DIR/baseline/summary.json}"
- local mode="${DEVENV_PERF_REGRESSION_MODE:-warn}"
-
- if [ "$mode" = "off" ]; then
- jq -n --argjson schemaVersion 1 --arg status skipped --arg mode "$mode" '{schemaVersion:$schemaVersion, status:$status, mode:$mode, checks:{}}' >"$ARTIFACT_DIR/perf-comparison.json"
- return 0
- fi
-
- if [ ! -f "$baseline_path" ]; then
- jq -n \
- --argjson schemaVersion 1 \
- --arg status baseline_missing \
- --arg mode "$mode" \
- --arg baseline "$baseline_path" \
- '{schemaVersion:$schemaVersion, status:$status, mode:$mode, baseline:$baseline, checks:{}}' \
- >"$ARTIFACT_DIR/perf-comparison.json"
- echo "::notice::devenv perf baseline not found at $baseline_path; recorded current measurements only"
- return 0
- fi
-
- jq -n \
- --slurpfile current "$ARTIFACT_DIR/summary.json" \
- --slurpfile baseline "$baseline_path" \
- --argjson schemaVersion 1 \
- --arg mode "$mode" \
- --arg baselinePath "$baseline_path" \
- '
- def budget($name):
- if $name == "shell_eval_traced" then
- {warnRatio:1.25, failRatio:1.5, warnMs:1500, failMs:3000}
- elif $name == "shell_eval_warm" then
- {warnRatio:1.5, failRatio:2.0, warnMs:500, failMs:1000}
- elif $name == "tasks_list" or $name == "processes_help" then
- {warnRatio:2.0, failRatio:3.0, warnMs:250, failMs:1000}
- else
- {warnRatio:1.5, failRatio:2.0, warnMs:1000, failMs:3000}
- end;
- def classify($name; $current; $baseline):
- budget($name) as $b
- | ($current - $baseline) as $delta
- | (if $baseline > 0 then ($current / $baseline) else null end) as $ratio
- | (
- if $baseline <= 0 then "unknown"
- elif ($delta > $b.failMs and $current > ($baseline * $b.failRatio)) then "fail"
- elif ($delta > $b.warnMs and $current > ($baseline * $b.warnRatio)) then "warn"
- else "pass"
- end
- ) as $status
- | {status:$status, currentMs:$current, baselineMs:$baseline, deltaMs:$delta, ratio:$ratio, budget:$b};
- ($current[0].checks // {}) as $currentChecks
- | ($baseline[0].checks // {}) as $baselineChecks
- | (
- $currentChecks
- | to_entries
- | map(
- .key as $name
- | .value as $current
- | ($baselineChecks[$name] // null) as $base
- | {
- key: $name,
- value: (
- if $base == null then
- {status:"missing_baseline", currentMs:$current.durationMs}
- elif ($current.status != 0) then
- {status:"current_failed", currentMs:$current.durationMs, baselineMs:$base.durationMs}
- elif ($base.status != 0) then
- {status:"baseline_failed", currentMs:$current.durationMs, baselineMs:$base.durationMs}
- else
- classify($name; $current.durationMs; $base.durationMs)
- end
- )
- }
- )
- | from_entries
- ) as $checks
- | (
- if any($checks[]; .status == "fail") then "fail"
- elif any($checks[]; .status == "warn") then "warn"
- elif any($checks[]; .status == "missing_baseline") then "partial"
- else "pass"
- end
- ) as $status
- | {schemaVersion:$schemaVersion, status:$status, mode:$mode, baseline:$baselinePath, checks:$checks}
- ' >"$ARTIFACT_DIR/perf-comparison.json"
-
- local status
- status="$(jq -r '.status' "$ARTIFACT_DIR/perf-comparison.json")"
- case "$status:$mode" in
- fail:fail)
- echo "::error::devenv perf regression detected"
- jq . "$ARTIFACT_DIR/perf-comparison.json"
- return 1
- ;;
- fail:*|warn:*)
- echo "::warning::devenv perf regression threshold exceeded"
- jq . "$ARTIFACT_DIR/perf-comparison.json"
- ;;
- esac
- }
-
- compare_baseline
-
if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then
{
echo "### Devenv perf"
@@ -2891,12 +2821,6 @@ jobs:
echo ""
echo "- Artifact directory: \`$ARTIFACT_DIR\`"
echo "- OTEL service: \`${OTEL_SERVICE_NAME:-unknown}\`"
- echo ""
- echo "#### Regression comparison"
- echo ""
- if [ -f "$ARTIFACT_DIR/perf-comparison.json" ]; then
- jq -r '["- Status: " + .status, "- Mode: " + .mode, "- Baseline: " + (.baseline // "none")] | .[]' "$ARTIFACT_DIR/perf-comparison.json"
- fi
} >>"$GITHUB_STEP_SUMMARY"
fi
@@ -2908,7 +2832,7 @@ jobs:
CI_MEASUREMENT_CURRENT_DIR: tmp/devenv-perf-ci
CI_MEASUREMENT_BASELINE_DIR: tmp/devenv-perf-ci/baseline
CI_MEASUREMENT_COMPARISON_FILE: tmp/devenv-perf-ci/measurement-comparison.json
- CI_MEASUREMENT_REGRESSION_MODE: warn
+ CI_MEASUREMENT_REGRESSION_MODE: fail
CI_MEASUREMENT_PR_COMMENT_ENABLED: 'true'
CI_MEASUREMENT_PR_COMMENT_TITLE: Devenv Performance
CI_MEASUREMENT_PR_COMMENT_MAX_ROWS: '8'
@@ -2935,7 +2859,10 @@ jobs:
current_index="$(mktemp)"
baseline_index="$(mktemp)"
find "$current_dir" -path "$baseline_dir" -prune -o -name measurements.json -type f -print | sort >"$current_index" || true
- find "$baseline_dir" -name measurements.json -type f -print | sort >"$baseline_index" || true
+ {
+ find "$baseline_dir" -maxdepth 1 -name measurements.json -type f -print
+ find "$baseline_dir" -mindepth 2 -maxdepth 2 -name measurements.json -type f -print
+ } | sort -u >"$baseline_index" || true
if [ ! -s "$current_index" ]; then
echo "::error::no current measurements.json files found under $current_dir"
@@ -2962,7 +2889,7 @@ jobs:
def identity_dimensions:
(.dimensions // {})
| to_entries
- | map(select(.key as $key | ["devenvRev", "otelServiceName", "status", "probeLabel", "sampleCount"] | index($key) | not))
+ | map(select(.key as $key | ["devenvRev", "otelServiceName", "status", "probeLabel", "sampleCount", "measuredSampleCount"] | index($key) | not))
| sort_by(.key)
| map("\(.key)=\(.value|tostring)")
| join(",");
@@ -2985,6 +2912,13 @@ jobs:
else (($sorted[($count / 2 - 1)] + $sorted[($count / 2)]) / 2)
end;
+ def percentile($p):
+ sort as $sorted
+ | ($sorted | length) as $count
+ | if $count == 0 then null
+ else $sorted[(($p * ($count - 1)) | floor)]
+ end;
+
def observations_by_key($docs):
reduce $docs[]? as $doc
({};
@@ -3001,13 +2935,14 @@ jobs:
def observation_stats($items):
($items | map(.observation.value)) as $values
- | ($items | map(.observation.statistics.sampleCount // 1) | add // ($items | length)) as $sampleCount
+ | ($items | map(.observation.statistics.measuredSampleCount // .observation.statistics.sampleCount // 1) | add // ($items | length)) as $sampleCount
| {
target: ($items[0].target // {}),
observation: ($items[-1].observation // {}),
value: ($values | median),
min: ($values | min),
max: ($values | max),
+ p95: ($values | percentile(0.95)),
sourceCount: ($items | length),
sampleCount: $sampleCount,
generatedAt: ($items[-1].generatedAt // null)
@@ -3021,7 +2956,7 @@ jobs:
elif $metric == "nix.closure.path_count" then
{warnRatio:1.10, failRatio:1.25, warnAbs:100, failAbs:500}
elif $unit == "seconds" then
- {warnRatio:1.25, failRatio:1.50, warnAbs:1.5, failAbs:3.0}
+ {warnRatio:1.10, failRatio:1.20, warnAbs:0.25, failAbs:0.5}
else
{warnRatio:1.25, failRatio:1.50, warnAbs:1, failAbs:3}
end;
@@ -3032,11 +2967,24 @@ jobs:
elif $unit == "seconds" then 0.1
else 0
end;
- def abs_value: if . < 0 then -. else . end;
-
- def classify($metric; $unit; $current; $baseline; $baselineMin; $baselineMax; $currentSamples; $baselineSources):
+ def default_policy($metric; $unit):
budget($metric; $unit) as $b
| noise_floor($metric; $unit) as $noise
+ | $b + {
+ enabled:true,
+ minBaselineSources:(if $metric == "nix.closure.nar_size" or $metric == "nix.closure.bucket.nar_size" or $metric == "nix.closure.path_count" then 3 else 10 end),
+ minCurrentSamples:(if $unit == "seconds" then 3 else 1 end),
+ noiseFloor:$noise
+ };
+ def observation_policy($obs):
+ default_policy($obs.name // "unknown"; $obs.unit // "unknown") + ($obs.policy // {});
+ def policy_enabled($policy):
+ if ($policy | has("enabled")) then $policy.enabled else true end;
+ def abs_value: if . < 0 then -. else . end;
+
+ def classify($metric; $unit; $policy; $current; $baseline; $baselineMin; $baselineMax; $baselineP95; $currentSamples; $baselineSources):
+ $policy as $b
+ | ($policy.noiseFloor // noise_floor($metric; $unit)) as $noise
| ($current - $baseline) as $delta
| (if $baseline > 0 then ($current / $baseline) else null end) as $ratio
| (
@@ -3052,17 +3000,34 @@ jobs:
else "pass"
end
) as $thresholdStatus
+ | (
+ policy_enabled($policy) == true
+ and $baseline > 0
+ and $baselineSources >= ($policy.minBaselineSources // 1)
+ and $currentSamples >= ($policy.minCurrentSamples // 1)
+ ) as $gateable
+ | (
+ if (policy_enabled($policy) != true) then "disabled"
+ elif $baseline <= 0 then "missing_baseline"
+ elif $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
+ elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count"
+ else "eligible"
+ end
+ ) as $gateReason
| (
if $baseline <= 0 then "unknown"
+ elif (policy_enabled($policy) != true) then "diagnostic"
elif ($delta | abs_value) <= $noise then "noise_floor"
elif ($withinBaselineRange and $thresholdStatus == "pass") then "within_baseline_range"
- elif ($baselineSources < 3 or $currentSamples < 3) then "low_sample_count"
+ elif $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
+ elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count"
elif $thresholdStatus == "pass" then "within_budget"
+ elif ($baselineP95 != null and $current <= $baselineP95) then "within_baseline_distribution"
else "threshold_exceeded"
end
) as $confidence
| (
- if $confidence == "threshold_exceeded" then $thresholdStatus
+ if ($gateable and $confidence == "threshold_exceeded") then $thresholdStatus
elif $thresholdStatus == "unknown" then "unknown"
else "pass"
end
@@ -3075,7 +3040,7 @@ jobs:
else "regressed"
end
) as $direction
- | {status:$status,current:$current,baseline:$baseline,delta:$delta,ratio:$ratio,budget:$b,confidence:$confidence,direction:$direction};
+ | {status:$status,current:$current,baseline:$baseline,delta:$delta,ratio:$ratio,budget:$b,gatePolicy:$policy,gateable:$gateable,gateReason:$gateReason,confidence:$confidence,direction:$direction};
(observations_by_key($current[0]) | with_entries(.value = observation_stats(.value))) as $currentObs
| (observations_by_key($baseline[0]) | with_entries(.value = observation_stats(.value))) as $baselineObs
@@ -3094,30 +3059,36 @@ jobs:
status: "missing_baseline",
target: $currentValue.target,
observation: $currentValue.observation,
- current: $currentValue.value,
- currentSamples: $currentValue.sampleCount,
- baselineSources: 0,
- confidence: "missing_baseline",
- direction: "unknown"
- }
- else
- classify(
- $currentValue.observation.name;
- $currentValue.observation.unit;
- $currentValue.value;
- $baselineValue.value;
- $baselineValue.min;
- $baselineValue.max;
- $currentValue.sampleCount;
- $baselineValue.sourceCount
- ) + {
+ current: $currentValue.value,
+ currentSamples: $currentValue.sampleCount,
+ baselineSources: 0,
+ gatePolicy: ($currentValue.observation | observation_policy(.)),
+ gateable: false,
+ gateReason: "missing_baseline",
+ confidence: "missing_baseline",
+ direction: "unknown"
+ }
+ else
+ classify(
+ $currentValue.observation.name;
+ $currentValue.observation.unit;
+ ($currentValue.observation | observation_policy(.));
+ $currentValue.value;
+ $baselineValue.value;
+ $baselineValue.min;
+ $baselineValue.max;
+ $baselineValue.p95;
+ $currentValue.sampleCount;
+ $baselineValue.sourceCount
+ ) + {
target: $currentValue.target,
observation: $currentValue.observation,
- currentSamples: $currentValue.sampleCount,
- baselineSources: $baselineValue.sourceCount,
- baselineMin: $baselineValue.min,
- baselineMax: $baselineValue.max
- }
+ currentSamples: $currentValue.sampleCount,
+ baselineSources: $baselineValue.sourceCount,
+ baselineMin: $baselineValue.min,
+ baselineMax: $baselineValue.max,
+ baselineP95: $baselineValue.p95
+ }
end
)
}
@@ -3127,7 +3098,7 @@ jobs:
| (
if any($comparisons[]?; .status == "fail") then "fail"
elif any($comparisons[]?; .status == "warn") then "warn"
- elif any($comparisons[]?; .status == "missing_baseline") then "partial"
+ elif any($comparisons[]?; .status == "missing_baseline" and (if (.gatePolicy | has("enabled")) then .gatePolicy.enabled else true end)) then "partial"
else "pass"
end
) as $status
@@ -3171,8 +3142,8 @@ jobs:
echo ""
jq -r '"- Status: " + .status + "\n- Mode: " + .mode + "\n- Baseline: " + .baselineDir' "$comparison_file"
echo ""
- echo "| Status | Target | Observation | Current | Baseline | Delta | Ratio |"
- echo "| --- | --- | --- | ---: | ---: | ---: | ---: |"
+ echo "| Status | Gate | Target | Observation | Current | Baseline | Delta | Ratio |"
+ echo "| --- | --- | --- | --- | ---: | ---: | ---: | ---: |"
jq -r '
.comparisons
| to_entries
@@ -3186,9 +3157,10 @@ jobs:
| .[:20]
| .[]
| .value as $v
- | [
- $v.status,
- (($v.target.kind // "unknown") + "/" + ($v.target.name // "unknown") + "/" + ($v.target.system // "unknown")),
+ | [
+ $v.status,
+ (if ($v.gateable // false) then "yes" else ($v.gateReason // "no") end),
+ (($v.target.kind // "unknown") + "/" + ($v.target.name // "unknown") + "/" + ($v.target.system // "unknown")),
($v.observation.name // "unknown"),
(($v.current // $v.observation.value // 0) | tostring),
(($v.baseline // "") | tostring),
@@ -3332,16 +3304,25 @@ jobs:
}
const formatResult = (row) => {
- if (row.confidence === 'low_sample_count') return 'gray needs repeat'
+ if (row.confidence === 'low_baseline_count') return 'gray needs baseline'
+ if (row.confidence === 'low_current_sample_count') return 'gray needs repeat'
+ if (row.confidence === 'diagnostic') return 'gray diagnostic'
if (row.status === 'fail') return 'red regression'
if (row.status === 'warn') return 'yellow regression'
if (row.status === 'missing_baseline') return 'gray no baseline'
if (row.confidence === 'noise_floor') return 'gray noise floor'
if (row.confidence === 'within_baseline_range') return 'gray within range'
+ if (row.confidence === 'within_baseline_distribution') return 'gray within p95'
if (row.direction === 'improved') return 'green improved'
return 'gray unchanged'
}
+ const formatGate = (row) => {
+ if (row.gateable) return 'yes'
+ const reason = row.gateReason || row.confidence || 'unknown'
+ return 'no
' + reason + ''
+ }
+
const escapeCell = (value) => String(value ?? '-').replaceAll('|', '\\|').replaceAll('\n', '
')
const escapeXml = (value) => String(value)
.replaceAll('&', '&')
@@ -3406,6 +3387,14 @@ jobs:
if (byRank !== 0) return byRank
return (right.delta || 0) - (left.delta || 0)
})
+ const protocolLabel = (() => {
+ const protocols = new Set(
+ allRows
+ .map((row) => row.observation?.dimensions?.measurementProtocol)
+ .filter((value) => typeof value === 'string' && value.length > 0),
+ )
+ return protocols.size > 0 ? Array.from(protocols).join(', ') : 'legacy'
+ })()
const visibleLimit = Number.isFinite(maxRows) && maxRows > 0 ? maxRows : 10
const comparableRows = allRows.filter((row) => typeof row.baseline === 'number')
const hasComparableBaseline = comparableRows.length > 0
@@ -3417,8 +3406,8 @@ jobs:
const comparisonTable = (rows) => {
if (rows.length === 0) return 'No measurement regressions detected.'
return [
- '| Probe | Baseline | Current | Change | Result | Confidence |',
- '| --- | ---: | ---: | ---: | --- | --- |',
+ '| Probe | Baseline | Current | Change | Result | Gate | Confidence |',
+ '| --- | ---: | ---: | ---: | --- | --- | --- |',
...rows.map((row) => {
const unit = row.observation?.unit
const baselineRange = typeof row.baselineMin === 'number' && typeof row.baselineMax === 'number' && row.baselineMin !== row.baselineMax
@@ -3430,6 +3419,7 @@ jobs:
formatValue(row.current, unit),
formatDelta(row.delta, unit) + ' / ' + formatRatio(row.ratio),
formatResult(row),
+ formatGate(row),
(row.confidence || 'unknown') + '
baseline n=' + (row.baselineSources ?? 0) + ', current samples=' + (row.currentSamples ?? 1) + '',
].map(escapeCell).join(' | ') + ' |'
}),
@@ -3450,12 +3440,13 @@ jobs:
const allMeasurementsTable = (rows) => {
if (rows.length === 0) return 'No measurement regressions detected.'
return [
- '| Status | Target | Observation | Dimensions | Baseline | Current | Delta | Ratio |',
- '| --- | --- | --- | --- | ---: | ---: | ---: | ---: |',
+ '| Status | Gate | Target | Observation | Dimensions | Baseline | Current | Delta | Ratio |',
+ '| --- | --- | --- | --- | --- | ---: | ---: | ---: | ---: |',
...rows.map((row) => {
const unit = row.observation?.unit
return '| ' + [
row.status,
+ row.gateable ? 'yes' : (row.gateReason || 'no'),
row.target?.label || row.target?.name || 'unknown',
row.observation?.label || row.observation?.name || 'unknown',
dimensions(row),
@@ -3592,6 +3583,7 @@ jobs:
'- Commit: ' + shortSha,
'- Run: ' + runLink,
'- Baseline: ' + baselineLabel,
+ '- Protocol: ' + protocolLabel,
'',
hasComparableBaseline
? 'Chart: performance change versus baseline median. Green is faster, red is slower, gray is within noise or baseline range.'
diff --git a/.github/workflows/ci.yml.genie.ts b/.github/workflows/ci.yml.genie.ts
index 166591fc0..cb13e63d8 100644
--- a/.github/workflows/ci.yml.genie.ts
+++ b/.github/workflows/ci.yml.genie.ts
@@ -255,6 +255,7 @@ const jobs: Record | ReturnType = {
}),
artifactName: 'devenv-perf',
baselineSeedRunIds: ['25710204667'],
+ baselineMaxRuns: 20,
+ regressionMode: 'fail',
setupSteps: baseSteps,
taskProbes: [
{
@@ -279,18 +282,24 @@ const extraJobs: Record = {
label: 'pnpm install task',
group: 'workspace setup',
description: 'Runs the cached pnpm install devenv task.',
+ warmupRepetitions: 1,
+ repetitions: 5,
},
{
task: 'genie:run',
label: 'Genie run task',
group: 'genie',
description: 'Runs the normal devenv genie:run task including its declared dependencies.',
+ warmupRepetitions: 1,
+ repetitions: 5,
},
{
task: 'check:quick',
label: 'Quick check task',
group: 'quality gates',
description: 'Runs the fast local quality gate through devenv.',
+ warmupRepetitions: 1,
+ repetitions: 5,
},
],
probes: [
@@ -300,7 +309,8 @@ const extraJobs: Record = {
group: 'genie',
description:
'Runs Genie directly in check mode to isolate generator runtime from devenv task dependency overhead.',
- repetitions: 3,
+ warmupRepetitions: 1,
+ repetitions: 5,
command: [
'$DEVENV_BIN',
'shell',
diff --git a/genie/ci-scripts/ci-measurement-comparison.test.sh b/genie/ci-scripts/ci-measurement-comparison.test.sh
new file mode 100755
index 000000000..c30aa2069
--- /dev/null
+++ b/genie/ci-scripts/ci-measurement-comparison.test.sh
@@ -0,0 +1,105 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+cd "$ROOT"
+
+tmp_dir="$(mktemp -d)"
+trap 'rm -rf "$tmp_dir"' EXIT
+
+run_bun() { # Run bun with the given arguments: bun from PATH if present, otherwise bun inside "$DEVENV_BIN shell".
+ if command -v bun >/dev/null 2>&1; then
+ bun "$@"
+ elif [ -n "${DEVENV_BIN:-}" ]; then
+ "$DEVENV_BIN" shell --no-reload -- bun "$@"
+ else
+ echo "bun is not available and DEVENV_BIN is not set" >&2
+ return 127 # the exit status shells use for "command not found"
+ fi
+}
+
+emit_compare_script() { # Write the `run` script of compareCiMeasurementsStep (regressionMode 'warn') to $tmp_dir/compare.sh.
+ run_bun -e "import { compareCiMeasurementsStep } from './genie/ci-workflow/measurements.ts'; process.stdout.write(compareCiMeasurementsStep({ currentDir: '$tmp_dir/current', baselineDir: '$tmp_dir/baseline', outputFile: '$tmp_dir/comparison.json', regressionMode: 'warn' }).run)" >"$tmp_dir/compare.sh"
+} # NOTE(review): $tmp_dir is spliced into single-quoted JS literals above; a quote or backslash in TMPDIR would break the generated snippet.
+
+write_measurement() { # Write a single-observation measurements.json fixture to $1, creating its parent directory.
+ local file="$1" # output path of the fixture
+ local value="$2" # numeric observation value; also used as min, max and median
+ local protocol="$3" # "legacy" => producer version 1 and no protocol dimensions; anything else => version 2 with measurementProtocol set to this value
+ local policy="$4" # JSON object stored verbatim as the observation's policy
+ mkdir -p "$(dirname "$file")"
+ jq -n \
+ --argjson value "$value" \
+ --arg protocol "$protocol" \
+ --argjson policy "$policy" \
+ '{
+ schemaVersion: 1,
+ generatedAt: "2026-05-14T00:00:00Z",
+ producer: { name: "test", version: (if $protocol == "legacy" then 1 else 2 end) },
+ target: { kind: "devenv", id: "dev-shell", name: "dev-shell", label: "Dev shell", group: "devenv", system: "Linux" },
+ observations: [
+ {
+ id: "devenv.task.duration",
+ label: "Task",
+ group: "test",
+ name: "devenv.task.duration",
+ unit: "seconds",
+ value: $value,
+ policy: $policy,
+ statistics: { sampleCount: 6, warmupCount: 1, measuredSampleCount: 5, successfulSampleCount: 5, min: $value, max: $value, median: $value },
+ dimensions: (
+ { probe: "task", probeLabel: "Task", status: 0, sampleCount: 6, warmupCount: 1, measuredSampleCount: 5 }
+ + if $protocol == "legacy" then {} else { measurementProtocol: $protocol, aggregation: "median", phase: "warm" } end
+ )
+ }
+ ]
+ }' >"$file"
+}
+
+run_compare() { # Run the rendered compare script against the fixture directories, in warn mode with PR comments disabled.
+ CI_MEASUREMENT_CURRENT_DIR="$tmp_dir/current" \
+ CI_MEASUREMENT_BASELINE_DIR="$tmp_dir/baseline" \
+ CI_MEASUREMENT_COMPARISON_FILE="$tmp_dir/comparison.json" \
+ CI_MEASUREMENT_REGRESSION_MODE=warn \
+ CI_MEASUREMENT_PR_COMMENT_ENABLED=false \
+ bash "$tmp_dir/compare.sh"
+}
+
+policy='{"enabled":true,"minBaselineSources":1,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":0.5,"noiseFloor":0.1}' # gate policy shared by every fixture; a single baseline source is enough
+emit_compare_script
+
+rm -rf "$tmp_dir/current" "$tmp_dir/baseline" # Case 1: only the baseline run's own measurements.json counts as a baseline source
+write_measurement "$tmp_dir/current/measurements.json" 12 legacy "$policy"
+write_measurement "$tmp_dir/baseline/run-1/measurements.json" 10 legacy "$policy"
+write_measurement "$tmp_dir/baseline/run-1/baseline/run-old/measurements.json" 1 legacy "$policy" # nested under another run; must be ignored
+run_compare
+actual_sources="$(jq -r '.comparisons[] | .baselineSources' "$tmp_dir/comparison.json")"
+actual_baseline="$(jq -r '.comparisons[] | .baseline' "$tmp_dir/comparison.json")"
+if [ "$actual_sources" != "1" ] || [ "$actual_baseline" != "10" ]; then
+ echo "expected clean top-level baseline only; got sources=$actual_sources baseline=$actual_baseline" >&2
+ exit 1
+fi
+
+rm -rf "$tmp_dir/current" "$tmp_dir/baseline" # Case 2: current uses the v2 protocol, baseline is legacy => reported as missing_baseline / partial
+write_measurement "$tmp_dir/current/measurements.json" 12 devenv-perf-warm-median-v2 "$policy"
+write_measurement "$tmp_dir/baseline/run-1/measurements.json" 10 legacy "$policy"
+run_compare
+actual_status="$(jq -r '.status' "$tmp_dir/comparison.json")"
+actual_gate="$(jq -r '.comparisons[] | .gateReason' "$tmp_dir/comparison.json")"
+if [ "$actual_status" != "partial" ] || [ "$actual_gate" != "missing_baseline" ]; then
+ echo "expected protocol mismatch to be missing_baseline; got status=$actual_status gate=$actual_gate" >&2
+ exit 1
+fi
+
+rm -rf "$tmp_dir/current" "$tmp_dir/baseline" # Case 3: same protocol on both sides, 13 vs 10 => row and overall status are "fail"
+write_measurement "$tmp_dir/current/measurements.json" 13 devenv-perf-warm-median-v2 "$policy"
+write_measurement "$tmp_dir/baseline/run-1/measurements.json" 10 devenv-perf-warm-median-v2 "$policy"
+run_compare
+actual_status="$(jq -r '.status' "$tmp_dir/comparison.json")"
+actual_row="$(jq -r '.comparisons[] | .status' "$tmp_dir/comparison.json")"
+if [ "$actual_status" != "fail" ] || [ "$actual_row" != "fail" ]; then
+ echo "expected confirmed regression to fail; got status=$actual_status row=$actual_row" >&2
+ exit 1
+fi
+
+echo "ci-measurement-comparison tests passed"
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index 53df6b724..300b621cf 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -20,10 +20,23 @@ export type CiMeasurementDescriptor = {
readonly description?: string
}
+export type CiMeasurementGatePolicy = { // Regression gate settings attached to an observation as its `policy`.
+ readonly enabled?: boolean // false => never gated (gate reason "disabled"); treated as true when absent
+ readonly minBaselineSources?: number // baseline sources required before the gate applies
+ readonly minCurrentSamples?: number // current samples required before the gate applies
+ readonly noiseFloor?: number // absolute deltas at or below this are classified as noise_floor
+ readonly warnRatio?: number // presumably the warn threshold on current/baseline — TODO confirm against classify
+ readonly failRatio?: number // presumably the fail threshold on current/baseline — TODO confirm against classify
+ readonly warnAbs?: number // presumably the warn threshold on the absolute delta, in the observation's unit — TODO confirm
+ readonly failAbs?: number // presumably the fail threshold on the absolute delta, in the observation's unit — TODO confirm
+}
+
export type DevenvPerfProbe = CiMeasurementDescriptor & {
readonly command: readonly [string, ...string[]]
readonly traceOutput?: string
+ readonly warmupRepetitions?: number // runs executed before the measured ones; recorded with phase "warmup" and left out of the duration statistics
readonly repetitions?: number
+ readonly gate?: CiMeasurementGatePolicy // per-probe settings merged over the id-based default gate policy
}
export type CiMeasurementObservation = {
@@ -108,7 +121,9 @@ export type DevenvPerfTaskProbe =
readonly label?: string
readonly group?: string
readonly description?: string
+ readonly warmupRepetitions?: number
readonly repetitions?: number
+ readonly gate?: CiMeasurementGatePolicy
}
export type DevenvPerfJobOptions = {
@@ -128,11 +143,57 @@ export type DevenvPerfJobOptions = {
readonly permissions?: GitHubWorkflowArgs['jobs'][string]['permissions']
}
+const defaultDevenvPerfGatePolicy = (probeId: string): CiMeasurementGatePolicy => { // Built-in gate policy, selected by probe id.
+ if (probeId === 'shell_eval_traced') { // traced shell eval: thresholds are recorded but the gate is disabled
+ return {
+ enabled: false,
+ minBaselineSources: 10,
+ minCurrentSamples: 3,
+ warnRatio: 1.25,
+ failRatio: 1.5,
+ warnAbs: 1.5,
+ failAbs: 3,
+ noiseFloor: 0.5,
+ }
+ }
+ if (probeId === 'tasks_list' || probeId === 'processes_help') { // CLI list/help probes: looser ratio thresholds than the general default
+ return {
+ enabled: true,
+ minBaselineSources: 10,
+ minCurrentSamples: 5,
+ warnRatio: 2,
+ failRatio: 3,
+ warnAbs: 0.25,
+ failAbs: 1,
+ noiseFloor: 0.1,
+ }
+ }
+ return { // every other probe
+ enabled: true,
+ minBaselineSources: 10,
+ minCurrentSamples: 5,
+ warnRatio: 1.1,
+ failRatio: 1.2,
+ warnAbs: 0.25,
+ failAbs: 0.5,
+ noiseFloor: 0.1,
+ }
+}
+
+const devenvPerfGatePolicy = (probe: Pick<DevenvPerfProbe, 'id' | 'gate'>) => ({ // Effective gate policy for a probe: id-based defaults plus its own overrides.
+ ...defaultDevenvPerfGatePolicy(probe.id),
+ ...probe.gate, // explicit per-probe settings win over the defaults
+})
+
const devenvPerfProbeLine = (probe: DevenvPerfProbe) => {
const args = probe.command.map(shellSingleQuote).join(' ')
const trace = probe.traceOutput ?? ''
- const repetitions = Math.max(1, Math.floor(probe.repetitions ?? 1))
- return `measure ${shellSingleQuote(probe.id)} ${shellSingleQuote(probe.label)} ${shellSingleQuote(probe.group ?? '')} ${shellSingleQuote(probe.description ?? '')} ${shellSingleQuote(trace)} ${shellSingleQuote(String(repetitions))} ${args}`
+ const gatePolicy = devenvPerfGatePolicy(probe) // defaults for this probe id merged with probe.gate
+ const defaultRepetitions = gatePolicy.enabled ? gatePolicy.minCurrentSamples : 1 // gated probes default to the sample count the gate requires
+ const repetitions = Math.max(1, Math.floor(probe.repetitions ?? defaultRepetitions))
+ const defaultWarmupRepetitions = gatePolicy.enabled && repetitions > 1 ? 1 : 0 // one warmup run by default, only for gated probes that repeat
+ const warmupRepetitions = Math.max(0, Math.floor(probe.warmupRepetitions ?? defaultWarmupRepetitions))
+ return `measure ${shellSingleQuote(probe.id)} ${shellSingleQuote(probe.label)} ${shellSingleQuote(probe.group ?? '')} ${shellSingleQuote(probe.description ?? '')} ${shellSingleQuote(trace)} ${shellSingleQuote(String(warmupRepetitions))} ${shellSingleQuote(String(repetitions))} ${shellSingleQuote(JSON.stringify(gatePolicy))} ${args}` // argument order: id, label, group, description, trace, warmup count, repetitions, gate policy JSON, then the command
}
const defaultDevenvPerfTaskProbe = (probe: DevenvPerfTaskProbe): DevenvPerfProbe => {
@@ -141,13 +202,17 @@ const defaultDevenvPerfTaskProbe = (probe: DevenvPerfTaskProbe): DevenvPerfProbe
const label = typeof probe === 'string' ? undefined : probe.label
const group = typeof probe === 'string' ? undefined : probe.group
const description = typeof probe === 'string' ? undefined : probe.description
+ const warmupRepetitions = typeof probe === 'string' ? undefined : probe.warmupRepetitions
const repetitions = typeof probe === 'string' ? undefined : probe.repetitions
+ const gate = typeof probe === 'string' ? undefined : probe.gate
return {
id: id ?? `task_${task.replaceAll(':', '_')}`,
label: label ?? task,
group: group ?? 'devenv tasks',
description: description ?? `Runs the devenv task '${task}' in before mode without the TUI.`,
+ warmupRepetitions,
repetitions,
+ gate,
command: ['$DEVENV_BIN', 'tasks', 'run', task, '--mode', 'before', '--no-tui', '--show-output'],
}
}
@@ -177,7 +242,8 @@ const renderDevenvPerfScript = (
label: 'Warm shell eval',
group: 'devenv shell',
description: 'Evaluates a warm dev shell without reloading direnv state.',
- repetitions: 3,
+ warmupRepetitions: 1,
+ repetitions: 5,
command: ['$DEVENV_BIN', 'shell', '--no-reload', '--', 'true'],
},
{
@@ -185,7 +251,8 @@ const renderDevenvPerfScript = (
label: 'devenv tasks list',
group: 'devenv cli',
description: 'Lists devenv tasks to measure task graph loading overhead.',
- repetitions: 5,
+ warmupRepetitions: 1,
+ repetitions: 9,
command: ['$DEVENV_BIN', 'tasks', 'list'],
},
{
@@ -193,7 +260,8 @@ const renderDevenvPerfScript = (
label: 'devenv processes --help',
group: 'devenv cli',
description: 'Loads the devenv processes command help path.',
- repetitions: 5,
+ warmupRepetitions: 1,
+ repetitions: 9,
command: ['$DEVENV_BIN', 'processes', '--help'],
},
...opts.taskProbes.map(defaultDevenvPerfTaskProbe),
@@ -240,6 +308,7 @@ json_append_timing() {
local stdout="$7"
local stderr="$8"
local trace="$9"
+ local gate_policy="${dollar}{10}"
local samples_file="$ARTIFACT_DIR/$id.samples.json"
if [ "$first" -eq 0 ]; then
@@ -254,13 +323,15 @@ json_append_timing() {
--arg group "$group" \
--arg description "$description" \
--argjson status "$status" \
- --argjson durationMs "$duration_ms" \
- --arg stdout "$stdout" \
- --arg stderr "$stderr" \
- --arg trace "$trace" \
- '($samples[0] // []) as $sampleList
- | ($sampleList | map(select(.status == 0) | .durationMs)) as $successfulDurations
- | {
+ --argjson durationMs "$duration_ms" \
+ --arg stdout "$stdout" \
+ --arg stderr "$stderr" \
+ --arg trace "$trace" \
+ --argjson gatePolicy "$gate_policy" \
+ '($samples[0] // []) as $sampleList
+ | ($sampleList | map(select(.phase != "warmup" and .status == 0) | .durationMs)) as $successfulDurations
+ | ($sampleList | map(select(.phase == "warmup"))) as $warmupSamples
+ | {
id:$id,
name:$id,
label:$label,
@@ -270,9 +341,12 @@ json_append_timing() {
durationMs:$durationMs,
stdout:$stdout,
stderr:$stderr,
- trace:(if $trace == "" then null else $trace end),
- statistics: {
+ trace:(if $trace == "" then null else $trace end),
+ gatePolicy:$gatePolicy,
+ statistics: {
sampleCount: ($sampleList | length),
+ warmupCount: ($warmupSamples | length),
+ measuredSampleCount: (($sampleList | length) - ($warmupSamples | length)),
successfulSampleCount: ($successfulDurations | length),
minDurationMs: ($successfulDurations | min),
maxDurationMs: ($successfulDurations | max),
@@ -287,10 +361,12 @@ measure() {
local id="$1"
local label="$2"
local group="$3"
- local description="$4"
- local trace_file="$5"
- local repetitions="$6"
- shift 6
+ local description="$4"
+ local trace_file="$5"
+ local warmup_repetitions="$6"
+ local repetitions="$7"
+ local gate_policy="$8"
+ shift 8
case "$trace_file" in
'$ARTIFACT_DIR'*) trace_file="${dollar}{ARTIFACT_DIR}${dollar}{trace_file#'$ARTIFACT_DIR'}" ;;
esac
@@ -303,11 +379,22 @@ measure() {
if ! [[ "$repetitions" =~ ^[0-9]+$ ]] || [ "$repetitions" -lt 1 ]; then
repetitions=1
fi
+ if ! [[ "$warmup_repetitions" =~ ^[0-9]+$ ]] || [ "$warmup_repetitions" -lt 0 ]; then
+ warmup_repetitions=0
+ fi
printf '[' >"$samples_file"
local sample_first=1
- local sample_index sample_stdout sample_stderr sample_trace expanded
- for sample_index in $(seq 1 "$repetitions"); do
+ local sample_index measured_index total_repetitions phase sample_stdout sample_stderr sample_trace expanded
+ total_repetitions=$((warmup_repetitions + repetitions))
+ for sample_index in $(seq 1 "$total_repetitions"); do
+ if [ "$sample_index" -le "$warmup_repetitions" ]; then
+ phase="warmup"
+ measured_index=""
+ else
+ phase="measured"
+ measured_index=$((sample_index - warmup_repetitions))
+ fi
sample_stdout="$ARTIFACT_DIR/$id.$sample_index.stdout"
sample_stderr="$ARTIFACT_DIR/$id.$sample_index.stderr"
sample_trace=""
@@ -342,12 +429,14 @@ measure() {
sample_first=0
jq -cn \
--argjson index "$sample_index" \
+ --arg measuredIndex "$measured_index" \
+ --arg phase "$phase" \
--argjson status "$status" \
--argjson durationMs "$duration_ms" \
--arg stdout "$sample_stdout" \
--arg stderr "$sample_stderr" \
--arg trace "$sample_trace" \
- '{index:$index,status:$status,durationMs:$durationMs,stdout:$stdout,stderr:$stderr,trace:(if $trace == "" then null else $trace end)}' \
+ '{index:$index,measuredIndex:(if $measuredIndex == "" then null else ($measuredIndex | tonumber) end),phase:$phase,status:$status,durationMs:$durationMs,stdout:$stdout,stderr:$stderr,trace:(if $trace == "" then null else $trace end)}' \
>>"$samples_file"
stdout="$sample_stdout"
@@ -361,12 +450,12 @@ measure() {
printf ']\n' >>"$samples_file"
status="$(jq -r 'map(.status) | max // 0' "$samples_file")"
- duration_ms="$(jq -r 'map(select(.status == 0) | .durationMs) as $values | if ($values | length) == 0 then (map(.durationMs) | max // 0) else ($values | sort | .[(length - 1) / 2 | floor]) end' "$samples_file")"
+ duration_ms="$(jq -r 'map(select(.phase != "warmup" and .status == 0) | .durationMs) as $values | if ($values | length) == 0 then (map(.durationMs) | max // 0) else ($values | sort | .[(length - 1) / 2 | floor]) end' "$samples_file")"
cp "$stdout" "$ARTIFACT_DIR/$id.stdout" 2>/dev/null || true
cp "$stderr" "$ARTIFACT_DIR/$id.stderr" 2>/dev/null || true
- json_append_timing "$id" "$label" "$group" "$description" "$status" "$duration_ms" "$ARTIFACT_DIR/$id.stdout" "$ARTIFACT_DIR/$id.stderr" "$trace_file"
+ json_append_timing "$id" "$label" "$group" "$description" "$status" "$duration_ms" "$ARTIFACT_DIR/$id.stdout" "$ARTIFACT_DIR/$id.stderr" "$trace_file" "$gate_policy"
if [ "$status" -ne 0 ]; then
echo "::error::$id failed after ${dollar}{duration_ms}ms; stderr tail follows"
@@ -429,7 +518,11 @@ jq -n \
'{
schemaVersion: $schemaVersion,
generatedAt: $generatedAt,
- producer: { name: "effect-utils-ci-measurement", version: 1 },
+ producer: {
+ name: "effect-utils-ci-measurement",
+ version: 2,
+ measurementProtocol: "devenv-perf-warm-median-v2"
+ },
subject: {
repo: $repository,
branchKind: (if $branchKind == "" then "unknown" else $branchKind end),
@@ -457,9 +550,12 @@ jq -n \
group: .group,
name: ("devenv." + .id + ".duration"),
unit: "seconds",
- value: (.durationMs / 1000),
- statistics: {
+ value: (.durationMs / 1000),
+ policy: .gatePolicy,
+ statistics: {
sampleCount: (.statistics.sampleCount // 1),
+ warmupCount: (.statistics.warmupCount // 0),
+ measuredSampleCount: (.statistics.measuredSampleCount // (.statistics.sampleCount // 1)),
successfulSampleCount: (.statistics.successfulSampleCount // (if .status == 0 then 1 else 0 end)),
min: ((.statistics.minDurationMs // .durationMs) / 1000),
max: ((.statistics.maxDurationMs // .durationMs) / 1000),
@@ -470,6 +566,11 @@ jq -n \
probeLabel: .label,
status: .status,
sampleCount: (.statistics.sampleCount // 1),
+ warmupCount: (.statistics.warmupCount // 0),
+ measuredSampleCount: (.statistics.measuredSampleCount // (.statistics.sampleCount // 1)),
+ measurementProtocol: "devenv-perf-warm-median-v2",
+ aggregation: "median",
+ phase: "warm",
devenvRev: $devenvRev,
otelServiceName: $otelServiceName
}
@@ -490,109 +591,6 @@ jq -n \
}
}' >"$ARTIFACT_DIR/measurements.json"
-compare_baseline() {
- local baseline_path="${dollar}{DEVENV_PERF_BASELINE_SUMMARY:-$ARTIFACT_DIR/baseline/summary.json}"
- local mode="${dollar}{DEVENV_PERF_REGRESSION_MODE:-warn}"
-
- if [ "$mode" = "off" ]; then
- jq -n --argjson schemaVersion 1 --arg status skipped --arg mode "$mode" '{schemaVersion:$schemaVersion, status:$status, mode:$mode, checks:{}}' >"$ARTIFACT_DIR/perf-comparison.json"
- return 0
- fi
-
- if [ ! -f "$baseline_path" ]; then
- jq -n \
- --argjson schemaVersion 1 \
- --arg status baseline_missing \
- --arg mode "$mode" \
- --arg baseline "$baseline_path" \
- '{schemaVersion:$schemaVersion, status:$status, mode:$mode, baseline:$baseline, checks:{}}' \
- >"$ARTIFACT_DIR/perf-comparison.json"
- echo "::notice::devenv perf baseline not found at $baseline_path; recorded current measurements only"
- return 0
- fi
-
- jq -n \
- --slurpfile current "$ARTIFACT_DIR/summary.json" \
- --slurpfile baseline "$baseline_path" \
- --argjson schemaVersion 1 \
- --arg mode "$mode" \
- --arg baselinePath "$baseline_path" \
- '
- def budget($name):
- if $name == "shell_eval_traced" then
- {warnRatio:1.25, failRatio:1.5, warnMs:1500, failMs:3000}
- elif $name == "shell_eval_warm" then
- {warnRatio:1.5, failRatio:2.0, warnMs:500, failMs:1000}
- elif $name == "tasks_list" or $name == "processes_help" then
- {warnRatio:2.0, failRatio:3.0, warnMs:250, failMs:1000}
- else
- {warnRatio:1.5, failRatio:2.0, warnMs:1000, failMs:3000}
- end;
- def classify($name; $current; $baseline):
- budget($name) as $b
- | ($current - $baseline) as $delta
- | (if $baseline > 0 then ($current / $baseline) else null end) as $ratio
- | (
- if $baseline <= 0 then "unknown"
- elif ($delta > $b.failMs and $current > ($baseline * $b.failRatio)) then "fail"
- elif ($delta > $b.warnMs and $current > ($baseline * $b.warnRatio)) then "warn"
- else "pass"
- end
- ) as $status
- | {status:$status, currentMs:$current, baselineMs:$baseline, deltaMs:$delta, ratio:$ratio, budget:$b};
- ($current[0].checks // {}) as $currentChecks
- | ($baseline[0].checks // {}) as $baselineChecks
- | (
- $currentChecks
- | to_entries
- | map(
- .key as $name
- | .value as $current
- | ($baselineChecks[$name] // null) as $base
- | {
- key: $name,
- value: (
- if $base == null then
- {status:"missing_baseline", currentMs:$current.durationMs}
- elif ($current.status != 0) then
- {status:"current_failed", currentMs:$current.durationMs, baselineMs:$base.durationMs}
- elif ($base.status != 0) then
- {status:"baseline_failed", currentMs:$current.durationMs, baselineMs:$base.durationMs}
- else
- classify($name; $current.durationMs; $base.durationMs)
- end
- )
- }
- )
- | from_entries
- ) as $checks
- | (
- if any($checks[]; .status == "fail") then "fail"
- elif any($checks[]; .status == "warn") then "warn"
- elif any($checks[]; .status == "missing_baseline") then "partial"
- else "pass"
- end
- ) as $status
- | {schemaVersion:$schemaVersion, status:$status, mode:$mode, baseline:$baselinePath, checks:$checks}
- ' >"$ARTIFACT_DIR/perf-comparison.json"
-
- local status
- status="$(jq -r '.status' "$ARTIFACT_DIR/perf-comparison.json")"
- case "$status:$mode" in
- fail:fail)
- echo "::error::devenv perf regression detected"
- jq . "$ARTIFACT_DIR/perf-comparison.json"
- return 1
- ;;
- fail:*|warn:*)
- echo "::warning::devenv perf regression threshold exceeded"
- jq . "$ARTIFACT_DIR/perf-comparison.json"
- ;;
- esac
-}
-
-compare_baseline
-
if [ -n "${dollar}{GITHUB_STEP_SUMMARY:-}" ]; then
{
echo "### Devenv perf"
@@ -603,12 +601,6 @@ if [ -n "${dollar}{GITHUB_STEP_SUMMARY:-}" ]; then
echo ""
echo "- Artifact directory: \`$ARTIFACT_DIR\`"
echo "- OTEL service: \`${dollar}{OTEL_SERVICE_NAME:-unknown}\`"
- echo ""
- echo "#### Regression comparison"
- echo ""
- if [ -f "$ARTIFACT_DIR/perf-comparison.json" ]; then
- jq -r '["- Status: " + .status, "- Mode: " + .mode, "- Baseline: " + (.baseline // "none")] | .[]' "$ARTIFACT_DIR/perf-comparison.json"
- fi
} >>"$GITHUB_STEP_SUMMARY"
fi
@@ -976,7 +968,10 @@ fi
current_index="$(mktemp)"
baseline_index="$(mktemp)"
find "$current_dir" -path "$baseline_dir" -prune -o -name measurements.json -type f -print | sort >"$current_index" || true
-find "$baseline_dir" -name measurements.json -type f -print | sort >"$baseline_index" || true
+{
+ find "$baseline_dir" -maxdepth 1 -name measurements.json -type f -print
+ find "$baseline_dir" -mindepth 2 -maxdepth 2 -name measurements.json -type f -print
+} | sort -u >"$baseline_index" || true
if [ ! -s "$current_index" ]; then
echo "::error::no current measurements.json files found under $current_dir"
@@ -1003,7 +998,7 @@ jq -n \
def identity_dimensions:
(.dimensions // {})
| to_entries
- | map(select(.key as $key | ["devenvRev", "otelServiceName", "status", "probeLabel", "sampleCount"] | index($key) | not))
+ | map(select(.key as $key | ["devenvRev", "otelServiceName", "status", "probeLabel", "sampleCount", "measuredSampleCount"] | index($key) | not))
| sort_by(.key)
| map("\(.key)=\(.value|tostring)")
| join(",");
@@ -1026,6 +1021,13 @@ jq -n \
else (($sorted[($count / 2 - 1)] + $sorted[($count / 2)]) / 2)
end;
+ def percentile($p):
+ sort as $sorted
+ | ($sorted | length) as $count
+ | if $count == 0 then null
+ else $sorted[(($p * ($count - 1)) | floor)]
+ end;
+
def observations_by_key($docs):
reduce $docs[]? as $doc
({};
@@ -1042,13 +1044,14 @@ jq -n \
def observation_stats($items):
($items | map(.observation.value)) as $values
- | ($items | map(.observation.statistics.sampleCount // 1) | add // ($items | length)) as $sampleCount
+ | ($items | map(.observation.statistics.measuredSampleCount // .observation.statistics.sampleCount // 1) | add // ($items | length)) as $sampleCount
| {
target: ($items[0].target // {}),
observation: ($items[-1].observation // {}),
value: ($values | median),
min: ($values | min),
max: ($values | max),
+ p95: ($values | percentile(0.95)),
sourceCount: ($items | length),
sampleCount: $sampleCount,
generatedAt: ($items[-1].generatedAt // null)
@@ -1062,7 +1065,7 @@ jq -n \
elif $metric == "nix.closure.path_count" then
{warnRatio:1.10, failRatio:1.25, warnAbs:100, failAbs:500}
elif $unit == "seconds" then
- {warnRatio:1.25, failRatio:1.50, warnAbs:1.5, failAbs:3.0}
+ {warnRatio:1.10, failRatio:1.20, warnAbs:0.25, failAbs:0.5}
else
{warnRatio:1.25, failRatio:1.50, warnAbs:1, failAbs:3}
end;
@@ -1073,11 +1076,24 @@ jq -n \
elif $unit == "seconds" then 0.1
else 0
end;
- def abs_value: if . < 0 then -. else . end;
-
- def classify($metric; $unit; $current; $baseline; $baselineMin; $baselineMax; $currentSamples; $baselineSources):
+ def default_policy($metric; $unit):
budget($metric; $unit) as $b
| noise_floor($metric; $unit) as $noise
+ | $b + {
+ enabled:true,
+ minBaselineSources:(if $metric == "nix.closure.nar_size" or $metric == "nix.closure.bucket.nar_size" or $metric == "nix.closure.path_count" then 3 else 10 end),
+ minCurrentSamples:(if $unit == "seconds" then 3 else 1 end),
+ noiseFloor:$noise
+ };
+ def observation_policy($obs):
+ default_policy($obs.name // "unknown"; $obs.unit // "unknown") + ($obs.policy // {});
+ def policy_enabled($policy):
+ if ($policy | has("enabled")) then $policy.enabled else true end;
+ def abs_value: if . < 0 then -. else . end;
+
+ def classify($metric; $unit; $policy; $current; $baseline; $baselineMin; $baselineMax; $baselineP95; $currentSamples; $baselineSources):
+ $policy as $b
+ | ($policy.noiseFloor // noise_floor($metric; $unit)) as $noise
| ($current - $baseline) as $delta
| (if $baseline > 0 then ($current / $baseline) else null end) as $ratio
| (
@@ -1093,17 +1109,34 @@ jq -n \
else "pass"
end
) as $thresholdStatus
+ | (
+ policy_enabled($policy) == true
+ and $baseline > 0
+ and $baselineSources >= ($policy.minBaselineSources // 1)
+ and $currentSamples >= ($policy.minCurrentSamples // 1)
+ ) as $gateable
+ | (
+ if (policy_enabled($policy) != true) then "disabled"
+ elif $baseline <= 0 then "missing_baseline"
+ elif $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
+ elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count"
+ else "eligible"
+ end
+ ) as $gateReason
| (
if $baseline <= 0 then "unknown"
+ elif (policy_enabled($policy) != true) then "diagnostic"
elif ($delta | abs_value) <= $noise then "noise_floor"
elif ($withinBaselineRange and $thresholdStatus == "pass") then "within_baseline_range"
- elif ($baselineSources < 3 or $currentSamples < 3) then "low_sample_count"
+ elif $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
+ elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count"
elif $thresholdStatus == "pass" then "within_budget"
+ elif ($baselineP95 != null and $current <= $baselineP95) then "within_baseline_distribution"
else "threshold_exceeded"
end
) as $confidence
| (
- if $confidence == "threshold_exceeded" then $thresholdStatus
+ if ($gateable and $confidence == "threshold_exceeded") then $thresholdStatus
elif $thresholdStatus == "unknown" then "unknown"
else "pass"
end
@@ -1116,7 +1149,7 @@ jq -n \
else "regressed"
end
) as $direction
- | {status:$status,current:$current,baseline:$baseline,delta:$delta,ratio:$ratio,budget:$b,confidence:$confidence,direction:$direction};
+ | {status:$status,current:$current,baseline:$baseline,delta:$delta,ratio:$ratio,budget:$b,gatePolicy:$policy,gateable:$gateable,gateReason:$gateReason,confidence:$confidence,direction:$direction};
(observations_by_key($current[0]) | with_entries(.value = observation_stats(.value))) as $currentObs
| (observations_by_key($baseline[0]) | with_entries(.value = observation_stats(.value))) as $baselineObs
@@ -1135,30 +1168,36 @@ jq -n \
status: "missing_baseline",
target: $currentValue.target,
observation: $currentValue.observation,
- current: $currentValue.value,
- currentSamples: $currentValue.sampleCount,
- baselineSources: 0,
- confidence: "missing_baseline",
- direction: "unknown"
- }
- else
- classify(
- $currentValue.observation.name;
- $currentValue.observation.unit;
- $currentValue.value;
- $baselineValue.value;
- $baselineValue.min;
- $baselineValue.max;
- $currentValue.sampleCount;
- $baselineValue.sourceCount
- ) + {
+ current: $currentValue.value,
+ currentSamples: $currentValue.sampleCount,
+ baselineSources: 0,
+ gatePolicy: ($currentValue.observation | observation_policy(.)),
+ gateable: false,
+ gateReason: "missing_baseline",
+ confidence: "missing_baseline",
+ direction: "unknown"
+ }
+ else
+ classify(
+ $currentValue.observation.name;
+ $currentValue.observation.unit;
+ ($currentValue.observation | observation_policy(.));
+ $currentValue.value;
+ $baselineValue.value;
+ $baselineValue.min;
+ $baselineValue.max;
+ $baselineValue.p95;
+ $currentValue.sampleCount;
+ $baselineValue.sourceCount
+ ) + {
target: $currentValue.target,
observation: $currentValue.observation,
- currentSamples: $currentValue.sampleCount,
- baselineSources: $baselineValue.sourceCount,
- baselineMin: $baselineValue.min,
- baselineMax: $baselineValue.max
- }
+ currentSamples: $currentValue.sampleCount,
+ baselineSources: $baselineValue.sourceCount,
+ baselineMin: $baselineValue.min,
+ baselineMax: $baselineValue.max,
+ baselineP95: $baselineValue.p95
+ }
end
)
}
@@ -1168,7 +1207,7 @@ jq -n \
| (
if any($comparisons[]?; .status == "fail") then "fail"
elif any($comparisons[]?; .status == "warn") then "warn"
- elif any($comparisons[]?; .status == "missing_baseline") then "partial"
+ elif any($comparisons[]?; .status == "missing_baseline" and (if (.gatePolicy | has("enabled")) then .gatePolicy.enabled else true end)) then "partial"
else "pass"
end
) as $status
@@ -1212,8 +1251,8 @@ if [ -n "${dollar}{GITHUB_STEP_SUMMARY:-}" ]; then
echo ""
jq -r '"- Status: " + .status + "\n- Mode: " + .mode + "\n- Baseline: " + .baselineDir' "$comparison_file"
echo ""
- echo "| Status | Target | Observation | Current | Baseline | Delta | Ratio |"
- echo "| --- | --- | --- | ---: | ---: | ---: | ---: |"
+ echo "| Status | Gate | Target | Observation | Current | Baseline | Delta | Ratio |"
+ echo "| --- | --- | --- | --- | ---: | ---: | ---: | ---: |"
jq -r '
.comparisons
| to_entries
@@ -1227,9 +1266,10 @@ if [ -n "${dollar}{GITHUB_STEP_SUMMARY:-}" ]; then
| .[:20]
| .[]
| .value as $v
- | [
- $v.status,
- (($v.target.kind // "unknown") + "/" + ($v.target.name // "unknown") + "/" + ($v.target.system // "unknown")),
+ | [
+ $v.status,
+ (if ($v.gateable // false) then "yes" else ($v.gateReason // "no") end),
+ (($v.target.kind // "unknown") + "/" + ($v.target.name // "unknown") + "/" + ($v.target.system // "unknown")),
($v.observation.name // "unknown"),
(($v.current // $v.observation.value // 0) | tostring),
(($v.baseline // "") | tostring),
@@ -1373,16 +1413,25 @@ const formatRatio = (value) => {
}
const formatResult = (row) => {
- if (row.confidence === 'low_sample_count') return 'gray needs repeat'
+ if (row.confidence === 'low_baseline_count') return 'gray needs baseline'
+ if (row.confidence === 'low_current_sample_count') return 'gray needs repeat'
+ if (row.confidence === 'diagnostic') return 'gray diagnostic'
if (row.status === 'fail') return 'red regression'
if (row.status === 'warn') return 'yellow regression'
if (row.status === 'missing_baseline') return 'gray no baseline'
if (row.confidence === 'noise_floor') return 'gray noise floor'
if (row.confidence === 'within_baseline_range') return 'gray within range'
+ if (row.confidence === 'within_baseline_distribution') return 'gray within p95'
if (row.direction === 'improved') return 'green improved'
return 'gray unchanged'
}
+const formatGate = (row) => {
+ if (row.gateable) return 'yes'
+ const reason = row.gateReason || row.confidence || 'unknown'
+ return 'no<br><sub>' + reason + '</sub>'
+}
+
const escapeCell = (value) => String(value ?? '-').replaceAll('|', '\\|').replaceAll('\n', '<br>')
const escapeXml = (value) => String(value)
.replaceAll('&', '&')
@@ -1447,6 +1496,14 @@ const allRows = Object.values(comparison.comparisons || {}).sort((left, right) =
if (byRank !== 0) return byRank
return (right.delta || 0) - (left.delta || 0)
})
+const protocolLabel = (() => {
+ const protocols = new Set(
+ allRows
+ .map((row) => row.observation?.dimensions?.measurementProtocol)
+ .filter((value) => typeof value === 'string' && value.length > 0),
+ )
+ return protocols.size > 0 ? Array.from(protocols).join(', ') : 'legacy'
+})()
const visibleLimit = Number.isFinite(maxRows) && maxRows > 0 ? maxRows : 10
const comparableRows = allRows.filter((row) => typeof row.baseline === 'number')
const hasComparableBaseline = comparableRows.length > 0
@@ -1458,8 +1515,8 @@ const visibleRows = (hasComparableBaseline
const comparisonTable = (rows) => {
if (rows.length === 0) return 'No measurement regressions detected.'
return [
- '| Probe | Baseline | Current | Change | Result | Confidence |',
- '| --- | ---: | ---: | ---: | --- | --- |',
+ '| Probe | Baseline | Current | Change | Result | Gate | Confidence |',
+ '| --- | ---: | ---: | ---: | --- | --- | --- |',
...rows.map((row) => {
const unit = row.observation?.unit
const baselineRange = typeof row.baselineMin === 'number' && typeof row.baselineMax === 'number' && row.baselineMin !== row.baselineMax
@@ -1471,6 +1528,7 @@ const comparisonTable = (rows) => {
formatValue(row.current, unit),
formatDelta(row.delta, unit) + ' / ' + formatRatio(row.ratio),
formatResult(row),
+ formatGate(row),
(row.confidence || 'unknown') + '
baseline n=' + (row.baselineSources ?? 0) + ', current samples=' + (row.currentSamples ?? 1) + '',
].map(escapeCell).join(' | ') + ' |'
}),
@@ -1491,12 +1549,13 @@ const currentOnlyTable = (rows) => {
const allMeasurementsTable = (rows) => {
if (rows.length === 0) return 'No measurement regressions detected.'
return [
- '| Status | Target | Observation | Dimensions | Baseline | Current | Delta | Ratio |',
- '| --- | --- | --- | --- | ---: | ---: | ---: | ---: |',
+ '| Status | Gate | Target | Observation | Dimensions | Baseline | Current | Delta | Ratio |',
+ '| --- | --- | --- | --- | --- | ---: | ---: | ---: | ---: |',
...rows.map((row) => {
const unit = row.observation?.unit
return '| ' + [
row.status,
+ row.gateable ? 'yes' : (row.gateReason || 'no'),
row.target?.label || row.target?.name || 'unknown',
row.observation?.label || row.observation?.name || 'unknown',
dimensions(row),
@@ -1633,6 +1692,7 @@ const summaryLines = [
'- Commit: ' + shortSha,
'- Run: ' + runLink,
'- Baseline: ' + baselineLabel,
+ '- Protocol: ' + protocolLabel,
'',
hasComparableBaseline
? 'Chart: performance change versus baseline median. Green is faster, red is slower, gray is within noise or baseline range.'
@@ -1723,7 +1783,6 @@ export const devenvPerfJob = (opts?: DevenvPerfJobOptions) => {
...standardCIEnv,
ARTIFACT_DIR: artifactDir,
OTEL_SERVICE_NAME: 'devenv-perf-ci',
- DEVENV_PERF_REGRESSION_MODE: opts?.regressionMode ?? 'warn',
RUNNER_CLASS: (opts?.runsOn ?? linuxX64Runner).join(','),
...opts?.env,
},
diff --git a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
index 3f1d2f9c6..601c9b908 100644
--- a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
+++ b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
@@ -388,7 +388,7 @@ describe('ci workflow devenv perf helpers', () => {
expect(generatedCiWorkflowYamlSource).toContain('devenv-perf:')
expect(generatedCiWorkflowYamlSource).toContain('OTEL_SERVICE_NAME: devenv-perf-ci')
expect(generatedCiWorkflowYamlSource).toContain(
- "measure 'shell_eval_traced' 'Shell eval with OTEL trace' 'devenv shell' 'Evaluates the dev shell with native devenv JSON tracing enabled.' '$ARTIFACT_DIR/traces/shell_eval_traced.json' '1'",
+ "measure 'shell_eval_traced' 'Shell eval with OTEL trace' 'devenv shell' 'Evaluates the dev shell with native devenv JSON tracing enabled.' '$ARTIFACT_DIR/traces/shell_eval_traced.json' '0' '1'",
)
expect(generatedCiWorkflowYamlSource).toContain('--trace-to')
expect(generatedCiWorkflowYamlSource).toContain('json:file:$trace_file')
@@ -396,7 +396,7 @@ describe('ci workflow devenv perf helpers', () => {
expect(generatedCiWorkflowYamlSource).toContain("measure 'shell_eval_warm' 'Warm shell eval'")
expect(generatedCiWorkflowYamlSource).toContain("measure 'tasks_list' 'devenv tasks list'")
expect(generatedCiWorkflowYamlSource).toContain(
- "'Loads the devenv processes command help path.' '' '5'",
+ "'Loads the devenv processes command help path.' '' '1' '9'",
)
})
@@ -413,16 +413,23 @@ describe('ci workflow devenv perf helpers', () => {
expect(generatedCiWorkflowYamlSource).toContain('probeLabel: .label')
expect(generatedCiWorkflowYamlSource).toContain('sampleCount: (.statistics.sampleCount // 1)')
expect(generatedCiWorkflowYamlSource).toContain('baselineSources')
- expect(generatedCiWorkflowYamlSource).toContain('low_sample_count')
+ expect(generatedCiWorkflowYamlSource).toContain('low_baseline_count')
+ expect(generatedCiWorkflowYamlSource).toContain('low_current_sample_count')
expect(generatedCiWorkflowYamlSource).toContain('within_baseline_range')
expect(generatedCiWorkflowYamlSource).toContain(
- 'elif ($baselineSources < 3 or $currentSamples < 3) then "low_sample_count"',
+ 'elif $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"',
)
expect(generatedCiWorkflowYamlSource).toContain(
- 'if $confidence == "threshold_exceeded" then $thresholdStatus',
+ 'elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count"',
+ )
+ expect(generatedCiWorkflowYamlSource).toContain(
+ 'if ($gateable and $confidence == "threshold_exceeded") then $thresholdStatus',
+ )
+ expect(ciWorkflowSource).toContain(
+ "if (row.confidence === 'low_baseline_count') return 'gray needs baseline'",
)
expect(ciWorkflowSource).toContain(
- "if (row.confidence === 'low_sample_count') return 'gray needs repeat'",
+ "if (row.confidence === 'low_current_sample_count') return 'gray needs repeat'",
)
expect(generatedCiWorkflowYamlSource).toContain('RUNNER_CLASS:')
expect(generatedCiWorkflowYamlSource).toContain('namespace-profile-linux-x86-64')
@@ -439,8 +446,9 @@ describe('ci workflow devenv perf helpers', () => {
'topPaths: ($closurePaths | sort_by(.narSize) | reverse | .[:30])',
)
expect(generatedCiWorkflowYamlSource).not.toContain('dev3')
- expect(generatedCiWorkflowYamlSource).toContain('perf-comparison.json')
- expect(generatedCiWorkflowYamlSource).toContain('DEVENV_PERF_REGRESSION_MODE')
+ expect(generatedCiWorkflowYamlSource).not.toContain('perf-comparison.json')
+ expect(generatedCiWorkflowYamlSource).not.toContain('DEVENV_PERF_REGRESSION_MODE')
+ expect(generatedCiWorkflowYamlSource).toContain('devenv-perf-warm-median-v2')
expect(generatedCiWorkflowYamlSource).toContain("CI_MEASUREMENT_PR_COMMENT_ENABLED: 'true'")
expect(generatedCiWorkflowYamlSource).toContain(
'CI_MEASUREMENT_PR_COMMENT_TITLE: Devenv Performance',
@@ -453,7 +461,7 @@ describe('ci workflow devenv perf helpers', () => {
expect(ciWorkflowSource).toContain('baselineSeedRunIds?: readonly string[]')
expect(ciWorkflowSource).toContain('baselineProvenance: ($baselineProvenance[0] // null)')
expect(ciWorkflowSource).toContain(
- '["devenvRev", "otelServiceName", "status", "probeLabel", "sampleCount"] | index($key) | not',
+ '["devenvRev", "otelServiceName", "status", "probeLabel", "sampleCount", "measuredSampleCount"] | index($key) | not',
)
expect(ciWorkflowSource).toContain('chart_file="$comment_tmp_dir/perf-change-vs-baseline.svg"')
expect(ciWorkflowSource).toContain(
From 2784c669089cd788020a72764831cf8cdfbb2e96 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Thu, 14 May 2026 15:16:40 +0200
Subject: [PATCH 02/81] Clarify CI measurement gate labels
---
.github/workflows/ci.yml | 12 +++++++++---
genie/ci-workflow/measurements.ts | 12 +++++++++---
2 files changed, 18 insertions(+), 6 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index d3ace3a1d..cb40599b0 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -3557,12 +3557,18 @@ jobs:
const previousRuns = (existingState?.runs || []).filter((run) => run.commitSha !== currentRun.commitSha)
const historyLimit = Number.isFinite(maxHistory) && maxHistory > 0 ? maxHistory : 20
const state = { _tag: stateTag, schemaVersion, title, runs: [currentRun, ...previousRuns].slice(0, historyLimit) }
+ const gateModeLabel = (mode) => {
+ if (mode === 'fail') return 'enforced'
+ if (mode === 'warn') return 'advisory'
+ if (mode === 'off') return 'off'
+ return mode || 'unknown'
+ }
const historyRows = state.runs.slice(1).map((run) => {
const link = run.runUrl ? '[' + run.shortSha + '](' + run.runUrl + ')' : run.shortSha
const top = Array.isArray(run.visibleRows) && run.visibleRows.length > 0
? run.visibleRows.slice(0, 3).map((row) => row.status + ' ' + row.target + ' ' + row.observation + ' ' + row.delta + ' / ' + row.ratio).join('
')
: 'No regressions'
- return '| ' + [link, run.status, run.mode, top].map(escapeCell).join(' | ') + ' |'
+ return '| ' + [link, run.status, gateModeLabel(run.mode), top].map(escapeCell).join(' | ') + ' |'
})
const runLink = runUrl ? '[workflow run](' + runUrl + ')' : 'workflow run unavailable'
@@ -3579,7 +3585,7 @@ jobs:
'## ' + title,
'',
'- Status: ' + statusWord,
- '- Mode: ' + (comparison.mode || 'unknown'),
+ '- Gate: ' + gateModeLabel(comparison.mode),
'- Commit: ' + shortSha,
'- Run: ' + runLink,
'- Baseline: ' + baselineLabel,
@@ -3607,7 +3613,7 @@ jobs:
'',
'Previous runs
',
'',
- '| Commit | Status | Mode | Top changes |',
+ '| Commit | Status | Gate | Top changes |',
'| --- | --- | --- | --- |',
...historyRows,
'',
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index 300b621cf..33f049e99 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -1666,12 +1666,18 @@ const currentRun = {
const previousRuns = (existingState?.runs || []).filter((run) => run.commitSha !== currentRun.commitSha)
const historyLimit = Number.isFinite(maxHistory) && maxHistory > 0 ? maxHistory : 20
const state = { _tag: stateTag, schemaVersion, title, runs: [currentRun, ...previousRuns].slice(0, historyLimit) }
+const gateModeLabel = (mode) => {
+ if (mode === 'fail') return 'enforced'
+ if (mode === 'warn') return 'advisory'
+ if (mode === 'off') return 'off'
+ return mode || 'unknown'
+}
const historyRows = state.runs.slice(1).map((run) => {
const link = run.runUrl ? '[' + run.shortSha + '](' + run.runUrl + ')' : run.shortSha
const top = Array.isArray(run.visibleRows) && run.visibleRows.length > 0
? run.visibleRows.slice(0, 3).map((row) => row.status + ' ' + row.target + ' ' + row.observation + ' ' + row.delta + ' / ' + row.ratio).join('
')
: 'No regressions'
- return '| ' + [link, run.status, run.mode, top].map(escapeCell).join(' | ') + ' |'
+ return '| ' + [link, run.status, gateModeLabel(run.mode), top].map(escapeCell).join(' | ') + ' |'
})
const runLink = runUrl ? '[workflow run](' + runUrl + ')' : 'workflow run unavailable'
@@ -1688,7 +1694,7 @@ const summaryLines = [
'## ' + title,
'',
'- Status: ' + statusWord,
- '- Mode: ' + (comparison.mode || 'unknown'),
+ '- Gate: ' + gateModeLabel(comparison.mode),
'- Commit: ' + shortSha,
'- Run: ' + runLink,
'- Baseline: ' + baselineLabel,
@@ -1716,7 +1722,7 @@ if (historyRows.length > 0) {
'',
'Previous runs
',
'',
- '| Commit | Status | Mode | Top changes |',
+ '| Commit | Status | Gate | Top changes |',
'| --- | --- | --- | --- |',
...historyRows,
'',
From 64ff2880cfca78d5f9358461e95adf1d4be6e8bf Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Thu, 14 May 2026 15:19:47 +0200
Subject: [PATCH 03/81] Clarify measurement gate summary labels
---
.github/workflows/ci.yml | 2 +-
genie/ci-workflow/measurements.ts | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index cb40599b0..63811aca2 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -3140,7 +3140,7 @@ jobs:
{
echo "### ${CI_MEASUREMENT_PR_COMMENT_TITLE:-CI Measurements}"
echo ""
- jq -r '"- Status: " + .status + "\n- Mode: " + .mode + "\n- Baseline: " + .baselineDir' "$comparison_file"
+ jq -r '"- Status: " + .status + "\n- Gate: " + (if .mode == "fail" then "enforced" elif .mode == "warn" then "advisory" elif .mode == "off" then "off" else (.mode // "unknown") end) + "\n- Baseline: " + .baselineDir' "$comparison_file"
echo ""
echo "| Status | Gate | Target | Observation | Current | Baseline | Delta | Ratio |"
echo "| --- | --- | --- | --- | ---: | ---: | ---: | ---: |"
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index 33f049e99..bb9f3dc23 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -1249,7 +1249,7 @@ if [ -n "${dollar}{GITHUB_STEP_SUMMARY:-}" ]; then
{
echo "### ${dollar}{CI_MEASUREMENT_PR_COMMENT_TITLE:-CI Measurements}"
echo ""
- jq -r '"- Status: " + .status + "\n- Mode: " + .mode + "\n- Baseline: " + .baselineDir' "$comparison_file"
+ jq -r '"- Status: " + .status + "\n- Gate: " + (if .mode == "fail" then "enforced" elif .mode == "warn" then "advisory" elif .mode == "off" then "off" else (.mode // "unknown") end) + "\n- Baseline: " + .baselineDir' "$comparison_file"
echo ""
echo "| Status | Gate | Target | Observation | Current | Baseline | Delta | Ratio |"
echo "| --- | --- | --- | --- | ---: | ---: | ---: | ---: |"
From d455009d2b0a281bbfbae9c31fb9f81604af7800 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Thu, 14 May 2026 15:55:04 +0200
Subject: [PATCH 04/81] Make measurement comments hermetic
---
.github/workflows/ci.yml | 5 +++++
genie/ci-workflow/measurements.ts | 5 +++++
2 files changed, 10 insertions(+)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 63811aca2..6630ba5bc 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -3174,6 +3174,11 @@ jobs:
if [ "${CI_MEASUREMENT_PR_COMMENT_ENABLED:-false}" = "true" ] && [ "${GITHUB_EVENT_NAME:-}" = "pull_request" ]; then
can_render_pr_comment=true
+ if ! command -v gh >/dev/null 2>&1 && command -v nix >/dev/null 2>&1; then
+ if gh_out="$(nix build --no-link --print-out-paths nixpkgs#gh 2>/dev/null)"; then
+ export PATH="$gh_out/bin:$PATH"
+ fi
+ fi
if ! command -v gh >/dev/null 2>&1; then
echo "::notice::gh is not available; skipping CI measurement PR comment"
can_render_pr_comment=false
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index bb9f3dc23..f4d84e580 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -1283,6 +1283,11 @@ fi
if [ "${dollar}{CI_MEASUREMENT_PR_COMMENT_ENABLED:-false}" = "true" ] && [ "${dollar}{GITHUB_EVENT_NAME:-}" = "pull_request" ]; then
can_render_pr_comment=true
+ if ! command -v gh >/dev/null 2>&1 && command -v nix >/dev/null 2>&1; then
+ if gh_out="$(nix build --no-link --print-out-paths nixpkgs#gh 2>/dev/null)"; then
+ export PATH="$gh_out/bin:$PATH"
+ fi
+ fi
if ! command -v gh >/dev/null 2>&1; then
echo "::notice::gh is not available; skipping CI measurement PR comment"
can_render_pr_comment=false
From f1ad6652b1e3848193e8037a950031ca2a5ebd02 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Thu, 14 May 2026 16:20:38 +0200
Subject: [PATCH 05/81] Make measurement comment renderer hermetic
---
.github/workflows/ci.yml | 33 ++++++++++++++++++++++++-------
genie/ci-workflow/measurements.ts | 33 ++++++++++++++++++++++++-------
2 files changed, 52 insertions(+), 14 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 6630ba5bc..f43df2c31 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -3174,19 +3174,38 @@ jobs:
if [ "${CI_MEASUREMENT_PR_COMMENT_ENABLED:-false}" = "true" ] && [ "${GITHUB_EVENT_NAME:-}" = "pull_request" ]; then
can_render_pr_comment=true
- if ! command -v gh >/dev/null 2>&1 && command -v nix >/dev/null 2>&1; then
- if gh_out="$(nix build --no-link --print-out-paths nixpkgs#gh 2>/dev/null)"; then
- export PATH="$gh_out/bin:$PATH"
+
+ ensure_ci_measurement_tool() {
+ tool_name="$1"
+ nix_attr="$2"
+ if command -v "$tool_name" >/dev/null 2>&1; then
+ return 0
fi
- fi
- if ! command -v gh >/dev/null 2>&1; then
+ if ! command -v nix >/dev/null 2>&1; then
+ return 1
+ fi
+ if tool_out="$(nix build --no-link --print-out-paths "nixpkgs#$nix_attr" 2>/dev/null)"; then
+ export PATH="$tool_out/bin:$PATH"
+ fi
+ command -v "$tool_name" >/dev/null 2>&1
+ }
+
+ if ! ensure_ci_measurement_tool gh gh; then
echo "::notice::gh is not available; skipping CI measurement PR comment"
can_render_pr_comment=false
fi
- if ! command -v jq >/dev/null 2>&1; then
- echo "::notice::jq is not available; skipping CI measurement PR comment"
+ if ! ensure_ci_measurement_tool node nodejs; then
+ echo "::notice::node is not available; skipping CI measurement PR comment"
can_render_pr_comment=false
fi
+ if ! command -v jq >/dev/null 2>&1; then
+ if ensure_ci_measurement_tool jq jq; then
+ :
+ else
+ echo "::notice::jq is not available; skipping CI measurement PR comment"
+ can_render_pr_comment=false
+ fi
+ fi
if [ -z "${GH_TOKEN:-${GITHUB_TOKEN:-}}" ]; then
echo "::notice::GH_TOKEN/GITHUB_TOKEN is not set; skipping CI measurement PR comment"
can_render_pr_comment=false
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index f4d84e580..c0e3cd974 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -1283,19 +1283,38 @@ fi
if [ "${dollar}{CI_MEASUREMENT_PR_COMMENT_ENABLED:-false}" = "true" ] && [ "${dollar}{GITHUB_EVENT_NAME:-}" = "pull_request" ]; then
can_render_pr_comment=true
- if ! command -v gh >/dev/null 2>&1 && command -v nix >/dev/null 2>&1; then
- if gh_out="$(nix build --no-link --print-out-paths nixpkgs#gh 2>/dev/null)"; then
- export PATH="$gh_out/bin:$PATH"
+
+ ensure_ci_measurement_tool() {
+ tool_name="$1"
+ nix_attr="$2"
+ if command -v "$tool_name" >/dev/null 2>&1; then
+ return 0
fi
- fi
- if ! command -v gh >/dev/null 2>&1; then
+ if ! command -v nix >/dev/null 2>&1; then
+ return 1
+ fi
+ if tool_out="$(nix build --no-link --print-out-paths "nixpkgs#$nix_attr" 2>/dev/null)"; then
+ export PATH="$tool_out/bin:$PATH"
+ fi
+ command -v "$tool_name" >/dev/null 2>&1
+ }
+
+ if ! ensure_ci_measurement_tool gh gh; then
echo "::notice::gh is not available; skipping CI measurement PR comment"
can_render_pr_comment=false
fi
- if ! command -v jq >/dev/null 2>&1; then
- echo "::notice::jq is not available; skipping CI measurement PR comment"
+ if ! ensure_ci_measurement_tool node nodejs; then
+ echo "::notice::node is not available; skipping CI measurement PR comment"
can_render_pr_comment=false
fi
+ if ! command -v jq >/dev/null 2>&1; then
+ if ensure_ci_measurement_tool jq jq; then
+ :
+ else
+ echo "::notice::jq is not available; skipping CI measurement PR comment"
+ can_render_pr_comment=false
+ fi
+ fi
if [ -z "${dollar}{GH_TOKEN:-${dollar}{GITHUB_TOKEN:-}}" ]; then
echo "::notice::GH_TOKEN/GITHUB_TOKEN is not set; skipping CI measurement PR comment"
can_render_pr_comment=false
From 702a75f184475b6df3aec094135adbdbdb6c25b9 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Sat, 16 May 2026 01:40:19 +0200
Subject: [PATCH 06/81] Add CI measurement baseline backfill support
---
.github/workflows/ci.yml | 78 +++++++++++++++++++++++++++++--
.github/workflows/ci.yml.genie.ts | 33 +++++++++----
genie/ci-workflow.ts | 5 ++
genie/ci-workflow/measurements.ts | 47 +++++++++++++++++--
4 files changed, 148 insertions(+), 15 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index f43df2c31..17662badd 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -14,6 +14,16 @@ on:
branches: [main]
workflow_dispatch:
inputs:
+ measurement_baseline_ref:
+ description: Optional ref/SHA to checkout before running CI measurement jobs. Used to backfill comparable baseline artifacts.
+ required: false
+ default: ''
+ type: string
+ measurement_baseline_label:
+ description: Optional human label for a measurement baseline backfill run, for example PR number.
+ required: false
+ default: ''
+ type: string
debug_force_nix_diagnostics_failure:
description: 'Temporary debug switch (#272): force post-validation failure to verify diagnostics artifact + summary'
required: false
@@ -22,6 +32,7 @@ on:
jobs:
typecheck:
+ if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
runs-on:
[namespace-profile-linux-x86-64, 'namespace-features:github.run-id=${{ github.run_id }}']
timeout-minutes: 30
@@ -34,6 +45,11 @@ jobs:
GITHUB_TOKEN: ${{ github.token }}
steps:
- uses: actions/checkout@v6
+ - name: Checkout CI measurement baseline ref
+ if: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' }}
+ uses: actions/checkout@v6
+ with:
+ ref: ${{ inputs.measurement_baseline_ref }}
- name: Install Nix
uses: DeterminateSystems/determinate-nix-action@v3
with:
@@ -486,6 +502,7 @@ jobs:
echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:"
echo " https://github.com/overengineeringstudio/effect-utils/issues/201"
lint:
+ if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
runs-on:
[namespace-profile-linux-x86-64, 'namespace-features:github.run-id=${{ github.run_id }}']
timeout-minutes: 30
@@ -498,6 +515,11 @@ jobs:
GITHUB_TOKEN: ${{ github.token }}
steps:
- uses: actions/checkout@v6
+ - name: Checkout CI measurement baseline ref
+ if: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' }}
+ uses: actions/checkout@v6
+ with:
+ ref: ${{ inputs.measurement_baseline_ref }}
- name: Install Nix
uses: DeterminateSystems/determinate-nix-action@v3
with:
@@ -828,6 +850,7 @@ jobs:
echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:"
echo " https://github.com/overengineeringstudio/effect-utils/issues/201"
test:
+ if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
strategy:
fail-fast: false
matrix:
@@ -843,6 +866,11 @@ jobs:
GITHUB_TOKEN: ${{ github.token }}
steps:
- uses: actions/checkout@v6
+ - name: Checkout CI measurement baseline ref
+ if: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' }}
+ uses: actions/checkout@v6
+ with:
+ ref: ${{ inputs.measurement_baseline_ref }}
- name: Install Nix
uses: DeterminateSystems/determinate-nix-action@v3
with:
@@ -1173,6 +1201,7 @@ jobs:
echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:"
echo " https://github.com/overengineeringstudio/effect-utils/issues/201"
nix-check:
+ if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
strategy:
fail-fast: false
matrix:
@@ -1188,6 +1217,11 @@ jobs:
GITHUB_TOKEN: ${{ github.token }}
steps:
- uses: actions/checkout@v6
+ - name: Checkout CI measurement baseline ref
+ if: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' }}
+ uses: actions/checkout@v6
+ with:
+ ref: ${{ inputs.measurement_baseline_ref }}
- name: Install Nix
uses: DeterminateSystems/determinate-nix-action@v3
with:
@@ -1518,6 +1552,7 @@ jobs:
echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:"
echo " https://github.com/overengineeringstudio/effect-utils/issues/201"
nix-fod-check:
+ if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
strategy:
fail-fast: false
matrix:
@@ -1533,6 +1568,11 @@ jobs:
GITHUB_TOKEN: ${{ github.token }}
steps:
- uses: actions/checkout@v6
+ - name: Checkout CI measurement baseline ref
+ if: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' }}
+ uses: actions/checkout@v6
+ with:
+ ref: ${{ inputs.measurement_baseline_ref }}
- name: Install Nix
uses: DeterminateSystems/determinate-nix-action@v3
with:
@@ -1675,6 +1715,7 @@ jobs:
echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:"
echo " https://github.com/overengineeringstudio/effect-utils/issues/201"
pnpm-builder-contract:
+ if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
runs-on:
[namespace-profile-linux-x86-64, 'namespace-features:github.run-id=${{ github.run_id }}']
timeout-minutes: 30
@@ -1687,6 +1728,11 @@ jobs:
GITHUB_TOKEN: ${{ github.token }}
steps:
- uses: actions/checkout@v6
+ - name: Checkout CI measurement baseline ref
+ if: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' }}
+ uses: actions/checkout@v6
+ with:
+ ref: ${{ inputs.measurement_baseline_ref }}
- name: Install Nix
uses: DeterminateSystems/determinate-nix-action@v3
with:
@@ -1934,6 +1980,7 @@ jobs:
echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:"
echo " https://github.com/overengineeringstudio/effect-utils/issues/201"
pnpm-regression:
+ if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
runs-on:
[namespace-profile-linux-x86-64, 'namespace-features:github.run-id=${{ github.run_id }}']
timeout-minutes: 30
@@ -1946,6 +1993,11 @@ jobs:
GITHUB_TOKEN: ${{ github.token }}
steps:
- uses: actions/checkout@v6
+ - name: Checkout CI measurement baseline ref
+ if: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' }}
+ uses: actions/checkout@v6
+ with:
+ ref: ${{ inputs.measurement_baseline_ref }}
- name: Install Nix
uses: DeterminateSystems/determinate-nix-action@v3
with:
@@ -2187,8 +2239,16 @@ jobs:
ARTIFACT_DIR: tmp/devenv-perf-ci
OTEL_SERVICE_NAME: devenv-perf-ci
RUNNER_CLASS: 'namespace-profile-linux-x86-64,namespace-features:github.run-id=${{ github.run_id }}'
+ CI_MEASUREMENT_SUBJECT_REF: ${{ inputs.measurement_baseline_ref || github.ref }}
+ CI_MEASUREMENT_SUBJECT_SHA: ${{ inputs.measurement_baseline_ref || github.sha }}
+ CI_MEASUREMENT_SUBJECT_LABEL: ${{ inputs.measurement_baseline_label }}
steps:
- uses: actions/checkout@v6
+ - name: Checkout CI measurement baseline ref
+ if: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' }}
+ uses: actions/checkout@v6
+ with:
+ ref: ${{ inputs.measurement_baseline_ref }}
- name: Install Nix
uses: DeterminateSystems/determinate-nix-action@v3
with:
@@ -2719,8 +2779,8 @@ jobs:
--arg generatedAt "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
--arg repository "${GITHUB_REPOSITORY:-unknown}" \
--arg branchKind "${GITHUB_EVENT_NAME:-unknown}" \
- --arg ref "${GITHUB_REF:-unknown}" \
- --arg headSha "${GITHUB_SHA:-unknown}" \
+ --arg ref "${CI_MEASUREMENT_SUBJECT_REF:-${GITHUB_REF:-unknown}}" \
+ --arg headSha "${CI_MEASUREMENT_SUBJECT_SHA:-${GITHUB_SHA:-unknown}}" \
--arg baseSha "${GITHUB_BASE_SHA:-}" \
--arg runnerName "${RUNNER_NAME:-unknown}" \
--arg runnerOs "${RUNNER_OS:-unknown}" \
@@ -3697,6 +3757,7 @@ jobs:
retention-days: 30
timeout-minutes: 30
test-integration-notion:
+ if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
runs-on:
[namespace-profile-linux-x86-64, 'namespace-features:github.run-id=${{ github.run_id }}']
timeout-minutes: 30
@@ -3710,6 +3771,11 @@ jobs:
NOTION_TOKEN: ${{ secrets.NOTION_TOKEN }}
steps:
- uses: actions/checkout@v6
+ - name: Checkout CI measurement baseline ref
+ if: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' }}
+ uses: actions/checkout@v6
+ with:
+ ref: ${{ inputs.measurement_baseline_ref }}
- name: Install Nix
uses: DeterminateSystems/determinate-nix-action@v3
with:
@@ -4040,6 +4106,7 @@ jobs:
echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:"
echo " https://github.com/overengineeringstudio/effect-utils/issues/201"
deploy-storybooks:
+ if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
runs-on:
[namespace-profile-linux-x86-64, 'namespace-features:github.run-id=${{ github.run_id }}']
timeout-minutes: 30
@@ -4056,6 +4123,11 @@ jobs:
NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }}
steps:
- uses: actions/checkout@v6
+ - name: Checkout CI measurement baseline ref
+ if: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' }}
+ uses: actions/checkout@v6
+ with:
+ ref: ${{ inputs.measurement_baseline_ref }}
- name: Install Nix
uses: DeterminateSystems/determinate-nix-action@v3
with:
@@ -4870,7 +4942,7 @@ jobs:
- pnpm-builder-contract
- pnpm-regression
- deploy-storybooks
- if: (github.ref == 'refs/heads/main') && github.event_name == 'push'
+ if: ${{ ((github.ref == 'refs/heads/main') && github.event_name == 'push') && !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
steps:
- name: Dispatch alignment to coordinator
env:
diff --git a/.github/workflows/ci.yml.genie.ts b/.github/workflows/ci.yml.genie.ts
index cb13e63d8..06b55b4d1 100644
--- a/.github/workflows/ci.yml.genie.ts
+++ b/.github/workflows/ci.yml.genie.ts
@@ -15,6 +15,10 @@ import {
savePnpmStateStep,
standardCIEnv,
ciWorkflow,
+ ciMeasurementBaselineCheckoutStep,
+ ciMeasurementBaselineWorkflowDispatchInputs,
+ ciMeasurementNotBaselineBackfillPredicate,
+ ciMeasurementSubjectEnv,
ciMeasurementsCommentPermissions,
devenvPerfJob,
namespaceRunner,
@@ -30,6 +34,7 @@ import { type GitHubWorkflowArgs } from '../../packages/@overeng/genie/src/runti
const baseSteps = [
checkoutStep(),
+ ciMeasurementBaselineCheckoutStep,
installNixStep(),
cachixCliBuildStep,
cachixStep({ name: 'overeng-effect-utils', authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' }),
@@ -140,8 +145,10 @@ const nixDiagnosticsSummaryStep = {
} as const
const jobTimeoutMinutes = 30
+/** Job-level `if` for the regular CI jobs: false (job skipped) when the run is a measurement baseline backfill dispatch. */
+const normalCiIf = `\${{ ${ciMeasurementNotBaselineBackfillPredicate} }}`
const job = (step: { name: string; run: string }, extraSteps: readonly any[] = []) => ({
+ if: normalCiIf,
'runs-on': namespaceRunner({
profile: 'namespace-profile-linux-x86-64',
runId: '${{ github.run_id }}',
@@ -161,6 +168,7 @@ const job = (step: { name: string; run: string }, extraSteps: readonly any[] = [
})
const multiPlatformJob = (step: { name: string; run: string }) => ({
+ if: normalCiIf,
strategy: {
'fail-fast': false,
matrix: {
@@ -186,6 +194,7 @@ const multiPlatformJob = (step: { name: string; run: string }) => ({
const strictNixJobBaseSteps = [
checkoutStep(),
+ ciMeasurementBaselineCheckoutStep,
installNixStep(),
cachixCliBuildStep,
cachixStep({ name: 'overeng-effect-utils', authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' }),
@@ -193,6 +202,7 @@ const strictNixJobBaseSteps = [
] as const
const multiPlatformStrictNixJob = (step: ReturnType) => ({
+ if: normalCiIf,
strategy: {
'fail-fast': false,
matrix: {
@@ -275,6 +285,7 @@ const extraJobs: Record = {
baselineSeedRunIds: ['25710204667'],
baselineMaxRuns: 20,
regressionMode: 'fail',
+ env: ciMeasurementSubjectEnv,
setupSteps: baseSteps,
taskProbes: [
{
@@ -336,6 +347,7 @@ const extraJobs: Record = {
},
/** Integration tests for Notion API (requires NOTION_TOKEN secret) */
'test-integration-notion': {
+ if: normalCiIf,
'runs-on': namespaceRunner({
profile: 'namespace-profile-linux-x86-64',
runId: '${{ github.run_id }}',
@@ -362,6 +374,7 @@ const extraJobs: Record = {
const deployJobs: Record = {
'deploy-storybooks': {
+ if: normalCiIf,
'runs-on': namespaceRunner({
profile: 'namespace-profile-linux-x86-64',
runId: '${{ github.run_id }}',
@@ -396,6 +409,7 @@ export default ciWorkflow({
pull_request: { branches: ['main'] },
workflow_dispatch: {
inputs: {
+ ...ciMeasurementBaselineWorkflowDispatchInputs,
debug_force_nix_diagnostics_failure: {
description:
'Temporary debug switch (#272): force post-validation failure to verify diagnostics artifact + summary',
@@ -410,13 +424,16 @@ export default ciWorkflow({
...jobs,
...extraJobs,
...deployJobs,
- 'notify-alignment': notifyAlignmentJob({
- targetRepo: 'schickling/megarepo-all',
- needs: [...Object.keys(jobs), ...Object.keys(deployJobs)],
- runner: [
- 'namespace-profile-linux-x86-64',
- 'namespace-features:github.run-id=${{ github.run_id }}',
- ],
- }),
+    'notify-alignment': (() => {
+      // notifyAlignmentJob sets its own `if` (the previously generated workflow gated this job on
+      // a push to main). Spreading the job and then assigning `if: normalCiIf` replaced that gate,
+      // so the alignment dispatch would also run for pull requests. AND the backfill guard onto
+      // the helper's condition instead of overwriting it.
+      const alignmentJob = notifyAlignmentJob({
+        targetRepo: 'schickling/megarepo-all',
+        needs: [...Object.keys(jobs), ...Object.keys(deployJobs)],
+        runner: [
+          'namespace-profile-linux-x86-64',
+          'namespace-features:github.run-id=${{ github.run_id }}',
+        ],
+      })
+      return {
+        ...alignmentJob,
+        // Assumes the helper returns a bare expression (no `${{ }}` wrapper), as the generated YAML shows.
+        if: `\${{ (${alignmentJob.if}) && ${ciMeasurementNotBaselineBackfillPredicate} }}`,
+      }
+    })(),
},
} satisfies GitHubWorkflowArgs)
diff --git a/genie/ci-workflow.ts b/genie/ci-workflow.ts
index 5f2dda0cb..5c2cb38c7 100644
--- a/genie/ci-workflow.ts
+++ b/genie/ci-workflow.ts
@@ -49,6 +49,11 @@ export {
} from './ci-workflow/shared.ts'
export {
ciMeasurementMetrics,
+ ciMeasurementBaselineBackfillPredicate,
+ ciMeasurementBaselineCheckoutStep,
+ ciMeasurementBaselineWorkflowDispatchInputs,
+ ciMeasurementNotBaselineBackfillPredicate,
+ ciMeasurementSubjectEnv,
ciMeasurementsArtifactStep,
ciMeasurementsCommentPermissions,
compareCiMeasurementsStep,
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index c0e3cd974..e9971c7b9 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -112,6 +112,45 @@ export const ciMeasurementsCommentPermissions = {
'pull-requests': 'write',
} as const
+/** Workflow-dispatch inputs used to recreate measurement baselines for older commits. */
+export const ciMeasurementBaselineWorkflowDispatchInputs = {
+ measurement_baseline_ref: {
+ description:
+ 'Optional ref/SHA to checkout before running CI measurement jobs. Used to backfill comparable baseline artifacts.',
+ required: false,
+ default: '',
+ type: 'string',
+ },
+ measurement_baseline_label: {
+ description: 'Optional human label for a measurement baseline backfill run, for example PR number.',
+ required: false,
+ default: '',
+ type: 'string',
+ },
+} as const
+
+export const ciMeasurementBaselineBackfillPredicate =
+ "github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != ''" as const
+export const ciMeasurementNotBaselineBackfillPredicate =
+ `!(${ciMeasurementBaselineBackfillPredicate})` as const
+
+/** Conditional checkout step that replaces the default checkout with the baseline subject. */
+export const ciMeasurementBaselineCheckoutStep = {
+ name: 'Checkout CI measurement baseline ref',
+ if: `\${{ ${ciMeasurementBaselineBackfillPredicate} }}`,
+ uses: 'actions/checkout@v6',
+ with: {
+ ref: '${{ inputs.measurement_baseline_ref }}',
+ },
+} as const
+
+/** Subject metadata env for measurement artifacts: uses the baseline backfill ref input when set, otherwise github.ref / github.sha. NOTE(review): SUBJECT_SHA receives the raw ref input, which may be a branch or tag name rather than a commit SHA; confirm consumers tolerate that. */
+export const ciMeasurementSubjectEnv = {
+ CI_MEASUREMENT_SUBJECT_REF: '${{ inputs.measurement_baseline_ref || github.ref }}',
+ CI_MEASUREMENT_SUBJECT_SHA: '${{ inputs.measurement_baseline_ref || github.sha }}',
+ CI_MEASUREMENT_SUBJECT_LABEL: '${{ inputs.measurement_baseline_label }}',
+} as const
+
type DevenvPerfSetupStep = GitHubWorkflowArgs['jobs'][string]['steps'][number]
export type DevenvPerfTaskProbe =
| string
@@ -499,8 +538,8 @@ jq -n \
--arg generatedAt "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
--arg repository "${dollar}{GITHUB_REPOSITORY:-unknown}" \
--arg branchKind "${dollar}{GITHUB_EVENT_NAME:-unknown}" \
- --arg ref "${dollar}{GITHUB_REF:-unknown}" \
- --arg headSha "${dollar}{GITHUB_SHA:-unknown}" \
+ --arg ref "${dollar}{CI_MEASUREMENT_SUBJECT_REF:-${dollar}{GITHUB_REF:-unknown}}" \
+ --arg headSha "${dollar}{CI_MEASUREMENT_SUBJECT_SHA:-${dollar}{GITHUB_SHA:-unknown}}" \
--arg baseSha "${dollar}{GITHUB_BASE_SHA:-}" \
--arg runnerName "${dollar}{RUNNER_NAME:-unknown}" \
--arg runnerOs "${dollar}{RUNNER_OS:-unknown}" \
@@ -829,8 +868,8 @@ jq -n \
--arg generatedAt "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
--arg repository "${dollar}{GITHUB_REPOSITORY:-unknown}" \
--arg branchKind "${dollar}{GITHUB_EVENT_NAME:-unknown}" \
- --arg ref "${dollar}{GITHUB_REF:-unknown}" \
- --arg headSha "${dollar}{GITHUB_SHA:-unknown}" \
+ --arg ref "${dollar}{CI_MEASUREMENT_SUBJECT_REF:-${dollar}{GITHUB_REF:-unknown}}" \
+ --arg headSha "${dollar}{CI_MEASUREMENT_SUBJECT_SHA:-${dollar}{GITHUB_SHA:-unknown}}" \
--arg baseSha "${dollar}{GITHUB_BASE_SHA:-}" \
--arg runnerName "${dollar}{RUNNER_NAME:-unknown}" \
--arg runnerOs "${dollar}{RUNNER_OS:-unknown}" \
From 988f1d3634d06069fd439a5d92ea623ab8306de6 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Sat, 16 May 2026 02:15:49 +0200
Subject: [PATCH 07/81] Use committed lock for closure measurements
---
genie/ci-workflow/measurements.ts | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index e9971c7b9..816961db1 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -855,7 +855,7 @@ target_group=${shellSingleQuote(targetGroup)}
artifact_file=${artifactFileAssignment}
${targetSystemAssignment}
-out_path="$(nix build --no-link --print-out-paths "$installable")"
+out_path="$(nix build --no-update-lock-file --no-link --print-out-paths "$installable")"
path_info="$ARTIFACT_DIR/nix-closure-path-info.json"
paths_file="$ARTIFACT_DIR/nix-closure-paths.json"
From c2a673ac61a985d2916fb094e4d48c8f0d29ce9e Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Sat, 16 May 2026 03:28:56 +0200
Subject: [PATCH 08/81] Support historical devenv tracing in CI measurements
---
.github/workflows/ci.yml | 13 ++++++++++++-
genie/ci-workflow/measurements.ts | 19 ++++++++++++-------
2 files changed, 24 insertions(+), 8 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 17662badd..aa85915ff 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2684,6 +2684,17 @@ jobs:
for arg in "$@"; do
case "$arg" in
'$DEVENV_BIN') expanded+=("${DEVENV_BIN:?DEVENV_BIN not set}") ;;
+ '$DEVENV_SHELL_TRACE_COMMAND') # feature-detect tracing flags; --help is captured first so grep -q exiting early cannot fail the check
+ if grep -q -- '--trace-to' <<<"$("${DEVENV_BIN:?DEVENV_BIN not set}" --help 2>&1)"; then
+ expanded+=("${DEVENV_BIN:?DEVENV_BIN not set}" "--trace-to" "json:file:$sample_trace" "shell" "--no-reload" "--" "true")
+ elif grep -q -- '--trace-format' <<<"$("${DEVENV_BIN:?DEVENV_BIN not set}" --help 2>&1)"; then
+ expanded+=("${DEVENV_BIN:?DEVENV_BIN not set}" "--trace-format" "json" "shell" "--no-reload" "--" "true")
+ sample_trace=""
+ else
+ expanded+=("${DEVENV_BIN:?DEVENV_BIN not set}" "shell" "--no-reload" "--" "true")
+ sample_trace=""
+ fi
+ ;;
'$ARTIFACT_DIR'*) expanded+=("${ARTIFACT_DIR}${arg#'$ARTIFACT_DIR'}") ;;
'json:file:$trace_file') expanded+=("json:file:$sample_trace") ;;
'$trace_file') expanded+=("file:$sample_trace") ;;
@@ -2737,7 +2748,7 @@ jobs:
fi
}
- measure 'shell_eval_traced' 'Shell eval with OTEL trace' 'devenv shell' 'Evaluates the dev shell with native devenv JSON tracing enabled.' '$ARTIFACT_DIR/traces/shell_eval_traced.json' '0' '1' '{"enabled":false,"minBaselineSources":10,"minCurrentSamples":3,"warnRatio":1.25,"failRatio":1.5,"warnAbs":1.5,"failAbs":3,"noiseFloor":0.5}' '$DEVENV_BIN' '--trace-to' 'json:file:$trace_file' 'shell' '--no-reload' '--' 'true'
+ measure 'shell_eval_traced' 'Shell eval with OTEL trace' 'devenv shell' 'Evaluates the dev shell with native devenv JSON tracing enabled.' '$ARTIFACT_DIR/traces/shell_eval_traced.json' '0' '1' '{"enabled":false,"minBaselineSources":10,"minCurrentSamples":3,"warnRatio":1.25,"failRatio":1.5,"warnAbs":1.5,"failAbs":3,"noiseFloor":0.5}' '$DEVENV_SHELL_TRACE_COMMAND'
measure 'shell_eval_warm' 'Warm shell eval' 'devenv shell' 'Evaluates a warm dev shell without reloading direnv state.' '' '1' '5' '{"enabled":true,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":0.5,"noiseFloor":0.1}' '$DEVENV_BIN' 'shell' '--no-reload' '--' 'true'
measure 'tasks_list' 'devenv tasks list' 'devenv cli' 'Lists devenv tasks to measure task graph loading overhead.' '' '1' '9' '{"enabled":true,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":2,"failRatio":3,"warnAbs":0.25,"failAbs":1,"noiseFloor":0.1}' '$DEVENV_BIN' 'tasks' 'list'
measure 'processes_help' 'devenv processes --help' 'devenv cli' 'Loads the devenv processes command help path.' '' '1' '9' '{"enabled":true,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":2,"failRatio":3,"warnAbs":0.25,"failAbs":1,"noiseFloor":0.1}' '$DEVENV_BIN' 'processes' '--help'
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index 816961db1..bcf0883fe 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -266,13 +266,7 @@ const renderDevenvPerfScript = (
group: 'devenv shell',
description: 'Evaluates the dev shell with native devenv JSON tracing enabled.',
command: [
- '$DEVENV_BIN',
- '--trace-to',
- 'json:file:$trace_file',
- 'shell',
- '--no-reload',
- '--',
- 'true',
+ '$DEVENV_SHELL_TRACE_COMMAND',
],
traceOutput: '$ARTIFACT_DIR/traces/shell_eval_traced.json',
},
@@ -450,6 +444,17 @@ measure() {
for arg in "$@"; do
case "$arg" in
'$DEVENV_BIN') expanded+=("${dollar}{DEVENV_BIN:?DEVENV_BIN not set}") ;;
+ '$DEVENV_SHELL_TRACE_COMMAND') # feature-detect tracing flags; --help is captured first so grep -q exiting early cannot fail the check
+ if grep -q -- '--trace-to' <<<"$("${dollar}{DEVENV_BIN:?DEVENV_BIN not set}" --help 2>&1)"; then
+ expanded+=("${dollar}{DEVENV_BIN:?DEVENV_BIN not set}" "--trace-to" "json:file:$sample_trace" "shell" "--no-reload" "--" "true")
+ elif grep -q -- '--trace-format' <<<"$("${dollar}{DEVENV_BIN:?DEVENV_BIN not set}" --help 2>&1)"; then
+ expanded+=("${dollar}{DEVENV_BIN:?DEVENV_BIN not set}" "--trace-format" "json" "shell" "--no-reload" "--" "true")
+ sample_trace=""
+ else
+ expanded+=("${dollar}{DEVENV_BIN:?DEVENV_BIN not set}" "shell" "--no-reload" "--" "true")
+ sample_trace=""
+ fi
+ ;;
'$ARTIFACT_DIR'*) expanded+=("${dollar}{ARTIFACT_DIR}${dollar}{arg#'$ARTIFACT_DIR'}") ;;
'json:file:$trace_file') expanded+=("json:file:$sample_trace") ;;
'$trace_file') expanded+=("file:$sample_trace") ;;
From 61c147ad3a98cc16037a66418ea80844f7822ce0 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Sat, 16 May 2026 04:01:53 +0200
Subject: [PATCH 09/81] Allow historical baseline probes to publish partial
data
---
.github/workflows/ci.yml | 12 ++++++++++--
genie/ci-workflow/measurements.ts | 13 +++++++++++--
2 files changed, 21 insertions(+), 4 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index aa85915ff..0a3e1928f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2242,6 +2242,7 @@ jobs:
CI_MEASUREMENT_SUBJECT_REF: ${{ inputs.measurement_baseline_ref || github.ref }}
CI_MEASUREMENT_SUBJECT_SHA: ${{ inputs.measurement_baseline_ref || github.sha }}
CI_MEASUREMENT_SUBJECT_LABEL: ${{ inputs.measurement_baseline_label }}
+ CI_MEASUREMENT_ALLOW_PROBE_FAILURES: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && '1' || '' }}
steps:
- uses: actions/checkout@v6
- name: Checkout CI measurement baseline ref
@@ -2742,9 +2743,15 @@ jobs:
json_append_timing "$id" "$label" "$group" "$description" "$status" "$duration_ms" "$ARTIFACT_DIR/$id.stdout" "$ARTIFACT_DIR/$id.stderr" "$trace_file" "$gate_policy"
if [ "$status" -ne 0 ]; then
- echo "::error::$id failed after ${duration_ms}ms; stderr tail follows"
+ if [ "${CI_MEASUREMENT_ALLOW_PROBE_FAILURES:-}" = "1" ]; then
+ echo "::warning::$id failed after ${duration_ms}ms; keeping earlier successful baseline probes and excluding this failed probe from numeric observations"
+ else
+ echo "::error::$id failed after ${duration_ms}ms; stderr tail follows"
+ fi
tail -80 "$stderr" || true
- return "$status"
+ if [ "${CI_MEASUREMENT_ALLOW_PROBE_FAILURES:-}" != "1" ]; then
+ return "$status"
+ fi
fi
}
@@ -2835,6 +2842,7 @@ jobs:
target: { kind: "devenv", id: "dev-shell", name: "dev-shell", label: "Dev shell", group: "devenv", system: $targetSystem },
observations: (
$timings[0]
+ | map(select(.status == 0))
| map({
id: ("devenv." + .id + ".duration"),
label: .label,
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index bcf0883fe..48b44d5d6 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -149,6 +149,8 @@ export const ciMeasurementSubjectEnv = {
CI_MEASUREMENT_SUBJECT_REF: '${{ inputs.measurement_baseline_ref || github.ref }}',
CI_MEASUREMENT_SUBJECT_SHA: '${{ inputs.measurement_baseline_ref || github.sha }}',
CI_MEASUREMENT_SUBJECT_LABEL: '${{ inputs.measurement_baseline_label }}',
+ CI_MEASUREMENT_ALLOW_PROBE_FAILURES:
+ "${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && '1' || '' }}",
} as const
type DevenvPerfSetupStep = GitHubWorkflowArgs['jobs'][string]['steps'][number]
@@ -502,9 +504,15 @@ measure() {
json_append_timing "$id" "$label" "$group" "$description" "$status" "$duration_ms" "$ARTIFACT_DIR/$id.stdout" "$ARTIFACT_DIR/$id.stderr" "$trace_file" "$gate_policy"
if [ "$status" -ne 0 ]; then
- echo "::error::$id failed after ${dollar}{duration_ms}ms; stderr tail follows"
+ if [ "${dollar}{CI_MEASUREMENT_ALLOW_PROBE_FAILURES:-}" = "1" ]; then
+ echo "::warning::$id failed after ${dollar}{duration_ms}ms; keeping earlier successful baseline probes and excluding this failed probe from numeric observations"
+ else
+ echo "::error::$id failed after ${dollar}{duration_ms}ms; stderr tail follows"
+ fi
tail -80 "$stderr" || true
- return "$status"
+ if [ "${dollar}{CI_MEASUREMENT_ALLOW_PROBE_FAILURES:-}" != "1" ]; then
+ return "$status"
+ fi
fi
}
@@ -588,6 +596,7 @@ jq -n \
target: { kind: "devenv", id: "dev-shell", name: "dev-shell", label: "Dev shell", group: "devenv", system: $targetSystem },
observations: (
$timings[0]
+ | map(select(.status == 0))
| map({
id: ("devenv." + .id + ".duration"),
label: .label,
From c0a028f7a16f5aaa5b55645e1c96737d75866949 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Sat, 16 May 2026 07:21:25 +0200
Subject: [PATCH 10/81] Fix CI measurement seed artifact lookup
---
.github/workflows/ci.yml | 4 ++--
genie/ci-workflow/measurements.ts | 4 ++--
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 0a3e1928f..5b1561c33 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2476,9 +2476,9 @@ jobs:
artifact_json="$(
gh api "repos/$repo/actions/runs/$candidate_run/artifacts" \
- --jq '.artifacts
+ | jq --arg artifactName "$BASELINE_ARTIFACT_NAME" '.artifacts
| map(select(.expired == false))
- | map(select(.name == env.BASELINE_ARTIFACT_NAME or (.name | startswith(env.BASELINE_ARTIFACT_NAME + "-"))))
+ | map(select(.name == $artifactName or (.name | startswith($artifactName + "-"))))
| sort_by(.created_at // "")
| reverse
| .[0] // empty'
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index 48b44d5d6..7f1374a92 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -740,9 +740,9 @@ for candidate_run in $candidate_runs; do
artifact_json="$(
gh api "repos/$repo/actions/runs/$candidate_run/artifacts" \
- --jq '.artifacts
+ | jq --arg artifactName "$BASELINE_ARTIFACT_NAME" '.artifacts
| map(select(.expired == false))
- | map(select(.name == env.BASELINE_ARTIFACT_NAME or (.name | startswith(env.BASELINE_ARTIFACT_NAME + "-"))))
+ | map(select(.name == $artifactName or (.name | startswith($artifactName + "-"))))
| sort_by(.created_at // "")
| reverse
| .[0] // empty'
From 97f0092244e94dcd3ccf2d9bcf7da8ae898323d5 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Sat, 16 May 2026 07:39:03 +0200
Subject: [PATCH 11/81] Resolve GitHub CLI for CI baseline downloads
---
.github/workflows/ci.yml | 18 ++++++++++++------
genie/ci-workflow/measurements.ts | 18 ++++++++++++------
2 files changed, 24 insertions(+), 12 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 5b1561c33..1b3e7c1da 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2426,17 +2426,23 @@ jobs:
mkdir -p "$BASELINE_OUTPUT_DIR"
- if ! command -v gh >/dev/null 2>&1; then
- echo "::notice::gh is not available; skipping previous artifact download"
- exit 0
+ if command -v gh >/dev/null 2>&1; then
+ GH_BIN="$(command -v gh)"
+ else
+ echo "::notice::gh is not on PATH; resolving GitHub CLI through Nix"
+ if ! GH_BIN="$(nix build --no-link --print-out-paths nixpkgs#gh 2>/dev/null)/bin/gh"; then
+ echo "::notice::unable to resolve GitHub CLI through Nix; skipping previous artifact download"
+ exit 0
+ fi
fi
+ echo "Using GitHub CLI: $GH_BIN"
repo="${GITHUB_REPOSITORY:?GITHUB_REPOSITORY not set}"
workflow="${BASELINE_WORKFLOW_NAME:-CI}"
branch="${BASELINE_BRANCH:-${GITHUB_BASE_REF:-${GITHUB_REF_NAME:-main}}}"
candidate_runs="$(
- gh run list \
+ "$GH_BIN" run list \
--repo "$repo" \
--workflow "$workflow" \
--branch "$branch" \
@@ -2475,7 +2481,7 @@ jobs:
fi
artifact_json="$(
- gh api "repos/$repo/actions/runs/$candidate_run/artifacts" \
+ "$GH_BIN" api "repos/$repo/actions/runs/$candidate_run/artifacts" \
| jq --arg artifactName "$BASELINE_ARTIFACT_NAME" '.artifacts
| map(select(.expired == false))
| map(select(.name == $artifactName or (.name | startswith($artifactName + "-"))))
@@ -2489,7 +2495,7 @@ jobs:
current_artifact_id="$(printf '%s' "$artifact_json" | jq -r '.id')"
current_output_dir="$BASELINE_OUTPUT_DIR/run-$candidate_run"
mkdir -p "$current_output_dir"
- if gh run download "$candidate_run" \
+ if "$GH_BIN" run download "$candidate_run" \
--repo "$repo" \
--name "$current_artifact_name" \
--dir "$current_output_dir"; then
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index 7f1374a92..493a7070c 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -690,17 +690,23 @@ export const downloadPreviousGitHubArtifactStep = (opts: GitHubPreviousArtifactS
mkdir -p "$BASELINE_OUTPUT_DIR"
-if ! command -v gh >/dev/null 2>&1; then
- echo "::notice::gh is not available; skipping previous artifact download"
- exit 0
+if command -v gh >/dev/null 2>&1; then
+ GH_BIN="$(command -v gh)"
+else
+ echo "::notice::gh is not on PATH; resolving GitHub CLI through Nix"
+ if ! GH_BIN="$(nix build --no-link --print-out-paths nixpkgs#gh 2>/dev/null)/bin/gh"; then
+ echo "::notice::unable to resolve GitHub CLI through Nix; skipping previous artifact download"
+ exit 0
+ fi
fi
+echo "Using GitHub CLI: $GH_BIN"
repo="${dollar}{GITHUB_REPOSITORY:?GITHUB_REPOSITORY not set}"
workflow="${dollar}{BASELINE_WORKFLOW_NAME:-CI}"
branch="${dollar}{BASELINE_BRANCH:-${dollar}{GITHUB_BASE_REF:-${dollar}{GITHUB_REF_NAME:-main}}}"
candidate_runs="$(
- gh run list \
+ "$GH_BIN" run list \
--repo "$repo" \
--workflow "$workflow" \
--branch "$branch" \
@@ -739,7 +745,7 @@ for candidate_run in $candidate_runs; do
fi
artifact_json="$(
- gh api "repos/$repo/actions/runs/$candidate_run/artifacts" \
+ "$GH_BIN" api "repos/$repo/actions/runs/$candidate_run/artifacts" \
| jq --arg artifactName "$BASELINE_ARTIFACT_NAME" '.artifacts
| map(select(.expired == false))
| map(select(.name == $artifactName or (.name | startswith($artifactName + "-"))))
@@ -753,7 +759,7 @@ for candidate_run in $candidate_runs; do
current_artifact_id="$(printf '%s' "$artifact_json" | jq -r '.id')"
current_output_dir="$BASELINE_OUTPUT_DIR/run-$candidate_run"
mkdir -p "$current_output_dir"
- if gh run download "$candidate_run" \
+ if "$GH_BIN" run download "$candidate_run" \
--repo "$repo" \
--name "$current_artifact_name" \
--dir "$current_output_dir"; then
From 76c5c70e4549e0f61343b5c532b2d35b51f4a375 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Sat, 16 May 2026 07:51:28 +0200
Subject: [PATCH 12/81] Allow CI measurements to read baseline artifacts
---
.github/workflows/ci.yml | 1 +
genie/ci-workflow/measurements.ts | 1 +
2 files changed, 2 insertions(+)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 1b3e7c1da..78811e68c 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2226,6 +2226,7 @@ jobs:
runs-on:
[namespace-profile-linux-x86-64, 'namespace-features:github.run-id=${{ github.run_id }}']
permissions:
+ actions: read
contents: write
issues: write
pull-requests: write
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index 493a7070c..c5e2a8ef0 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -107,6 +107,7 @@ export type CiMeasurementsArtifactStepOptions = {
/** Job-level permissions required when CI measurement comparison posts PR comments. */
export const ciMeasurementsCommentPermissions = {
+ actions: 'read',
contents: 'write',
issues: 'write',
'pull-requests': 'write',
From 7b42b8d1ee88c3134a3c369f0844f02f8954985d Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Sat, 16 May 2026 08:07:51 +0200
Subject: [PATCH 13/81] Prefer explicit CI measurement baseline seeds
---
.github/workflows/ci.yml | 4 ++--
genie/ci-workflow/measurements.ts | 4 ++--
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 78811e68c..50da00c66 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2454,8 +2454,8 @@ jobs:
--jq '[.[] | select(.headSha != env.GITHUB_SHA) | .databaseId] | .[]'
)"
- candidate_runs="$candidate_runs
- $BASELINE_SEED_RUN_IDS"
+ candidate_runs="$BASELINE_SEED_RUN_IDS
+ $candidate_runs"
max_runs="${BASELINE_MAX_RUNS:-5}"
if ! [[ "$max_runs" =~ ^[0-9]+$ ]] || [ "$max_runs" -lt 1 ]; then
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index c5e2a8ef0..31009229b 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -718,8 +718,8 @@ candidate_runs="$(
--jq '[.[] | select(.headSha != env.GITHUB_SHA) | .databaseId] | .[]'
)"
-candidate_runs="$candidate_runs
-$BASELINE_SEED_RUN_IDS"
+candidate_runs="$BASELINE_SEED_RUN_IDS
+$candidate_runs"
max_runs="${dollar}{BASELINE_MAX_RUNS:-5}"
if ! [[ "$max_runs" =~ ^[0-9]+$ ]] || [ "$max_runs" -lt 1 ]; then
From 7e60e57d3dbd7485a83f771e80dd78b68cc3c4d3 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Sat, 16 May 2026 08:20:13 +0200
Subject: [PATCH 14/81] Read nested CI measurement baselines
---
.github/workflows/ci.yml | 3 +--
genie/ci-workflow/measurements.ts | 3 +--
2 files changed, 2 insertions(+), 4 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 50da00c66..3235b2e7f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2946,8 +2946,7 @@ jobs:
baseline_index="$(mktemp)"
find "$current_dir" -path "$baseline_dir" -prune -o -name measurements.json -type f -print | sort >"$current_index" || true
{
- find "$baseline_dir" -maxdepth 1 -name measurements.json -type f -print
- find "$baseline_dir" -mindepth 2 -maxdepth 2 -name measurements.json -type f -print
+ find "$baseline_dir" -name measurements.json -type f -print
} | sort -u >"$baseline_index" || true
if [ ! -s "$current_index" ]; then
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index 31009229b..f1c3ac136 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -1029,8 +1029,7 @@ current_index="$(mktemp)"
baseline_index="$(mktemp)"
find "$current_dir" -path "$baseline_dir" -prune -o -name measurements.json -type f -print | sort >"$current_index" || true
{
- find "$baseline_dir" -maxdepth 1 -name measurements.json -type f -print
- find "$baseline_dir" -mindepth 2 -maxdepth 2 -name measurements.json -type f -print
+ find "$baseline_dir" -name measurements.json -type f -print
} | sort -u >"$baseline_index" || true
if [ ! -s "$current_index" ]; then
From ef7a4e3b74688a05798d016ecfa15a2406bd11e5 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Sat, 16 May 2026 10:44:22 +0200
Subject: [PATCH 15/81] Prune nested CI measurement baselines
---
.github/workflows/ci.yml | 4 ++--
genie/ci-workflow/measurements.ts | 4 ++--
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 3235b2e7f..447ef1bc3 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2944,9 +2944,9 @@ jobs:
current_index="$(mktemp)"
baseline_index="$(mktemp)"
- find "$current_dir" -path "$baseline_dir" -prune -o -name measurements.json -type f -print | sort >"$current_index" || true
+ find "$current_dir" -name baseline -type d -prune -o -name measurements.json -type f -print | sort >"$current_index" || true
{
- find "$baseline_dir" -name measurements.json -type f -print
+ find "$baseline_dir" -name baseline -type d ! -path "$baseline_dir" -prune -o -name measurements.json -type f -print
} | sort -u >"$baseline_index" || true
if [ ! -s "$current_index" ]; then
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index f1c3ac136..6c9d00967 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -1027,9 +1027,9 @@ fi
current_index="$(mktemp)"
baseline_index="$(mktemp)"
-find "$current_dir" -path "$baseline_dir" -prune -o -name measurements.json -type f -print | sort >"$current_index" || true
+find "$current_dir" -name baseline -type d -prune -o -name measurements.json -type f -print | sort >"$current_index" || true
{
- find "$baseline_dir" -name measurements.json -type f -print
+ find "$baseline_dir" -name baseline -type d ! -path "$baseline_dir" -prune -o -name measurements.json -type f -print
} | sort -u >"$baseline_index" || true
if [ ! -s "$current_index" ]; then
From 7ad96c535bf16f269d0888fdadbf4f3d14866fe2 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Sat, 16 May 2026 10:59:19 +0200
Subject: [PATCH 16/81] Record only bounded baseline candidates
---
.github/workflows/ci.yml | 2 +-
genie/ci-workflow/measurements.ts | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 447ef1bc3..d26241075 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2476,10 +2476,10 @@ jobs:
if grep -qxF "$candidate_run" "$seen_runs_file"; then
continue
fi
- printf '%s\n' "$candidate_run" >>"$seen_runs_file"
if [ "$(wc -l <"$downloaded_runs_file" | tr -d ' ')" -ge "$max_runs" ]; then
break
fi
+ printf '%s\n' "$candidate_run" >>"$seen_runs_file"
artifact_json="$(
"$GH_BIN" api "repos/$repo/actions/runs/$candidate_run/artifacts" \
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index 6c9d00967..365d5c2a1 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -740,10 +740,10 @@ for candidate_run in $candidate_runs; do
if grep -qxF "$candidate_run" "$seen_runs_file"; then
continue
fi
- printf '%s\n' "$candidate_run" >>"$seen_runs_file"
if [ "$(wc -l <"$downloaded_runs_file" | tr -d ' ')" -ge "$max_runs" ]; then
break
fi
+ printf '%s\n' "$candidate_run" >>"$seen_runs_file"
artifact_json="$(
"$GH_BIN" api "repos/$repo/actions/runs/$candidate_run/artifacts" \
From 5e842c815dacfeccf1b16f4e7d46e45f7a0a10b1 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Sat, 16 May 2026 11:27:20 +0200
Subject: [PATCH 17/81] Track typed measurement baseline seeds
---
.github/workflows/ci.yml | 23 ++++++++--
.github/workflows/ci.yml.genie.ts | 9 +++-
.../ci-measurement-comparison.test.sh | 13 ++++++
genie/ci-workflow/measurements.ts | 45 +++++++++++++++++--
4 files changed, 81 insertions(+), 9 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index d26241075..3e6cb5edd 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2420,7 +2420,7 @@ jobs:
BASELINE_OUTPUT_DIR: tmp/devenv-perf-ci/baseline
BASELINE_WORKFLOW_NAME: ${{ github.workflow }}
BASELINE_BRANCH: ${{ github.base_ref || github.ref_name }}
- BASELINE_SEED_RUN_IDS: '25710204667'
+ BASELINE_SEED_RUNS_JSON: '[{"runId":"25710204667","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Known comparable v2 devenv performance baseline."}]'
BASELINE_MAX_RUNS: '20'
run: |
set -euo pipefail
@@ -2441,6 +2441,14 @@ jobs:
repo="${GITHUB_REPOSITORY:?GITHUB_REPOSITORY not set}"
workflow="${BASELINE_WORKFLOW_NAME:-CI}"
branch="${BASELINE_BRANCH:-${GITHUB_BASE_REF:-${GITHUB_REF_NAME:-main}}}"
+ seed_runs_file="$BASELINE_OUTPUT_DIR/baseline-seed-runs.json"
+ printf '%s' "${BASELINE_SEED_RUNS_JSON:-[]}" >"$seed_runs_file"
+ if ! jq -e 'if type == "array" then all(.[]; type == "object" and (.runId | type == "string")) else false end' \
+ "$seed_runs_file" >/dev/null; then
+ echo "::error::BASELINE_SEED_RUNS_JSON must be an array of objects with string runId fields"
+ exit 1
+ fi
+ seed_run_ids="$(jq -r '.[].runId' "$seed_runs_file")"
candidate_runs="$(
"$GH_BIN" run list \
@@ -2454,7 +2462,7 @@ jobs:
--jq '[.[] | select(.headSha != env.GITHUB_SHA) | .databaseId] | .[]'
)"
- candidate_runs="$BASELINE_SEED_RUN_IDS
+ candidate_runs="$seed_run_ids
$candidate_runs"
max_runs="${BASELINE_MAX_RUNS:-5}"
@@ -2525,6 +2533,7 @@ jobs:
jq -n \
--slurpfile runs "$downloaded_runs_file" \
+ --slurpfile seedRuns "$seed_runs_file" \
--argjson schemaVersion 1 \
--arg repository "$repo" \
--arg workflow "$workflow" \
@@ -2541,6 +2550,7 @@ jobs:
runId: $runId,
artifactName: $artifactName,
artifactId: $artifactId,
+ seedRuns: ($seedRuns[0] // []),
runs: $runs
}' >"$BASELINE_OUTPUT_DIR/baseline-provenance.json"
@@ -3183,7 +3193,12 @@ jobs:
| (
if any($comparisons[]?; .status == "fail") then "fail"
elif any($comparisons[]?; .status == "warn") then "warn"
- elif any($comparisons[]?; .status == "missing_baseline" and (if (.gatePolicy | has("enabled")) then .gatePolicy.enabled else true end)) then "partial"
+ elif any($comparisons[]?;
+ (if (.gatePolicy | has("enabled")) then .gatePolicy.enabled else true end)
+ and (.gateReason == "missing_baseline"
+ or .gateReason == "low_baseline_count"
+ or .gateReason == "low_current_sample_count")
+ ) then "partial"
else "pass"
end
) as $status
@@ -3217,7 +3232,7 @@ jobs:
echo "::warning::CI measurement regression threshold exceeded"
;;
partial:*)
- echo "::notice::CI measurement baseline is missing for one or more observations"
+ echo "::notice::CI measurement comparison is partial because one or more enabled observations are not gateable"
;;
esac
diff --git a/.github/workflows/ci.yml.genie.ts b/.github/workflows/ci.yml.genie.ts
index 06b55b4d1..2aab51850 100644
--- a/.github/workflows/ci.yml.genie.ts
+++ b/.github/workflows/ci.yml.genie.ts
@@ -282,7 +282,14 @@ const extraJobs: Record = {
runId: '${{ github.run_id }}',
}),
artifactName: 'devenv-perf',
- baselineSeedRunIds: ['25710204667'],
+ baselineSeedRuns: [
+ {
+ runId: '25710204667',
+ source: 'manual-backfill',
+ artifacts: ['devenv-perf'],
+ notes: 'Known comparable v2 devenv performance baseline.',
+ },
+ ],
baselineMaxRuns: 20,
regressionMode: 'fail',
env: ciMeasurementSubjectEnv,
diff --git a/genie/ci-scripts/ci-measurement-comparison.test.sh b/genie/ci-scripts/ci-measurement-comparison.test.sh
index c30aa2069..09e583396 100755
--- a/genie/ci-scripts/ci-measurement-comparison.test.sh
+++ b/genie/ci-scripts/ci-measurement-comparison.test.sh
@@ -102,4 +102,17 @@ if [ "$actual_status" != "fail" ] || [ "$actual_row" != "fail" ]; then
exit 1
fi
+low_baseline_policy='{"enabled":true,"minBaselineSources":2,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":0.5,"noiseFloor":0.1}'
+rm -rf "$tmp_dir/current" "$tmp_dir/baseline"
+write_measurement "$tmp_dir/current/measurements.json" 10.5 devenv-perf-warm-median-v2 "$low_baseline_policy"
+write_measurement "$tmp_dir/baseline/run-1/measurements.json" 10 devenv-perf-warm-median-v2 "$low_baseline_policy"
+run_compare
+actual_status="$(jq -r '.status' "$tmp_dir/comparison.json")"
+actual_row="$(jq -r '.comparisons[] | .status' "$tmp_dir/comparison.json")"
+actual_gate="$(jq -r '.comparisons[] | .gateReason' "$tmp_dir/comparison.json")"
+if [ "$actual_status" != "partial" ] || [ "$actual_row" != "pass" ] || [ "$actual_gate" != "low_baseline_count" ]; then
+ echo "expected low baseline count to be partial but not a regression; got status=$actual_status row=$actual_row gate=$actual_gate" >&2
+ exit 1
+fi
+
echo "ci-measurement-comparison tests passed"
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index 365d5c2a1..c90dcf973 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -79,11 +79,21 @@ export type GitHubPreviousArtifactStepOptions = {
readonly outputDir: string
readonly workflowName?: string
readonly branch?: string
+ readonly seedRuns?: readonly CiMeasurementBaselineSeedRun[]
readonly seedRunIds?: readonly string[]
readonly maxRuns?: number
readonly tokenExpression?: string
}
+export type CiMeasurementBaselineSeedRun = {
+ readonly runId: string
+ readonly label?: string
+ readonly sha?: string
+ readonly source?: 'manual-backfill' | 'main-history' | 'pr-history' | string
+ readonly artifacts?: readonly string[]
+ readonly notes?: string
+}
+
export type CiMeasurementsComparisonStepOptions = {
readonly currentDir?: string
readonly baselineDir?: string
@@ -173,6 +183,7 @@ export type DevenvPerfJobOptions = {
readonly artifactDir?: string
readonly artifactName?: string
readonly baselineArtifactName?: string
+ readonly baselineSeedRuns?: readonly CiMeasurementBaselineSeedRun[]
readonly baselineSeedRunIds?: readonly string[]
readonly baselineMaxRuns?: number
readonly setupSteps?: readonly DevenvPerfSetupStep[]
@@ -674,6 +685,16 @@ export const devenvPerfBenchmarkStep = (
}),
}) as const
+const ciMeasurementBaselineSeedRunsJson = (opts: GitHubPreviousArtifactStepOptions) =>
+ JSON.stringify(
+ opts.seedRuns ??
+ opts.seedRunIds?.map((runId) => ({
+ runId,
+ source: 'manual-backfill',
+ })) ??
+ [],
+ )
+
export const downloadPreviousGitHubArtifactStep = (opts: GitHubPreviousArtifactStepOptions) =>
({
name: `Download previous artifact: ${opts.artifactName}`,
@@ -684,7 +705,7 @@ export const downloadPreviousGitHubArtifactStep = (opts: GitHubPreviousArtifactS
BASELINE_OUTPUT_DIR: opts.outputDir,
BASELINE_WORKFLOW_NAME: opts.workflowName ?? '${{ github.workflow }}',
BASELINE_BRANCH: opts.branch ?? '${{ github.base_ref || github.ref_name }}',
- BASELINE_SEED_RUN_IDS: opts.seedRunIds?.join(' ') ?? '',
+ BASELINE_SEED_RUNS_JSON: ciMeasurementBaselineSeedRunsJson(opts),
BASELINE_MAX_RUNS: String(opts.maxRuns ?? 5),
},
run: String.raw`set -euo pipefail
@@ -705,6 +726,14 @@ echo "Using GitHub CLI: $GH_BIN"
repo="${dollar}{GITHUB_REPOSITORY:?GITHUB_REPOSITORY not set}"
workflow="${dollar}{BASELINE_WORKFLOW_NAME:-CI}"
branch="${dollar}{BASELINE_BRANCH:-${dollar}{GITHUB_BASE_REF:-${dollar}{GITHUB_REF_NAME:-main}}}"
+seed_runs_file="$BASELINE_OUTPUT_DIR/baseline-seed-runs.json"
+printf '%s' "${dollar}{BASELINE_SEED_RUNS_JSON:-[]}" >"$seed_runs_file"
+if ! jq -e 'if type == "array" then all(.[]; type == "object" and (.runId | type == "string")) else false end' \
+ "$seed_runs_file" >/dev/null; then
+ echo "::error::BASELINE_SEED_RUNS_JSON must be an array of objects with string runId fields"
+ exit 1
+fi
+seed_run_ids="$(jq -r '.[].runId' "$seed_runs_file")"
candidate_runs="$(
"$GH_BIN" run list \
@@ -718,7 +747,7 @@ candidate_runs="$(
--jq '[.[] | select(.headSha != env.GITHUB_SHA) | .databaseId] | .[]'
)"
-candidate_runs="$BASELINE_SEED_RUN_IDS
+candidate_runs="$seed_run_ids
$candidate_runs"
max_runs="${dollar}{BASELINE_MAX_RUNS:-5}"
@@ -789,6 +818,7 @@ fi
jq -n \
--slurpfile runs "$downloaded_runs_file" \
+ --slurpfile seedRuns "$seed_runs_file" \
--argjson schemaVersion 1 \
--arg repository "$repo" \
--arg workflow "$workflow" \
@@ -805,6 +835,7 @@ jq -n \
runId: $runId,
artifactName: $artifactName,
artifactId: $artifactId,
+ seedRuns: ($seedRuns[0] // []),
runs: $runs
}' >"$BASELINE_OUTPUT_DIR/baseline-provenance.json"
@@ -1266,7 +1297,12 @@ jq -n \
| (
if any($comparisons[]?; .status == "fail") then "fail"
elif any($comparisons[]?; .status == "warn") then "warn"
- elif any($comparisons[]?; .status == "missing_baseline" and (if (.gatePolicy | has("enabled")) then .gatePolicy.enabled else true end)) then "partial"
+ elif any($comparisons[]?;
+ (if (.gatePolicy | has("enabled")) then .gatePolicy.enabled else true end)
+ and (.gateReason == "missing_baseline"
+ or .gateReason == "low_baseline_count"
+ or .gateReason == "low_current_sample_count")
+ ) then "partial"
else "pass"
end
) as $status
@@ -1300,7 +1336,7 @@ case "$status:$mode" in
echo "::warning::CI measurement regression threshold exceeded"
;;
partial:*)
- echo "::notice::CI measurement baseline is missing for one or more observations"
+ echo "::notice::CI measurement comparison is partial because one or more enabled observations are not gateable"
;;
esac
@@ -1888,6 +1924,7 @@ export const devenvPerfJob = (opts?: DevenvPerfJobOptions) => {
downloadPreviousGitHubArtifactStep({
artifactName: baselineArtifactName,
outputDir: `${artifactDir}/baseline`,
+ seedRuns: opts?.baselineSeedRuns,
seedRunIds: opts?.baselineSeedRunIds,
maxRuns: opts?.baselineMaxRuns,
}),
From 012fe75b61ba8710c162767c815f176d04ff7988 Mon Sep 17 00:00:00 2001
From: schickling-assistant <261620128+schickling-assistant@users.noreply.github.com>
Date: Sat, 16 May 2026 11:39:34 +0200
Subject: [PATCH 18/81] Update measurement seed workflow tests
---
.../github-workflow/ci-workflow-helpers.unit.test.ts | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
index 601c9b908..a4cfb8a1c 100644
--- a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
+++ b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
@@ -453,12 +453,16 @@ describe('ci workflow devenv perf helpers', () => {
expect(generatedCiWorkflowYamlSource).toContain(
'CI_MEASUREMENT_PR_COMMENT_TITLE: Devenv Performance',
)
- expect(generatedCiWorkflowYamlSource).toContain("BASELINE_SEED_RUN_IDS: '25710204667'")
+ expect(generatedCiWorkflowYamlSource).toContain('BASELINE_SEED_RUNS_JSON:')
+ expect(generatedCiWorkflowYamlSource).toContain('"runId":"25710204667"')
expect(generatedCiWorkflowYamlSource).toContain('Upload devenv perf artifacts')
expect(generatedCiWorkflowYamlSource).toContain('retention-days: 30')
expect(ciWorkflowSource).toContain("contents: 'write'")
+ expect(ciWorkflowSource).toContain('seedRuns?: readonly CiMeasurementBaselineSeedRun[]')
expect(ciWorkflowSource).toContain('seedRunIds?: readonly string[]')
+ expect(ciWorkflowSource).toContain('baselineSeedRuns?: readonly CiMeasurementBaselineSeedRun[]')
expect(ciWorkflowSource).toContain('baselineSeedRunIds?: readonly string[]')
+ expect(ciWorkflowSource).toContain('seedRuns: ($seedRuns[0] // [])')
expect(ciWorkflowSource).toContain('baselineProvenance: ($baselineProvenance[0] // null)')
expect(ciWorkflowSource).toContain(
'["devenvRev", "otelServiceName", "status", "probeLabel", "sampleCount", "measuredSampleCount"] | index($key) | not',
From a6813f93f5cdfad73775615e8bfcf1bbfcbf6d9f Mon Sep 17 00:00:00 2001
From: schickling-assistant <261620128+schickling-assistant@users.noreply.github.com>
Date: Sat, 16 May 2026 11:56:23 +0200
Subject: [PATCH 19/81] Expose measurement gate readiness
---
.github/workflows/ci.yml | 20 +++++++++++++++++++
.../ci-measurement-comparison.test.sh | 17 ++++++++++------
genie/ci-workflow/measurements.ts | 20 +++++++++++++++++++
.../ci-workflow-helpers.unit.test.ts | 5 +++++
4 files changed, 56 insertions(+), 6 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 3e6cb5edd..7ac622c86 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -3202,10 +3202,25 @@ jobs:
else "pass"
end
) as $status
+ | (
+ [$comparisons[]?]
+ | {
+ enabledCount: (map(select((if (.gatePolicy | has("enabled")) then .gatePolicy.enabled else true end))) | length),
+ gateableCount: (map(select(.gateable == true)) | length),
+ missingBaselineCount: (map(select(.gateReason == "missing_baseline")) | length),
+ lowBaselineCount: (map(select(.gateReason == "low_baseline_count")) | length),
+ lowCurrentSampleCount: (map(select(.gateReason == "low_current_sample_count")) | length)
+ }
+ | . + {
+ nonGateableCount: (.enabledCount - .gateableCount),
+ enforceable: (.enabledCount == .gateableCount)
+ }
+ ) as $readiness
| {
schemaVersion:$schemaVersion,
status:$status,
mode:$mode,
+ readiness:$readiness,
currentDir:$currentDir,
baselineDir:$baselineDir,
comparisons:$comparisons
@@ -3654,6 +3669,10 @@ jobs:
}
const statusWord = comparison.status || 'unknown'
+ const readiness = comparison.readiness || {}
+ const readinessLabel = readiness.enforceable
+ ? 'enforceable'
+ : 'partial (' + (readiness.gateableCount ?? 0) + '/' + (readiness.enabledCount ?? 0) + ' enabled observations gateable)'
const runUrl = runId ? serverUrl + '/' + repo + '/actions/runs/' + runId : undefined
const shortSha = (headSha || sha || 'unknown').slice(0, 7)
const existingState = extractState(existing?.body)
@@ -3713,6 +3732,7 @@ jobs:
'- Commit: ' + shortSha,
'- Run: ' + runLink,
'- Baseline: ' + baselineLabel,
+ '- Readiness: ' + readinessLabel,
'- Protocol: ' + protocolLabel,
'',
hasComparableBaseline
diff --git a/genie/ci-scripts/ci-measurement-comparison.test.sh b/genie/ci-scripts/ci-measurement-comparison.test.sh
index 09e583396..fe2ea66af 100755
--- a/genie/ci-scripts/ci-measurement-comparison.test.sh
+++ b/genie/ci-scripts/ci-measurement-comparison.test.sh
@@ -86,8 +86,9 @@ write_measurement "$tmp_dir/baseline/run-1/measurements.json" 10 legacy "$policy
run_compare
actual_status="$(jq -r '.status' "$tmp_dir/comparison.json")"
actual_gate="$(jq -r '.comparisons[] | .gateReason' "$tmp_dir/comparison.json")"
-if [ "$actual_status" != "partial" ] || [ "$actual_gate" != "missing_baseline" ]; then
- echo "expected protocol mismatch to be missing_baseline; got status=$actual_status gate=$actual_gate" >&2
+actual_enforceable="$(jq -r '.readiness.enforceable' "$tmp_dir/comparison.json")"
+if [ "$actual_status" != "partial" ] || [ "$actual_gate" != "missing_baseline" ] || [ "$actual_enforceable" != "false" ]; then
+ echo "expected protocol mismatch to be missing_baseline and unenforceable; got status=$actual_status gate=$actual_gate enforceable=$actual_enforceable" >&2
exit 1
fi
@@ -97,8 +98,9 @@ write_measurement "$tmp_dir/baseline/run-1/measurements.json" 10 devenv-perf-war
run_compare
actual_status="$(jq -r '.status' "$tmp_dir/comparison.json")"
actual_row="$(jq -r '.comparisons[] | .status' "$tmp_dir/comparison.json")"
-if [ "$actual_status" != "fail" ] || [ "$actual_row" != "fail" ]; then
- echo "expected confirmed regression to fail; got status=$actual_status row=$actual_row" >&2
+actual_enforceable="$(jq -r '.readiness.enforceable' "$tmp_dir/comparison.json")"
+if [ "$actual_status" != "fail" ] || [ "$actual_row" != "fail" ] || [ "$actual_enforceable" != "true" ]; then
+ echo "expected confirmed regression to fail and be enforceable; got status=$actual_status row=$actual_row enforceable=$actual_enforceable" >&2
exit 1
fi
@@ -110,8 +112,11 @@ run_compare
actual_status="$(jq -r '.status' "$tmp_dir/comparison.json")"
actual_row="$(jq -r '.comparisons[] | .status' "$tmp_dir/comparison.json")"
actual_gate="$(jq -r '.comparisons[] | .gateReason' "$tmp_dir/comparison.json")"
-if [ "$actual_status" != "partial" ] || [ "$actual_row" != "pass" ] || [ "$actual_gate" != "low_baseline_count" ]; then
- echo "expected low baseline count to be partial but not a regression; got status=$actual_status row=$actual_row gate=$actual_gate" >&2
+actual_enforceable="$(jq -r '.readiness.enforceable' "$tmp_dir/comparison.json")"
+actual_gateable_count="$(jq -r '.readiness.gateableCount' "$tmp_dir/comparison.json")"
+actual_enabled_count="$(jq -r '.readiness.enabledCount' "$tmp_dir/comparison.json")"
+if [ "$actual_status" != "partial" ] || [ "$actual_row" != "pass" ] || [ "$actual_gate" != "low_baseline_count" ] || [ "$actual_enforceable" != "false" ] || [ "$actual_gateable_count" != "0" ] || [ "$actual_enabled_count" != "1" ]; then
+ echo "expected low baseline count to be partial but not enforceable; got status=$actual_status row=$actual_row gate=$actual_gate enforceable=$actual_enforceable readiness=$actual_gateable_count/$actual_enabled_count" >&2
exit 1
fi
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index c90dcf973..2e659357d 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -1306,10 +1306,25 @@ jq -n \
else "pass"
end
) as $status
+ | (
+ [$comparisons[]?]
+ | {
+ enabledCount: (map(select((if (.gatePolicy | has("enabled")) then .gatePolicy.enabled else true end))) | length),
+ gateableCount: (map(select(.gateable == true)) | length),
+ missingBaselineCount: (map(select(.gateReason == "missing_baseline")) | length),
+ lowBaselineCount: (map(select(.gateReason == "low_baseline_count")) | length),
+ lowCurrentSampleCount: (map(select(.gateReason == "low_current_sample_count")) | length)
+ }
+ | . + {
+ nonGateableCount: (.enabledCount - .gateableCount),
+ enforceable: (.enabledCount == .gateableCount)
+ }
+ ) as $readiness
| {
schemaVersion:$schemaVersion,
status:$status,
mode:$mode,
+ readiness:$readiness,
currentDir:$currentDir,
baselineDir:$baselineDir,
comparisons:$comparisons
@@ -1758,6 +1773,10 @@ const renderPerfChangeSvg = (rows) => {
}
const statusWord = comparison.status || 'unknown'
+const readiness = comparison.readiness || {}
+const readinessLabel = readiness.enforceable
+ ? 'enforceable'
+ : 'partial (' + (readiness.gateableCount ?? 0) + '/' + (readiness.enabledCount ?? 0) + ' enabled observations gateable)'
const runUrl = runId ? serverUrl + '/' + repo + '/actions/runs/' + runId : undefined
const shortSha = (headSha || sha || 'unknown').slice(0, 7)
const existingState = extractState(existing?.body)
@@ -1817,6 +1836,7 @@ const summaryLines = [
'- Commit: ' + shortSha,
'- Run: ' + runLink,
'- Baseline: ' + baselineLabel,
+ '- Readiness: ' + readinessLabel,
'- Protocol: ' + protocolLabel,
'',
hasComparableBaseline
diff --git a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
index a4cfb8a1c..4050f6934 100644
--- a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
+++ b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
@@ -415,6 +415,10 @@ describe('ci workflow devenv perf helpers', () => {
expect(generatedCiWorkflowYamlSource).toContain('baselineSources')
expect(generatedCiWorkflowYamlSource).toContain('low_baseline_count')
expect(generatedCiWorkflowYamlSource).toContain('low_current_sample_count')
+ expect(generatedCiWorkflowYamlSource).toContain('readiness:$readiness')
+ expect(generatedCiWorkflowYamlSource).toContain(
+ 'enforceable: (.enabledCount == .gateableCount)',
+ )
expect(generatedCiWorkflowYamlSource).toContain('within_baseline_range')
expect(generatedCiWorkflowYamlSource).toContain(
'elif $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"',
@@ -471,6 +475,7 @@ describe('ci workflow devenv perf helpers', () => {
expect(ciWorkflowSource).toContain(
'Chart: performance change versus baseline median. Green is faster, red is slower, gray is within noise or baseline range.',
)
+ expect(ciWorkflowSource).toContain("'- Readiness: ' + readinessLabel")
expect(ciWorkflowSource).toContain('renderPerfChangeSvg')
expect(ciWorkflowSource).toContain('Perf change vs baseline (%)')
expect(ciWorkflowSource).toContain('![Perf change vs baseline chart]')
From 1b45f54ad8fb9eca6deccdf98d3f67eac678c2c8 Mon Sep 17 00:00:00 2001
From: schickling-assistant <261620128+schickling-assistant@users.noreply.github.com>
Date: Sat, 16 May 2026 12:37:05 +0200
Subject: [PATCH 20/81] Allow parallel measurement baseline backfills
---
.github/workflows/ci.yml | 2 +-
genie/ci-workflow/shared.ts | 6 +++++-
2 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 7ac622c86..c1ddd8efe 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2,7 +2,7 @@
# Source: ci.yml.genie.ts
concurrency:
- group: '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}'
+ group: '${{ github.workflow }}-${{ github.event.pull_request.number || inputs.measurement_baseline_ref || github.ref }}'
cancel-in-progress: true
name: CI
diff --git a/genie/ci-workflow/shared.ts b/genie/ci-workflow/shared.ts
index 3d68f870a..9a8930990 100644
--- a/genie/ci-workflow/shared.ts
+++ b/genie/ci-workflow/shared.ts
@@ -54,9 +54,13 @@ export const standardCIEnv = {
* The group key intentionally does not include the job name so a new push
* cancels the entire older workflow run rather than letting stale sibling jobs
* continue consuming runner capacity.
+ *
+ * Measurement baseline backfills are keyed by their subject ref so several
+ * historical refs can be backfilled without canceling each other.
*/
export const ciWorkflowConcurrency = {
- group: '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}',
+ group:
+ '${{ github.workflow }}-${{ github.event.pull_request.number || inputs.measurement_baseline_ref || github.ref }}',
'cancel-in-progress': true,
} as const
From 3daf309dd0315f43fa5c860bf4f3775e678a9bbb Mon Sep 17 00:00:00 2001
From: schickling-assistant <261620128+schickling-assistant@users.noreply.github.com>
Date: Sat, 16 May 2026 12:52:58 +0200
Subject: [PATCH 21/81] Seed effect-utils devenv perf baselines
---
.github/workflows/ci.yml | 2 +-
.github/workflows/ci.yml.genie.ts | 31 ++++++++++++++++---
.../ci-workflow-helpers.unit.test.ts | 5 ++-
3 files changed, 32 insertions(+), 6 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c1ddd8efe..458e46f06 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2420,7 +2420,7 @@ jobs:
BASELINE_OUTPUT_DIR: tmp/devenv-perf-ci/baseline
BASELINE_WORKFLOW_NAME: ${{ github.workflow }}
BASELINE_BRANCH: ${{ github.base_ref || github.ref_name }}
- BASELINE_SEED_RUNS_JSON: '[{"runId":"25710204667","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Known comparable v2 devenv performance baseline."}]'
+ BASELINE_SEED_RUNS_JSON: '[{"runId":"25959801150","label":"PR #655","sha":"df0420cd0397ffc6928d3c6ccc9c23052d6bc255","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959802067","label":"PR #657","sha":"62833cba5d83b1c13462728edeafa684e61c006f","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959802958","label":"PR #656","sha":"21029998522a0e9435df151259611650fb948a20","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959803805","label":"PR #651","sha":"95515f971b27ef279e39c982f52e46cf9e8270e9","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959804678","label":"PR #654","sha":"58e96b9a2b87b3703de6920b6d9571f3805d0171","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959805512","label":"PR #653","sha":"d1cca16339f19d7e1a27b001edc4c2c7ecd13dc4","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959806473","label":"PR #652","sha":"acd6c63f5e235e7e5f2710fc62b2231e0ba904a6","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959807303","label":"PR #648","sha":"a5a07703ff951fb7396a40844e9491d88ed40edf","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959808097","label":"PR #649","sha":"360ff47c59a206064711dfcb6c610afd0e6b0d53","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959808775","label":"PR #647","sha":"8d1810b2c359ae95f245e56329018aab5020f8c0","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959809449","label":"PR #646","sha":"89e1396766ccd2a813680acd440cb78f540ca6c1","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959810069","label":"PR #643","sha":"239715520370436901a3f2218d162dc7b12f4b4c","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959810666","label":"PR #641","sha":"6b3751b4684ba45f496f1a1bff8b86ef6ba8275b","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959811321","label":"PR #640","sha":"fed50ae2502ac0a65395bbef5af43fcf384d5d04","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959811864","label":"PR #639","sha":"0e03df2c6f20e4d154f286fd69a4e2980d21a12d","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959812634","label":"PR #636","sha":"7efdbee4b571f2c80f5b6173bc9a84b51fbef5eb","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959813189","label":"PR #638","sha":"350d1b98baa943dcae63412eeffded7b5160bc8a","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959813761","label":"PR #637","sha":"f25336193b9f6b042eb027eca27acc4cc75a69d6","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959814335","label":"PR #634","sha":"4ba441d4ad8b6c49e9ee03d9cdfd2f04a129b714","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959814835","label":"PR #632","sha":"1ad5fd735c7f45ad5e07c8033e5b68a642ada69c","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."}]'
BASELINE_MAX_RUNS: '20'
run: |
set -euo pipefail
diff --git a/.github/workflows/ci.yml.genie.ts b/.github/workflows/ci.yml.genie.ts
index 2aab51850..c466ddd64 100644
--- a/.github/workflows/ci.yml.genie.ts
+++ b/.github/workflows/ci.yml.genie.ts
@@ -283,12 +283,35 @@ const extraJobs: Record = {
}),
artifactName: 'devenv-perf',
baselineSeedRuns: [
- {
- runId: '25710204667',
+ ...[
+ ['25959801150', '655', 'df0420cd0397ffc6928d3c6ccc9c23052d6bc255'],
+ ['25959802067', '657', '62833cba5d83b1c13462728edeafa684e61c006f'],
+ ['25959802958', '656', '21029998522a0e9435df151259611650fb948a20'],
+ ['25959803805', '651', '95515f971b27ef279e39c982f52e46cf9e8270e9'],
+ ['25959804678', '654', '58e96b9a2b87b3703de6920b6d9571f3805d0171'],
+ ['25959805512', '653', 'd1cca16339f19d7e1a27b001edc4c2c7ecd13dc4'],
+ ['25959806473', '652', 'acd6c63f5e235e7e5f2710fc62b2231e0ba904a6'],
+ ['25959807303', '648', 'a5a07703ff951fb7396a40844e9491d88ed40edf'],
+ ['25959808097', '649', '360ff47c59a206064711dfcb6c610afd0e6b0d53'],
+ ['25959808775', '647', '8d1810b2c359ae95f245e56329018aab5020f8c0'],
+ ['25959809449', '646', '89e1396766ccd2a813680acd440cb78f540ca6c1'],
+ ['25959810069', '643', '239715520370436901a3f2218d162dc7b12f4b4c'],
+ ['25959810666', '641', '6b3751b4684ba45f496f1a1bff8b86ef6ba8275b'],
+ ['25959811321', '640', 'fed50ae2502ac0a65395bbef5af43fcf384d5d04'],
+ ['25959811864', '639', '0e03df2c6f20e4d154f286fd69a4e2980d21a12d'],
+ ['25959812634', '636', '7efdbee4b571f2c80f5b6173bc9a84b51fbef5eb'],
+ ['25959813189', '638', '350d1b98baa943dcae63412eeffded7b5160bc8a'],
+ ['25959813761', '637', 'f25336193b9f6b042eb027eca27acc4cc75a69d6'],
+ ['25959814335', '634', '4ba441d4ad8b6c49e9ee03d9cdfd2f04a129b714'],
+ ['25959814835', '632', '1ad5fd735c7f45ad5e07c8033e5b68a642ada69c'],
+ ].map(([runId, pr, sha]) => ({
+ runId,
+ label: `PR #${pr}`,
+ sha,
source: 'manual-backfill',
artifacts: ['devenv-perf'],
- notes: 'Known comparable v2 devenv performance baseline.',
- },
+ notes: 'Backfilled with the current measurement workflow for the effect-utils #658 rollout.',
+ })),
],
baselineMaxRuns: 20,
regressionMode: 'fail',
diff --git a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
index 4050f6934..07e71845b 100644
--- a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
+++ b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
@@ -458,7 +458,10 @@ describe('ci workflow devenv perf helpers', () => {
'CI_MEASUREMENT_PR_COMMENT_TITLE: Devenv Performance',
)
expect(generatedCiWorkflowYamlSource).toContain('BASELINE_SEED_RUNS_JSON:')
- expect(generatedCiWorkflowYamlSource).toContain('"runId":"25710204667"')
+ expect(generatedCiWorkflowYamlSource).toContain('"runId":"25959801150"')
+ expect(generatedCiWorkflowYamlSource).toContain('"runId":"25959814835"')
+ expect(generatedCiWorkflowYamlSource).toContain('"label":"PR #655"')
+ expect(generatedCiWorkflowYamlSource).toContain('"label":"PR #632"')
expect(generatedCiWorkflowYamlSource).toContain('Upload devenv perf artifacts')
expect(generatedCiWorkflowYamlSource).toContain('retention-days: 30')
expect(ciWorkflowSource).toContain("contents: 'write'")
From 15226c8ef65675703f2cc0ce66f501e70b75aaf1 Mon Sep 17 00:00:00 2001
From: schickling-assistant <261620128+schickling-assistant@users.noreply.github.com>
Date: Sat, 16 May 2026 13:01:06 +0200
Subject: [PATCH 22/81] Filter non-comparable measurement history
---
.github/workflows/ci.yml | 7 ++++++-
genie/ci-workflow/measurements.ts | 7 ++++++-
2 files changed, 12 insertions(+), 2 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 458e46f06..5b93ef473 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -3697,7 +3697,12 @@ jobs:
ratio: formatRatio(row.ratio),
})),
}
- const previousRuns = (existingState?.runs || []).filter((run) => run.commitSha !== currentRun.commitSha)
+ const hasComparableHistory = (run) => Array.isArray(run.visibleRows) && run.visibleRows.some((row) =>
+ row.status !== 'missing_baseline' &&
+ row.baseline !== 'n/a' &&
+ row.ratio !== 'n/a'
+ )
+ const previousRuns = (existingState?.runs || []).filter((run) => run.commitSha !== currentRun.commitSha && hasComparableHistory(run))
const historyLimit = Number.isFinite(maxHistory) && maxHistory > 0 ? maxHistory : 20
const state = { _tag: stateTag, schemaVersion, title, runs: [currentRun, ...previousRuns].slice(0, historyLimit) }
const gateModeLabel = (mode) => {
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index 2e659357d..918f5a243 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -1801,7 +1801,12 @@ const currentRun = {
ratio: formatRatio(row.ratio),
})),
}
-const previousRuns = (existingState?.runs || []).filter((run) => run.commitSha !== currentRun.commitSha)
+const hasComparableHistory = (run) => Array.isArray(run.visibleRows) && run.visibleRows.some((row) =>
+ row.status !== 'missing_baseline' &&
+ row.baseline !== 'n/a' &&
+ row.ratio !== 'n/a'
+)
+const previousRuns = (existingState?.runs || []).filter((run) => run.commitSha !== currentRun.commitSha && hasComparableHistory(run))
const historyLimit = Number.isFinite(maxHistory) && maxHistory > 0 ? maxHistory : 20
const state = { _tag: stateTag, schemaVersion, title, runs: [currentRun, ...previousRuns].slice(0, historyLimit) }
const gateModeLabel = (mode) => {
From d3139f898dd82c0bce61bb778be1cbac589e5ae0 Mon Sep 17 00:00:00 2001
From: schickling-assistant <261620128+schickling-assistant@users.noreply.github.com>
Date: Sat, 16 May 2026 13:09:43 +0200
Subject: [PATCH 23/81] Make measurement backfill concurrency reusable
---
.github/workflows/ci.yml | 2 +-
genie/ci-workflow/shared.ts | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 5b93ef473..1b5f79391 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2,7 +2,7 @@
# Source: ci.yml.genie.ts
concurrency:
- group: '${{ github.workflow }}-${{ github.event.pull_request.number || inputs.measurement_baseline_ref || github.ref }}'
+ group: '${{ github.workflow }}-${{ github.event.pull_request.number || github.event.inputs.measurement_baseline_ref || github.ref }}'
cancel-in-progress: true
name: CI
diff --git a/genie/ci-workflow/shared.ts b/genie/ci-workflow/shared.ts
index 9a8930990..29ba9deee 100644
--- a/genie/ci-workflow/shared.ts
+++ b/genie/ci-workflow/shared.ts
@@ -60,7 +60,7 @@ export const standardCIEnv = {
*/
export const ciWorkflowConcurrency = {
group:
- '${{ github.workflow }}-${{ github.event.pull_request.number || inputs.measurement_baseline_ref || github.ref }}',
+ '${{ github.workflow }}-${{ github.event.pull_request.number || github.event.inputs.measurement_baseline_ref || github.ref }}',
'cancel-in-progress': true,
} as const
From 5bc374d4eaa361fcee5e8cf743aa51cd94a4d3aa Mon Sep 17 00:00:00 2001
From: schickling-assistant <261620128+schickling-assistant@users.noreply.github.com>
Date: Sat, 16 May 2026 15:27:10 +0200
Subject: [PATCH 24/81] Accumulate compatible measurement baselines
---
.github/workflows/ci.yml | 70 ++++++-
genie/ci-workflow/measurements.ts | 185 +++++++++++++-----
.../ci-workflow-helpers.unit.test.ts | 9 +
3 files changed, 217 insertions(+), 47 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 1b5f79391..0b1da0efe 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2422,6 +2422,8 @@ jobs:
BASELINE_BRANCH: ${{ github.base_ref || github.ref_name }}
BASELINE_SEED_RUNS_JSON: '[{"runId":"25959801150","label":"PR #655","sha":"df0420cd0397ffc6928d3c6ccc9c23052d6bc255","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959802067","label":"PR #657","sha":"62833cba5d83b1c13462728edeafa684e61c006f","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959802958","label":"PR #656","sha":"21029998522a0e9435df151259611650fb948a20","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959803805","label":"PR #651","sha":"95515f971b27ef279e39c982f52e46cf9e8270e9","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959804678","label":"PR #654","sha":"58e96b9a2b87b3703de6920b6d9571f3805d0171","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959805512","label":"PR #653","sha":"d1cca16339f19d7e1a27b001edc4c2c7ecd13dc4","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959806473","label":"PR #652","sha":"acd6c63f5e235e7e5f2710fc62b2231e0ba904a6","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959807303","label":"PR #648","sha":"a5a07703ff951fb7396a40844e9491d88ed40edf","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959808097","label":"PR #649","sha":"360ff47c59a206064711dfcb6c610afd0e6b0d53","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959808775","label":"PR #647","sha":"8d1810b2c359ae95f245e56329018aab5020f8c0","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959809449","label":"PR #646","sha":"89e1396766ccd2a813680acd440cb78f540ca6c1","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959810069","label":"PR #643","sha":"239715520370436901a3f2218d162dc7b12f4b4c","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959810666","label":"PR #641","sha":"6b3751b4684ba45f496f1a1bff8b86ef6ba8275b","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959811321","label":"PR #640","sha":"fed50ae2502ac0a65395bbef5af43fcf384d5d04","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959811864","label":"PR #639","sha":"0e03df2c6f20e4d154f286fd69a4e2980d21a12d","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959812634","label":"PR #636","sha":"7efdbee4b571f2c80f5b6173bc9a84b51fbef5eb","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959813189","label":"PR #638","sha":"350d1b98baa943dcae63412eeffded7b5160bc8a","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959813761","label":"PR #637","sha":"f25336193b9f6b042eb027eca27acc4cc75a69d6","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959814335","label":"PR #634","sha":"4ba441d4ad8b6c49e9ee03d9cdfd2f04a129b714","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959814835","label":"PR #632","sha":"1ad5fd735c7f45ad5e07c8033e5b68a642ada69c","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."}]'
BASELINE_MAX_RUNS: '20'
+ BASELINE_MAX_CANDIDATE_RUNS: '60'
+ BASELINE_REQUIRED_OBSERVATIONS_JSON: '[{"id":"devenv.shell_eval_warm.duration","minSources":10},{"id":"devenv.tasks_list.duration","minSources":10},{"id":"devenv.processes_help.duration","minSources":10},{"id":"devenv.task_pnpm_install.duration","minSources":10},{"id":"devenv.task_genie_run.duration","minSources":10},{"id":"devenv.task_check_quick.duration","minSources":10},{"id":"devenv.genie_check_direct.duration","minSources":10}]'
run: |
set -euo pipefail
@@ -2442,13 +2444,25 @@ jobs:
workflow="${BASELINE_WORKFLOW_NAME:-CI}"
branch="${BASELINE_BRANCH:-${GITHUB_BASE_REF:-${GITHUB_REF_NAME:-main}}}"
seed_runs_file="$BASELINE_OUTPUT_DIR/baseline-seed-runs.json"
+ required_observations_file="$BASELINE_OUTPUT_DIR/baseline-required-observations.json"
printf '%s' "${BASELINE_SEED_RUNS_JSON:-[]}" >"$seed_runs_file"
+ printf '%s' "${BASELINE_REQUIRED_OBSERVATIONS_JSON:-[]}" >"$required_observations_file"
if ! jq -e 'if type == "array" then all(.[]; type == "object" and (.runId | type == "string")) else false end' \
"$seed_runs_file" >/dev/null; then
echo "::error::BASELINE_SEED_RUNS_JSON must be an array of objects with string runId fields"
exit 1
fi
+ if ! jq -e 'if type == "array" then all(.[]; type == "object" and (.id | type == "string") and (.minSources | type == "number")) else false end' \
+ "$required_observations_file" >/dev/null; then
+ echo "::error::BASELINE_REQUIRED_OBSERVATIONS_JSON must be an array of objects with string id and numeric minSources fields"
+ exit 1
+ fi
seed_run_ids="$(jq -r '.[].runId' "$seed_runs_file")"
+ required_observation_count="$(jq 'length' "$required_observations_file")"
+ max_candidate_runs="${BASELINE_MAX_CANDIDATE_RUNS:-${BASELINE_MAX_RUNS:-5}}"
+ if ! [[ "$max_candidate_runs" =~ ^[0-9]+$ ]] || [ "$max_candidate_runs" -lt 1 ]; then
+ max_candidate_runs=1
+ fi
candidate_runs="$(
"$GH_BIN" run list \
@@ -2458,7 +2472,7 @@ jobs:
--event push \
--status success \
--json databaseId,headSha \
- --limit 20 \
+ --limit "$max_candidate_runs" \
--jq '[.[] | select(.headSha != env.GITHUB_SHA) | .databaseId] | .[]'
)"
@@ -2470,6 +2484,45 @@ jobs:
max_runs=1
fi
+ write_baseline_observation_counts() { # Recounts baseline observations per id and writes {counts, required} to baseline-observation-counts.json.
+ local measurement_index="$BASELINE_OUTPUT_DIR/baseline-measurement-files.txt" # Sorted list of the measurements.json paths found below.
+ local counts_file="$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" # Output file, rewritten on every call.
+ find "$BASELINE_OUTPUT_DIR" \
+ -mindepth 2 \
+ -maxdepth 2 \
+ -name measurements.json \
+ -type f \
+ -print \
+ | sort >"$measurement_index" || true # Only files exactly two levels deep are indexed; a failing pipeline must not abort the script under set -euo pipefail.
+
+ if [ -s "$measurement_index" ]; then # Non-empty index: at least one measurements.json was found.
+ xargs -r jq -s \
+ --slurpfile required "$required_observations_file" \
+ '
+ ([.[] | (.observations // [])[]? | select(.value | type == "number") | .id] | sort | group_by(.) | map({id: .[0], sources: length})) as $counts
+ | ($required[0] // []) as $requiredRows
+ | {
+ counts: $counts,
+ required: (
+ $requiredRows
+ | map(. as $requiredRow | ($counts | map(select(.id == $requiredRow.id)) | .[0].sources // 0) as $actual | $requiredRow + {sources:$actual, satisfied:($actual >= $requiredRow.minSources)})
+ )
+ }
+ ' <"$measurement_index" >"$counts_file" # xargs reads the file paths from the index; sources = number of numeric-valued observation entries sharing an id across all slurped files.
+ else # No measurement files: report zero sources and mark every requirement unsatisfied.
+ jq -n --slurpfile required "$required_observations_file" \
+ '{counts: [], required: (($required[0] // []) | map(. + {sources:0, satisfied:false}))}' >"$counts_file"
+ fi
+ }
+
+ baseline_requirements_satisfied() { # Succeeds (status 0) only when every required observation has at least minSources baseline sources.
+ if [ "$required_observation_count" -eq 0 ]; then # No requirements configured: vacuously satisfied, so reaching max_runs still ends the download loop.
+ return 0
+ fi
+ write_baseline_observation_counts # Refresh the counts file from the artifacts downloaded so far.
+ jq -e '.required | all(.satisfied == true)' "$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" >/dev/null
+ }
+
run_id=""
artifact_name=""
artifact_id=""
@@ -2484,7 +2537,14 @@ jobs:
if grep -qxF "$candidate_run" "$seen_runs_file"; then
continue
fi
- if [ "$(wc -l <"$downloaded_runs_file" | tr -d ' ')" -ge "$max_runs" ]; then
+ downloaded_count="$(wc -l <"$downloaded_runs_file" | tr -d ' ')"
+ if [ "$downloaded_count" -ge "$max_runs" ]; then
+ if baseline_requirements_satisfied; then
+ break
+ fi
+ echo "::notice::downloaded $downloaded_count baseline artifact(s), but required observation counts are not satisfied yet; continuing through bounded candidate history"
+ fi
+ if [ "$(wc -l <"$seen_runs_file" | tr -d ' ')" -ge "$max_candidate_runs" ]; then
break
fi
printf '%s\n' "$candidate_run" >>"$seen_runs_file"
@@ -2526,6 +2586,8 @@ jobs:
fi
done
+ write_baseline_observation_counts
+
if [ -z "$run_id" ] || [ -z "$artifact_name" ]; then
echo "::notice::no successful baseline run found for $repo workflow=$workflow branch=$branch"
exit 0
@@ -2534,6 +2596,7 @@ jobs:
jq -n \
--slurpfile runs "$downloaded_runs_file" \
--slurpfile seedRuns "$seed_runs_file" \
+ --slurpfile observationCounts "$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" \
--argjson schemaVersion 1 \
--arg repository "$repo" \
--arg workflow "$workflow" \
@@ -2551,7 +2614,8 @@ jobs:
artifactName: $artifactName,
artifactId: $artifactId,
seedRuns: ($seedRuns[0] // []),
- runs: $runs
+ runs: $runs,
+ observationCounts: ($observationCounts[0] // null)
}' >"$BASELINE_OUTPUT_DIR/baseline-provenance.json"
echo "Downloaded $(wc -l <"$downloaded_runs_file" | tr -d ' ') baseline artifact(s), latest $artifact_name from run $run_id into $BASELINE_OUTPUT_DIR"
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index 918f5a243..3aa60e761 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -82,6 +82,8 @@ export type GitHubPreviousArtifactStepOptions = {
readonly seedRuns?: readonly CiMeasurementBaselineSeedRun[]
readonly seedRunIds?: readonly string[]
readonly maxRuns?: number
+ readonly maxCandidateRuns?: number
+ readonly requiredObservations?: readonly CiMeasurementRequiredBaselineObservation[]
readonly tokenExpression?: string
}
@@ -94,6 +96,11 @@ export type CiMeasurementBaselineSeedRun = {
readonly notes?: string
}
+export type CiMeasurementRequiredBaselineObservation = { // Baseline coverage requirement for a single observation id.
+ readonly id: string // Observation id matched against the ids found in downloaded baseline measurements.json files.
+ readonly minSources: number // Minimum count of numeric baseline observations with this id for the requirement to count as satisfied.
+}
+
export type CiMeasurementsComparisonStepOptions = {
readonly currentDir?: string
readonly baselineDir?: string
@@ -186,6 +193,7 @@ export type DevenvPerfJobOptions = {
readonly baselineSeedRuns?: readonly CiMeasurementBaselineSeedRun[]
readonly baselineSeedRunIds?: readonly string[]
readonly baselineMaxRuns?: number
+ readonly baselineMaxCandidateRuns?: number
readonly setupSteps?: readonly DevenvPerfSetupStep[]
readonly env?: Record
readonly taskProbes?: readonly DevenvPerfTaskProbe[]
@@ -270,50 +278,66 @@ const defaultDevenvPerfTaskProbe = (probe: DevenvPerfTaskProbe): DevenvPerfProbe
}
}
+const devenvPerfProbes = ( // Builds the probe list: four built-in devenv probes, then the caller's task probes, then the caller's custom probes.
+ opts: Required>,
+): readonly DevenvPerfProbe[] => [
+ {
+ id: 'shell_eval_traced',
+ label: 'Shell eval with OTEL trace',
+ group: 'devenv shell',
+ description: 'Evaluates the dev shell with native devenv JSON tracing enabled.',
+ command: [
+ '$DEVENV_SHELL_TRACE_COMMAND',
+ ],
+ traceOutput: '$ARTIFACT_DIR/traces/shell_eval_traced.json',
+ },
+ {
+ id: 'shell_eval_warm',
+ label: 'Warm shell eval',
+ group: 'devenv shell',
+ description: 'Evaluates a warm dev shell without reloading direnv state.',
+ warmupRepetitions: 1,
+ repetitions: 5,
+ command: ['$DEVENV_BIN', 'shell', '--no-reload', '--', 'true'],
+ },
+ {
+ id: 'tasks_list',
+ label: 'devenv tasks list',
+ group: 'devenv cli',
+ description: 'Lists devenv tasks to measure task graph loading overhead.',
+ warmupRepetitions: 1,
+ repetitions: 9,
+ command: ['$DEVENV_BIN', 'tasks', 'list'],
+ },
+ {
+ id: 'processes_help',
+ label: 'devenv processes --help',
+ group: 'devenv cli',
+ description: 'Loads the devenv processes command help path.',
+ warmupRepetitions: 1,
+ repetitions: 9,
+ command: ['$DEVENV_BIN', 'processes', '--help'],
+ },
+ ...opts.taskProbes.map(defaultDevenvPerfTaskProbe), // Task probes are mapped through defaultDevenvPerfTaskProbe before being appended.
+ ...opts.probes, // Caller-supplied probes come last.
+]
+
+// Derives the baseline coverage requirements for the devenv perf job: one
+// {id, minSources} entry per probe whose gate policy is enabled (enabled
+// defaults to true, minBaselineSources defaults to 1).
+const devenvPerfRequiredBaselineObservations = (
+  probes: readonly DevenvPerfProbe[],
+): readonly CiMeasurementRequiredBaselineObservation[] =>
+  probes.flatMap((probe) => {
+    const policy = devenvPerfGatePolicy(probe) // Resolve the gate policy once per probe.
+    if (!(policy.enabled ?? true)) return [] // Probes with gating disabled need no baseline coverage.
+    return [{ id: `devenv.${probe.id}.duration`, minSources: policy.minBaselineSources ?? 1 }]
+  })
+
const renderDevenvPerfScript = (
opts: Required>,
) => {
- const probes: readonly DevenvPerfProbe[] = [
- {
- id: 'shell_eval_traced',
- label: 'Shell eval with OTEL trace',
- group: 'devenv shell',
- description: 'Evaluates the dev shell with native devenv JSON tracing enabled.',
- command: [
- '$DEVENV_SHELL_TRACE_COMMAND',
- ],
- traceOutput: '$ARTIFACT_DIR/traces/shell_eval_traced.json',
- },
- {
- id: 'shell_eval_warm',
- label: 'Warm shell eval',
- group: 'devenv shell',
- description: 'Evaluates a warm dev shell without reloading direnv state.',
- warmupRepetitions: 1,
- repetitions: 5,
- command: ['$DEVENV_BIN', 'shell', '--no-reload', '--', 'true'],
- },
- {
- id: 'tasks_list',
- label: 'devenv tasks list',
- group: 'devenv cli',
- description: 'Lists devenv tasks to measure task graph loading overhead.',
- warmupRepetitions: 1,
- repetitions: 9,
- command: ['$DEVENV_BIN', 'tasks', 'list'],
- },
- {
- id: 'processes_help',
- label: 'devenv processes --help',
- group: 'devenv cli',
- description: 'Loads the devenv processes command help path.',
- warmupRepetitions: 1,
- repetitions: 9,
- command: ['$DEVENV_BIN', 'processes', '--help'],
- },
- ...opts.taskProbes.map(defaultDevenvPerfTaskProbe),
- ...opts.probes,
- ]
+ const probes = devenvPerfProbes(opts)
return String.raw`set -euo pipefail
@@ -695,6 +719,9 @@ const ciMeasurementBaselineSeedRunsJson = (opts: GitHubPreviousArtifactStepOptio
[],
)
+const ciMeasurementRequiredObservationsJson = (opts: GitHubPreviousArtifactStepOptions) =>
+ JSON.stringify(opts.requiredObservations ?? [])
+
export const downloadPreviousGitHubArtifactStep = (opts: GitHubPreviousArtifactStepOptions) =>
({
name: `Download previous artifact: ${opts.artifactName}`,
@@ -707,6 +734,8 @@ export const downloadPreviousGitHubArtifactStep = (opts: GitHubPreviousArtifactS
BASELINE_BRANCH: opts.branch ?? '${{ github.base_ref || github.ref_name }}',
BASELINE_SEED_RUNS_JSON: ciMeasurementBaselineSeedRunsJson(opts),
BASELINE_MAX_RUNS: String(opts.maxRuns ?? 5),
+ BASELINE_MAX_CANDIDATE_RUNS: String(opts.maxCandidateRuns ?? Math.max((opts.maxRuns ?? 5) * 3, 20)),
+ BASELINE_REQUIRED_OBSERVATIONS_JSON: ciMeasurementRequiredObservationsJson(opts),
},
run: String.raw`set -euo pipefail
@@ -727,13 +756,25 @@ repo="${dollar}{GITHUB_REPOSITORY:?GITHUB_REPOSITORY not set}"
workflow="${dollar}{BASELINE_WORKFLOW_NAME:-CI}"
branch="${dollar}{BASELINE_BRANCH:-${dollar}{GITHUB_BASE_REF:-${dollar}{GITHUB_REF_NAME:-main}}}"
seed_runs_file="$BASELINE_OUTPUT_DIR/baseline-seed-runs.json"
+required_observations_file="$BASELINE_OUTPUT_DIR/baseline-required-observations.json"
printf '%s' "${dollar}{BASELINE_SEED_RUNS_JSON:-[]}" >"$seed_runs_file"
+printf '%s' "${dollar}{BASELINE_REQUIRED_OBSERVATIONS_JSON:-[]}" >"$required_observations_file"
if ! jq -e 'if type == "array" then all(.[]; type == "object" and (.runId | type == "string")) else false end' \
"$seed_runs_file" >/dev/null; then
echo "::error::BASELINE_SEED_RUNS_JSON must be an array of objects with string runId fields"
exit 1
fi
+if ! jq -e 'if type == "array" then all(.[]; type == "object" and (.id | type == "string") and (.minSources | type == "number")) else false end' \
+ "$required_observations_file" >/dev/null; then
+ echo "::error::BASELINE_REQUIRED_OBSERVATIONS_JSON must be an array of objects with string id and numeric minSources fields"
+ exit 1
+fi
seed_run_ids="$(jq -r '.[].runId' "$seed_runs_file")"
+required_observation_count="$(jq 'length' "$required_observations_file")"
+max_candidate_runs="${dollar}{BASELINE_MAX_CANDIDATE_RUNS:-${dollar}{BASELINE_MAX_RUNS:-5}}"
+if ! [[ "$max_candidate_runs" =~ ^[0-9]+$ ]] || [ "$max_candidate_runs" -lt 1 ]; then
+ max_candidate_runs=1
+fi
candidate_runs="$(
"$GH_BIN" run list \
@@ -743,7 +784,7 @@ candidate_runs="$(
--event push \
--status success \
--json databaseId,headSha \
- --limit 20 \
+ --limit "$max_candidate_runs" \
--jq '[.[] | select(.headSha != env.GITHUB_SHA) | .databaseId] | .[]'
)"
@@ -755,6 +796,45 @@ if ! [[ "$max_runs" =~ ^[0-9]+$ ]] || [ "$max_runs" -lt 1 ]; then
max_runs=1
fi
+write_baseline_observation_counts() { # Recounts baseline observations per id and writes {counts, required} to baseline-observation-counts.json.
+ local measurement_index="$BASELINE_OUTPUT_DIR/baseline-measurement-files.txt" # Sorted list of the measurements.json paths found below.
+ local counts_file="$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" # Output file, rewritten on every call.
+ find "$BASELINE_OUTPUT_DIR" \
+ -mindepth 2 \
+ -maxdepth 2 \
+ -name measurements.json \
+ -type f \
+ -print \
+ | sort >"$measurement_index" || true # Only files exactly two levels deep are indexed; a failing pipeline must not abort the script under set -euo pipefail.
+
+ if [ -s "$measurement_index" ]; then # Non-empty index: at least one measurements.json was found.
+ xargs -r jq -s \
+ --slurpfile required "$required_observations_file" \
+ '
+ ([.[] | (.observations // [])[]? | select(.value | type == "number") | .id] | sort | group_by(.) | map({id: .[0], sources: length})) as $counts
+ | ($required[0] // []) as $requiredRows
+ | {
+ counts: $counts,
+ required: (
+ $requiredRows
+ | map(. as $requiredRow | ($counts | map(select(.id == $requiredRow.id)) | .[0].sources // 0) as $actual | $requiredRow + {sources:$actual, satisfied:($actual >= $requiredRow.minSources)})
+ )
+ }
+ ' <"$measurement_index" >"$counts_file" # xargs reads the file paths from the index; sources = number of numeric-valued observation entries sharing an id across all slurped files.
+ else # No measurement files: report zero sources and mark every requirement unsatisfied.
+ jq -n --slurpfile required "$required_observations_file" \
+ '{counts: [], required: (($required[0] // []) | map(. + {sources:0, satisfied:false}))}' >"$counts_file"
+ fi
+}
+
+baseline_requirements_satisfied() { # Succeeds (status 0) only when every required observation has at least minSources baseline sources.
+ if [ "$required_observation_count" -eq 0 ]; then # No requirements configured: vacuously satisfied, so reaching max_runs still ends the download loop.
+ return 0
+ fi
+ write_baseline_observation_counts # Refresh the counts file from the artifacts downloaded so far.
+ jq -e '.required | all(.satisfied == true)' "$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" >/dev/null
+}
+
run_id=""
artifact_name=""
artifact_id=""
@@ -769,7 +849,14 @@ for candidate_run in $candidate_runs; do
if grep -qxF "$candidate_run" "$seen_runs_file"; then
continue
fi
- if [ "$(wc -l <"$downloaded_runs_file" | tr -d ' ')" -ge "$max_runs" ]; then
+ downloaded_count="$(wc -l <"$downloaded_runs_file" | tr -d ' ')"
+ if [ "$downloaded_count" -ge "$max_runs" ]; then
+ if baseline_requirements_satisfied; then
+ break
+ fi
+ echo "::notice::downloaded $downloaded_count baseline artifact(s), but required observation counts are not satisfied yet; continuing through bounded candidate history"
+ fi
+ if [ "$(wc -l <"$seen_runs_file" | tr -d ' ')" -ge "$max_candidate_runs" ]; then
break
fi
printf '%s\n' "$candidate_run" >>"$seen_runs_file"
@@ -811,6 +898,8 @@ for candidate_run in $candidate_runs; do
fi
done
+write_baseline_observation_counts
+
if [ -z "$run_id" ] || [ -z "$artifact_name" ]; then
echo "::notice::no successful baseline run found for $repo workflow=$workflow branch=$branch"
exit 0
@@ -819,6 +908,7 @@ fi
jq -n \
--slurpfile runs "$downloaded_runs_file" \
--slurpfile seedRuns "$seed_runs_file" \
+ --slurpfile observationCounts "$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" \
--argjson schemaVersion 1 \
--arg repository "$repo" \
--arg workflow "$workflow" \
@@ -836,7 +926,8 @@ jq -n \
artifactName: $artifactName,
artifactId: $artifactId,
seedRuns: ($seedRuns[0] // []),
- runs: $runs
+ runs: $runs,
+ observationCounts: ($observationCounts[0] // null)
}' >"$BASELINE_OUTPUT_DIR/baseline-provenance.json"
echo "Downloaded $(wc -l <"$downloaded_runs_file" | tr -d ' ') baseline artifact(s), latest $artifact_name from run $run_id into $BASELINE_OUTPUT_DIR"
@@ -1924,6 +2015,10 @@ export const devenvPerfJob = (opts?: DevenvPerfJobOptions) => {
opts?.artifactName ??
'devenv-perf-${{ github.job }}-${{ github.run_id }}-attempt-${{ github.run_attempt }}'
const baselineArtifactName = opts?.baselineArtifactName ?? opts?.artifactName
+ const probes = devenvPerfProbes({
+ taskProbes: opts?.taskProbes ?? [],
+ probes: opts?.probes ?? [],
+ })
return {
'runs-on': opts?.runsOn ?? linuxX64Runner,
@@ -1952,6 +2047,8 @@ export const devenvPerfJob = (opts?: DevenvPerfJobOptions) => {
seedRuns: opts?.baselineSeedRuns,
seedRunIds: opts?.baselineSeedRunIds,
maxRuns: opts?.baselineMaxRuns,
+ maxCandidateRuns: opts?.baselineMaxCandidateRuns,
+ requiredObservations: devenvPerfRequiredBaselineObservations(probes),
}),
]),
devenvPerfBenchmarkStep({
diff --git a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
index 07e71845b..524691c03 100644
--- a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
+++ b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
@@ -458,6 +458,15 @@ describe('ci workflow devenv perf helpers', () => {
'CI_MEASUREMENT_PR_COMMENT_TITLE: Devenv Performance',
)
expect(generatedCiWorkflowYamlSource).toContain('BASELINE_SEED_RUNS_JSON:')
+ expect(generatedCiWorkflowYamlSource).toContain('BASELINE_REQUIRED_OBSERVATIONS_JSON:')
+ expect(generatedCiWorkflowYamlSource).toContain('BASELINE_MAX_CANDIDATE_RUNS:')
+ expect(generatedCiWorkflowYamlSource).toContain('"id":"devenv.task_check_quick.duration"')
+ expect(ciWorkflowSource).toContain(
+ 'requiredObservations?: readonly CiMeasurementRequiredBaselineObservation[]',
+ )
+ expect(ciWorkflowSource).toContain('baselineMaxCandidateRuns?: number')
+ expect(ciWorkflowSource).toContain('baseline_requirements_satisfied')
+ expect(ciWorkflowSource).toContain('observationCounts: ($observationCounts[0] // null)')
expect(generatedCiWorkflowYamlSource).toContain('"runId":"25959801150"')
expect(generatedCiWorkflowYamlSource).toContain('"runId":"25959814835"')
expect(generatedCiWorkflowYamlSource).toContain('"label":"PR #655"')
From 50278258adf1bd197150a0677adc5f53986dacd3 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Sun, 17 May 2026 02:54:23 +0200
Subject: [PATCH 25/81] Clarify measurement comment interpretation
---
.github/workflows/ci.yml | 145 +++++++++++++-----
genie/ci-workflow/measurements.ts | 145 +++++++++++++-----
.../ci-workflow-helpers.unit.test.ts | 21 ++-
3 files changed, 229 insertions(+), 82 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 0b1da0efe..3804870c8 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -3506,18 +3506,79 @@ jobs:
return formatNumber(Math.round((value - 1) * 1000) / 10) + '%'
}
- const formatResult = (row) => {
- if (row.confidence === 'low_baseline_count') return 'gray needs baseline'
- if (row.confidence === 'low_current_sample_count') return 'gray needs repeat'
- if (row.confidence === 'diagnostic') return 'gray diagnostic'
- if (row.status === 'fail') return 'red regression'
- if (row.status === 'warn') return 'yellow regression'
- if (row.status === 'missing_baseline') return 'gray no baseline'
- if (row.confidence === 'noise_floor') return 'gray noise floor'
- if (row.confidence === 'within_baseline_range') return 'gray within range'
- if (row.confidence === 'within_baseline_distribution') return 'gray within p95'
- if (row.direction === 'improved') return 'green improved'
- return 'gray unchanged'
+ const interpretation = (row) => { // Maps a comparison row to {label, detail, tone, color}; rules are checked top to bottom and the first match wins.
+ if (row.confidence === 'low_baseline_count') return { // Evidence-quality and diagnostic verdicts are checked before the fail/warn status.
+ label: 'Needs more baseline',
+ detail: 'Not enough compatible baseline runs to make this gate trustworthy.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
+ if (row.confidence === 'low_current_sample_count') return {
+ label: 'Needs repeat',
+ detail: 'Current run has too few successful measured samples.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
+ if (row.confidence === 'diagnostic') return {
+ label: 'Diagnostic only',
+ detail: 'Shown for investigation, but intentionally excluded from gating.',
+ tone: 'diagnostic',
+ color: '#a78bfa',
+ }
+ if (row.status === 'fail') return {
+ label: 'Regression - blocks merge',
+ detail: 'Slower than the configured fail threshold with enough samples.',
+ tone: 'bad',
+ color: '#ef4444',
+ }
+ if (row.status === 'warn') return {
+ label: 'Regression - review',
+ detail: 'Slower than the configured warning threshold.',
+ tone: 'warn',
+ color: '#f59e0b',
+ }
+ if (row.status === 'missing_baseline') return {
+ label: 'No baseline yet',
+ detail: 'Current value is measured, but no comparable baseline exists.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
+ if (row.confidence === 'noise_floor') return {
+ label: 'Too small to matter',
+ detail: 'The absolute change is below the noise floor for this metric.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
+ if (row.confidence === 'within_baseline_range') return {
+ label: 'Normal variance',
+ detail: 'Current value is inside the observed baseline range.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
+ if (row.confidence === 'within_baseline_distribution') return {
+ label: 'Within historical p95',
+ detail: 'Current value is inside the historical baseline distribution.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
+ if (row.direction === 'improved') return {
+ label: 'Meaningfully faster',
+ detail: 'Faster than baseline by more than the noise floor and outside normal range.',
+ tone: 'good',
+ color: '#10b981',
+ }
+ if (row.direction === 'regressed') return { // Reached only when no status or confidence rule above matched.
+ label: 'Slightly slower, ok',
+ detail: 'Slower than baseline but still inside the configured budget.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
+ return { // Fallback when none of the rules above matched.
+ label: 'Unchanged',
+ detail: 'No meaningful movement from baseline.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
}
const formatGate = (row) => {
@@ -3609,19 +3670,20 @@ jobs:
const comparisonTable = (rows) => {
if (rows.length === 0) return 'No measurement regressions detected.'
return [
- '| Probe | Baseline | Current | Change | Result | Gate | Confidence |',
+ '| Probe | Baseline | Current | Change | Meaning | Gate | Evidence |',
'| --- | ---: | ---: | ---: | --- | --- | --- |',
...rows.map((row) => {
const unit = row.observation?.unit
const baselineRange = typeof row.baselineMin === 'number' && typeof row.baselineMax === 'number' && row.baselineMin !== row.baselineMax
? '
range ' + formatValue(row.baselineMin, unit) + ' - ' + formatValue(row.baselineMax, unit) + ''
: ''
+ const meaning = interpretation(row)
return '| ' + [
humanProbe(row),
formatValue(row.baseline, unit) + baselineRange,
formatValue(row.current, unit),
formatDelta(row.delta, unit) + ' / ' + formatRatio(row.ratio),
- formatResult(row),
+ meaning.label + '
' + meaning.detail + '',
formatGate(row),
(row.confidence || 'unknown') + '
baseline n=' + (row.baselineSources ?? 0) + ', current samples=' + (row.currentSamples ?? 1) + '',
].map(escapeCell).join(' | ') + ' |'
@@ -3684,27 +3746,31 @@ jobs:
const lower = Math.floor(minPct)
const upper = Math.ceil(maxPct)
const span = upper - lower || 1
- const width = 900
- const rowHeight = 42
- const height = 96 + chartRows.length * rowHeight + 34
- const labelX = 238
- const plotX = 260
- const plotWidth = 342
- const percentX = 626
- const nominalX = 704
- const topY = 78
+ const width = 1040
+ const rowHeight = 46
+ const height = 112 + chartRows.length * rowHeight + 34
+ const labelX = 230
+ const plotX = 252
+ const plotWidth = 320
+ const percentX = 596
+ const nominalX = 672
+ const meaningX = 804
+ const topY = 92
const barHeight = 18
const zeroX = plotX + ((0 - lower) / span) * plotWidth
const svg = [
'',
'',
)
return svg.join('\n')
@@ -3754,6 +3824,7 @@ jobs:
status: row.status,
target: row.target?.label || row.target?.name || 'unknown',
observation: row.observation?.label || row.observation?.name || 'unknown',
+ meaning: interpretation(row).label,
dimensions: dimensions(row).replaceAll('
', ', '),
baseline: formatValue(row.baseline, row.observation?.unit),
current: formatValue(row.current, row.observation?.unit),
@@ -3778,7 +3849,7 @@ jobs:
const historyRows = state.runs.slice(1).map((run) => {
const link = run.runUrl ? '[' + run.shortSha + '](' + run.runUrl + ')' : run.shortSha
const top = Array.isArray(run.visibleRows) && run.visibleRows.length > 0
- ? run.visibleRows.slice(0, 3).map((row) => row.status + ' ' + row.target + ' ' + row.observation + ' ' + row.delta + ' / ' + row.ratio).join('
')
+ ? run.visibleRows.slice(0, 3).map((row) => (row.meaning || row.status) + ' ' + row.target + ' ' + row.observation + ' ' + row.delta + ' / ' + row.ratio).join('
')
: 'No regressions'
return '| ' + [link, run.status, gateModeLabel(run.mode), top].map(escapeCell).join(' | ') + ' |'
})
@@ -3805,7 +3876,7 @@ jobs:
'- Protocol: ' + protocolLabel,
'',
hasComparableBaseline
- ? 'Chart: performance change versus baseline median. Green is faster, red is slower, gray is within noise or baseline range.'
+ ? 'Chart: bars show percentage change; the meaning labels explain whether the movement is actionable, noise, normal variance, or diagnostic.'
: 'No compatible baseline was available, so this run shows current measurements only.',
'',
chartMarkdown,
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index 3aa60e761..ffe388d81 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -1637,18 +1637,79 @@ const formatRatio = (value) => {
return formatNumber(Math.round((value - 1) * 1000) / 10) + '%'
}
-const formatResult = (row) => {
- if (row.confidence === 'low_baseline_count') return 'gray needs baseline'
- if (row.confidence === 'low_current_sample_count') return 'gray needs repeat'
- if (row.confidence === 'diagnostic') return 'gray diagnostic'
- if (row.status === 'fail') return 'red regression'
- if (row.status === 'warn') return 'yellow regression'
- if (row.status === 'missing_baseline') return 'gray no baseline'
- if (row.confidence === 'noise_floor') return 'gray noise floor'
- if (row.confidence === 'within_baseline_range') return 'gray within range'
- if (row.confidence === 'within_baseline_distribution') return 'gray within p95'
- if (row.direction === 'improved') return 'green improved'
- return 'gray unchanged'
+const interpretation = (row) => { // Maps a comparison row to {label, detail, tone, color}; rules are checked top to bottom and the first match wins.
+ if (row.confidence === 'low_baseline_count') return { // Evidence-quality and diagnostic verdicts are checked before the fail/warn status.
+ label: 'Needs more baseline',
+ detail: 'Not enough compatible baseline runs to make this gate trustworthy.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
+ if (row.confidence === 'low_current_sample_count') return {
+ label: 'Needs repeat',
+ detail: 'Current run has too few successful measured samples.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
+ if (row.confidence === 'diagnostic') return {
+ label: 'Diagnostic only',
+ detail: 'Shown for investigation, but intentionally excluded from gating.',
+ tone: 'diagnostic',
+ color: '#a78bfa',
+ }
+ if (row.status === 'fail') return {
+ label: 'Regression - blocks merge',
+ detail: 'Slower than the configured fail threshold with enough samples.',
+ tone: 'bad',
+ color: '#ef4444',
+ }
+ if (row.status === 'warn') return {
+ label: 'Regression - review',
+ detail: 'Slower than the configured warning threshold.',
+ tone: 'warn',
+ color: '#f59e0b',
+ }
+ if (row.status === 'missing_baseline') return {
+ label: 'No baseline yet',
+ detail: 'Current value is measured, but no comparable baseline exists.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
+ if (row.confidence === 'noise_floor') return {
+ label: 'Too small to matter',
+ detail: 'The absolute change is below the noise floor for this metric.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
+ if (row.confidence === 'within_baseline_range') return {
+ label: 'Normal variance',
+ detail: 'Current value is inside the observed baseline range.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
+ if (row.confidence === 'within_baseline_distribution') return {
+ label: 'Within historical p95',
+ detail: 'Current value is inside the historical baseline distribution.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
+ if (row.direction === 'improved') return {
+ label: 'Meaningfully faster',
+ detail: 'Faster than baseline by more than the noise floor and outside normal range.',
+ tone: 'good',
+ color: '#10b981',
+ }
+ if (row.direction === 'regressed') return { // Reached only when no status or confidence rule above matched.
+ label: 'Slightly slower, ok',
+ detail: 'Slower than baseline but still inside the configured budget.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
+ return { // Fallback when none of the rules above matched.
+ label: 'Unchanged',
+ detail: 'No meaningful movement from baseline.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
}
const formatGate = (row) => {
@@ -1740,19 +1801,20 @@ const visibleRows = (hasComparableBaseline
const comparisonTable = (rows) => {
if (rows.length === 0) return 'No measurement regressions detected.'
return [
- '| Probe | Baseline | Current | Change | Result | Gate | Confidence |',
+ '| Probe | Baseline | Current | Change | Meaning | Gate | Evidence |',
'| --- | ---: | ---: | ---: | --- | --- | --- |',
...rows.map((row) => {
const unit = row.observation?.unit
const baselineRange = typeof row.baselineMin === 'number' && typeof row.baselineMax === 'number' && row.baselineMin !== row.baselineMax
? '
range ' + formatValue(row.baselineMin, unit) + ' - ' + formatValue(row.baselineMax, unit) + ''
: ''
+ const meaning = interpretation(row)
return '| ' + [
humanProbe(row),
formatValue(row.baseline, unit) + baselineRange,
formatValue(row.current, unit),
formatDelta(row.delta, unit) + ' / ' + formatRatio(row.ratio),
- formatResult(row),
+ meaning.label + '
' + meaning.detail + '',
formatGate(row),
(row.confidence || 'unknown') + '
baseline n=' + (row.baselineSources ?? 0) + ', current samples=' + (row.currentSamples ?? 1) + '',
].map(escapeCell).join(' | ') + ' |'
@@ -1815,27 +1877,31 @@ const renderPerfChangeSvg = (rows) => {
const lower = Math.floor(minPct)
const upper = Math.ceil(maxPct)
const span = upper - lower || 1
- const width = 900
- const rowHeight = 42
- const height = 96 + chartRows.length * rowHeight + 34
- const labelX = 238
- const plotX = 260
- const plotWidth = 342
- const percentX = 626
- const nominalX = 704
- const topY = 78
+ const width = 1040
+ const rowHeight = 46
+ const height = 112 + chartRows.length * rowHeight + 34
+ const labelX = 230
+ const plotX = 252
+ const plotWidth = 320
+ const percentX = 596
+ const nominalX = 672
+ const meaningX = 804
+ const topY = 92
const barHeight = 18
const zeroX = plotX + ((0 - lower) / span) * plotWidth
const svg = [
'',
'',
)
return svg.join('\n')
@@ -1885,6 +1955,7 @@ const currentRun = {
status: row.status,
target: row.target?.label || row.target?.name || 'unknown',
observation: row.observation?.label || row.observation?.name || 'unknown',
+ meaning: interpretation(row).label,
dimensions: dimensions(row).replaceAll('
', ', '),
baseline: formatValue(row.baseline, row.observation?.unit),
current: formatValue(row.current, row.observation?.unit),
@@ -1909,7 +1980,7 @@ const gateModeLabel = (mode) => {
const historyRows = state.runs.slice(1).map((run) => {
const link = run.runUrl ? '[' + run.shortSha + '](' + run.runUrl + ')' : run.shortSha
const top = Array.isArray(run.visibleRows) && run.visibleRows.length > 0
- ? run.visibleRows.slice(0, 3).map((row) => row.status + ' ' + row.target + ' ' + row.observation + ' ' + row.delta + ' / ' + row.ratio).join('
')
+ ? run.visibleRows.slice(0, 3).map((row) => (row.meaning || row.status) + ' ' + row.target + ' ' + row.observation + ' ' + row.delta + ' / ' + row.ratio).join('
')
: 'No regressions'
return '| ' + [link, run.status, gateModeLabel(run.mode), top].map(escapeCell).join(' | ') + ' |'
})
@@ -1936,7 +2007,7 @@ const summaryLines = [
'- Protocol: ' + protocolLabel,
'',
hasComparableBaseline
- ? 'Chart: performance change versus baseline median. Green is faster, red is slower, gray is within noise or baseline range.'
+ ? 'Chart: bars show percentage change; the meaning labels explain whether the movement is actionable, noise, normal variance, or diagnostic.'
: 'No compatible baseline was available, so this run shows current measurements only.',
'',
chartMarkdown,
diff --git a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
index 524691c03..2ebdd423b 100644
--- a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
+++ b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
@@ -429,12 +429,11 @@ describe('ci workflow devenv perf helpers', () => {
expect(generatedCiWorkflowYamlSource).toContain(
'if ($gateable and $confidence == "threshold_exceeded") then $thresholdStatus',
)
- expect(ciWorkflowSource).toContain(
- "if (row.confidence === 'low_baseline_count') return 'gray needs baseline'",
- )
- expect(ciWorkflowSource).toContain(
- "if (row.confidence === 'low_current_sample_count') return 'gray needs repeat'",
- )
+ expect(ciWorkflowSource).toContain("label: 'Needs more baseline'")
+ expect(ciWorkflowSource).toContain("label: 'Needs repeat'")
+ expect(ciWorkflowSource).toContain("label: 'Too small to matter'")
+ expect(ciWorkflowSource).toContain("label: 'Normal variance'")
+ expect(ciWorkflowSource).toContain("label: 'Meaningfully faster'")
expect(generatedCiWorkflowYamlSource).toContain('RUNNER_CLASS:')
expect(generatedCiWorkflowYamlSource).toContain('namespace-profile-linux-x86-64')
expect(ciWorkflowSource).toContain('nix.closure.nar_size')
@@ -485,11 +484,17 @@ describe('ci workflow devenv perf helpers', () => {
)
expect(ciWorkflowSource).toContain('chart_file="$comment_tmp_dir/perf-change-vs-baseline.svg"')
expect(ciWorkflowSource).toContain(
- 'Chart: performance change versus baseline median. Green is faster, red is slower, gray is within noise or baseline range.',
+ 'Chart: bars show percentage change; the meaning labels explain whether the movement is actionable, noise, normal variance, or diagnostic.',
+ )
+ expect(ciWorkflowSource).toContain(
+ '| Probe | Baseline | Current | Change | Meaning | Gate | Evidence |',
)
expect(ciWorkflowSource).toContain("'- Readiness: ' + readinessLabel")
expect(ciWorkflowSource).toContain('renderPerfChangeSvg')
- expect(ciWorkflowSource).toContain('Perf change vs baseline (%)')
+ expect(ciWorkflowSource).toContain('Perf change vs baseline')
+ expect(ciWorkflowSource).toContain(
+ 'Bars show percent change; meaning explains whether the number is actionable.',
+ )
expect(ciWorkflowSource).toContain('![Perf change vs baseline chart]')
expect(ciWorkflowSource).toContain('https://raw.githubusercontent.com')
expect(ciWorkflowSource).toContain('gh api "repos/$repo/contents/$asset_path"')
From 1c9b55423731e7e709c5b1961dc3273942ab6efa Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Mon, 18 May 2026 02:54:09 +0200
Subject: [PATCH 26/81] Render CI measurement chart as PNG
---
.github/workflows/ci.yml | 67 ++++++++++++++-----
genie/ci-workflow/measurements.ts | 67 ++++++++++++++-----
.../ci-workflow-helpers.unit.test.ts | 11 ++-
3 files changed, 110 insertions(+), 35 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 3804870c8..b4f6ad60c 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -3407,6 +3407,7 @@ jobs:
comment_body="$comment_tmp_dir/comment.md"
comment_id_file="$comment_tmp_dir/comment-id.txt"
chart_file="$comment_tmp_dir/perf-change-vs-baseline.svg"
+ chart_png_file="$comment_tmp_dir/perf-change-vs-baseline.png"
renderer_script="$comment_tmp_dir/render-ci-measurement-comment.mjs"
if ! gh api "repos/$repo/issues/$pr_number/comments" --paginate >"$comments_json"; then
@@ -3423,13 +3424,17 @@ jobs:
asset_head_sha="${GITHUB_HEAD_SHA:-${GITHUB_SHA:-unknown}}"
asset_run_id="${GITHUB_RUN_ID:-local}"
asset_run_attempt="${GITHUB_RUN_ATTEMPT:-0}"
- asset_path="ci-measurements/pr-$pr_number/${asset_head_sha}/run-${asset_run_id}-attempt-${asset_run_attempt}/${asset_title}.svg"
+ asset_svg_path="ci-measurements/pr-$pr_number/${asset_head_sha}/run-${asset_run_id}-attempt-${asset_run_attempt}/${asset_title}.svg"
+ asset_png_path="ci-measurements/pr-$pr_number/${asset_head_sha}/run-${asset_run_id}-attempt-${asset_run_attempt}/${asset_title}.png"
if [ "${GITHUB_SERVER_URL:-https://github.com}" = "https://github.com" ]; then
- chart_url="https://raw.githubusercontent.com/$repo/$asset_branch/$asset_path"
+ chart_url="https://raw.githubusercontent.com/$repo/$asset_branch/$asset_png_path"
+ chart_source_url="https://raw.githubusercontent.com/$repo/$asset_branch/$asset_svg_path"
else
- chart_url="${GITHUB_SERVER_URL:-https://github.com}/$repo/raw/$asset_branch/$asset_path"
+ chart_url="${GITHUB_SERVER_URL:-https://github.com}/$repo/raw/$asset_branch/$asset_png_path"
+ chart_source_url="${GITHUB_SERVER_URL:-https://github.com}/$repo/raw/$asset_branch/$asset_svg_path"
fi
export CI_MEASUREMENT_PR_COMMENT_CHART_URL="$chart_url"
+ export CI_MEASUREMENT_PR_COMMENT_CHART_SOURCE_URL="$chart_source_url"
cat > "$renderer_script" <<'EOF'
import { readFileSync, writeFileSync } from 'node:fs'
@@ -3447,6 +3452,7 @@ jobs:
const workflow = process.env.GITHUB_WORKFLOW || 'CI'
const job = process.env.GITHUB_JOB || ''
const chartUrl = process.env.CI_MEASUREMENT_PR_COMMENT_CHART_URL || ''
+ const chartSourceUrl = process.env.CI_MEASUREMENT_PR_COMMENT_CHART_SOURCE_URL || ''
const marker = ''
const statePrefix = ''
const statePrefix = ''
@@ -3746,7 +3755,7 @@ jobs:
return text.slice(0, Math.max(0, maxLength - 3)) + '...'
}
- const renderPerfChangeSvg = (rows) => {
+ const renderPerfChangeSvg = (rows, theme = 'adaptive') => {
const chartRows = rows
.filter((row) => row.observation?.unit === 'seconds')
.filter((row) => typeof row.current === 'number' && typeof row.baseline === 'number')
@@ -3773,29 +3782,47 @@ jobs:
const topY = 92
const barHeight = 18
const zeroX = plotX + ((0 - lower) / span) * plotWidth
+ const themeCss = theme === 'dark'
+ ? [
+ ' .chart-bg { fill: #0d1117; }',
+ ' .chart-border { fill: none; stroke: #30363d; }',
+ ' .chart-title { fill: #f0f6fc; }',
+ ' .chart-muted { fill: #8b949e; }',
+ ' .chart-axis { stroke: #8b949e; }',
+ ' .chart-label { fill: #c9d1d9; }',
+ ' .chart-value { fill: #8b949e; }',
+ ' .chart-track { fill: #21262d; }',
+ ]
+ : [
+ ' .chart-bg { fill: #ffffff; }',
+ ' .chart-border { fill: none; stroke: #d0d7de; }',
+ ' .chart-title { fill: #24292f; }',
+ ' .chart-muted { fill: #57606a; }',
+ ' .chart-axis { stroke: #8c959f; }',
+ ' .chart-label { fill: #24292f; }',
+ ' .chart-value { fill: #57606a; }',
+ ' .chart-track { fill: #f6f8fa; }',
+ ...(theme === 'adaptive'
+ ? [
+ ' @media (prefers-color-scheme: dark) {',
+ ' .chart-bg { fill: #0d1117; }',
+ ' .chart-border { stroke: #30363d; }',
+ ' .chart-title { fill: #f0f6fc; }',
+ ' .chart-muted { fill: #8b949e; }',
+ ' .chart-axis { stroke: #8b949e; }',
+ ' .chart-label { fill: #c9d1d9; }',
+ ' .chart-value { fill: #8b949e; }',
+ ' .chart-track { fill: #21262d; }',
+ ' }',
+ ]
+ : []),
+ ]
const svg = [
'',
'',
)
return svg.join('\n')
@@ -5564,6 +5629,7 @@ jobs:
current: formatValue(row.current, row.observation?.unit),
delta: formatDelta(row.delta, row.observation?.unit),
ratio: formatRatio(row.ratio),
+ impact: formatSemanticImpact(row.semanticImpactScore),
})),
}
const hasComparableHistory = (run) => Array.isArray(run.visibleRows) && run.visibleRows.some((row) =>
@@ -5624,7 +5690,7 @@ jobs:
'- Protocol: ' + protocolLabel,
'',
hasComparableBaseline
- ? 'Chart: bars show percentage change. Gate decisions use configured budgets and robust current/baseline noise bands.'
+ ? 'Chart: bars show semantic impact. A value of 0 means the raw change is not actionable for this PR; raw percentage and nominal values stay in the table.'
: 'No compatible baseline was available, so this run shows current measurements only.',
'',
chartMarkdown,
diff --git a/genie/ci-scripts/ci-measurement-comparison.test.sh b/genie/ci-scripts/ci-measurement-comparison.test.sh
index c327ff396..02b4ac0fd 100755
--- a/genie/ci-scripts/ci-measurement-comparison.test.sh
+++ b/genie/ci-scripts/ci-measurement-comparison.test.sh
@@ -99,8 +99,10 @@ run_compare
actual_status="$(jq -r '.status' "$tmp_dir/comparison.json")"
actual_row="$(jq -r '.comparisons[] | .status' "$tmp_dir/comparison.json")"
actual_enforceable="$(jq -r '.readiness.enforceable' "$tmp_dir/comparison.json")"
-if [ "$actual_status" != "fail" ] || [ "$actual_row" != "fail" ] || [ "$actual_enforceable" != "true" ]; then
- echo "expected confirmed regression to fail and be enforceable; got status=$actual_status row=$actual_row enforceable=$actual_enforceable" >&2
+actual_impact="$(jq -r '.comparisons[] | .semanticImpactScore' "$tmp_dir/comparison.json")"
+actual_impact_kind="$(jq -r '.comparisons[] | .semanticImpactKind' "$tmp_dir/comparison.json")"
+if [ "$actual_status" != "fail" ] || [ "$actual_row" != "fail" ] || [ "$actual_enforceable" != "true" ] || [ "$actual_impact_kind" != "fail_boundary" ] || ! awk "BEGIN { exit !($actual_impact > 1) }"; then
+ echo "expected confirmed regression to fail and have fail-boundary impact; got status=$actual_status row=$actual_row enforceable=$actual_enforceable impact=$actual_impact kind=$actual_impact_kind" >&2
exit 1
fi
@@ -119,8 +121,10 @@ actual_row="$(jq -r '.comparisons[] | .status' "$tmp_dir/comparison.json")"
actual_confidence="$(jq -r '.comparisons[] | .confidence' "$tmp_dir/comparison.json")"
actual_current_lower="$(jq -r '.comparisons[] | .currentRobustLower' "$tmp_dir/comparison.json")"
actual_baseline_upper="$(jq -r '.comparisons[] | .baselineRobustUpper' "$tmp_dir/comparison.json")"
-if [ "$actual_status" != "pass" ] || [ "$actual_row" != "pass" ] || [ "$actual_confidence" != "within_robust_band" ] || ! awk "BEGIN { exit !($actual_current_lower <= $actual_baseline_upper) }"; then
- echo "expected overlapping current/baseline robust bands to pass; got status=$actual_status row=$actual_row confidence=$actual_confidence currentLower=$actual_current_lower baselineUpper=$actual_baseline_upper" >&2
+actual_impact="$(jq -r '.comparisons[] | .semanticImpactScore' "$tmp_dir/comparison.json")"
+actual_impact_kind="$(jq -r '.comparisons[] | .semanticImpactKind' "$tmp_dir/comparison.json")"
+if [ "$actual_status" != "pass" ] || [ "$actual_row" != "pass" ] || [ "$actual_confidence" != "within_robust_band" ] || [ "$actual_impact" != "0" ] || [ "$actual_impact_kind" != "neutral" ] || ! awk "BEGIN { exit !($actual_current_lower <= $actual_baseline_upper) }"; then
+ echo "expected overlapping current/baseline robust bands to pass with neutral impact; got status=$actual_status row=$actual_row confidence=$actual_confidence impact=$actual_impact kind=$actual_impact_kind currentLower=$actual_current_lower baselineUpper=$actual_baseline_upper" >&2
exit 1
fi
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index 5caaed2dd..783adb6f7 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -1605,6 +1605,8 @@ jq -n \
| ($baseline - $robustTolerance) as $robustLower
| ($current + $currentRobustTolerance) as $currentRobustUpper
| ($current - $currentRobustTolerance) as $currentRobustLower
+ | ([($b.warnAbs // 0), (if $baseline > 0 then ($baseline * (($b.warnRatio // 1) - 1)) else 0 end), $noise, 0.000000001] | max) as $warnBudget
+ | ([($b.failAbs // 0), (if $baseline > 0 then ($baseline * (($b.failRatio // 1) - 1)) else 0 end), $noise, 0.000000001] | max) as $failBudget
| (
($current >= $robustLower and $current <= $robustUpper)
or ($currentRobustTolerance > 0 and $currentRobustLower <= $robustUpper and $currentRobustUpper >= $robustLower)
@@ -1661,7 +1663,27 @@ jq -n \
else "regressed"
end
) as $direction
- | {status:$status,current:$current,baseline:$baseline,delta:$delta,ratio:$ratio,budget:$b,gatePolicy:$policy,gateable:$gateable,gateReason:$gateReason,confidence:$confidence,direction:$direction,baselineRobustLower:$robustLower,baselineRobustUpper:$robustUpper,baselineRobustTolerance:$robustTolerance,currentRobustLower:$currentRobustLower,currentRobustUpper:$currentRobustUpper,currentRobustTolerance:$currentRobustTolerance,withinBaselineRange:$withinBaselineRange};
+ | (
+ if $baseline <= 0 then null
+ elif (policy_enabled($policy) != true) then 0
+ elif $withinRobustBand then 0
+ elif ($delta | abs_value) <= $noise then 0
+ elif ($confidence == "threshold_exceeded" and $delta > 0) then ([0, ($currentRobustLower - $robustUpper), $delta] | max) / $warnBudget
+ elif ($confidence == "threshold_exceeded" and $delta < 0) then -(([0, ($robustLower - $currentRobustUpper), (-$delta)] | max) / $warnBudget)
+ elif $delta > 0 then ([0, ($currentRobustLower - $robustUpper)] | max) / $warnBudget
+ else -(([0, ($robustLower - $currentRobustUpper)] | max) / $warnBudget)
+ end
+ ) as $semanticImpactScore
+ | (
+ if $semanticImpactScore == null then "unknown"
+ elif $semanticImpactScore == 0 then "neutral"
+ elif $semanticImpactScore >= ($failBudget / $warnBudget) then "fail_boundary"
+ elif $semanticImpactScore >= 1 then "warn_boundary"
+ elif $semanticImpactScore > 0 then "below_warn_boundary"
+ else "improvement"
+ end
+ ) as $semanticImpactKind
+ | {status:$status,current:$current,baseline:$baseline,delta:$delta,ratio:$ratio,budget:$b,gatePolicy:$policy,gateable:$gateable,gateReason:$gateReason,confidence:$confidence,direction:$direction,semanticImpactScore:$semanticImpactScore,semanticImpactKind:$semanticImpactKind,semanticWarnBudget:$warnBudget,semanticFailBudget:$failBudget,baselineRobustLower:$robustLower,baselineRobustUpper:$robustUpper,baselineRobustTolerance:$robustTolerance,currentRobustLower:$currentRobustLower,currentRobustUpper:$currentRobustUpper,currentRobustTolerance:$currentRobustTolerance,withinBaselineRange:$withinBaselineRange};
(observations_by_key($current[0]) | with_entries(.value = observation_stats(.value))) as $currentObs
| (observations_by_key($baseline[0]) | with_entries(.value = observation_stats(.value))) as $baselineObs
@@ -2012,6 +2034,13 @@ const formatRatio = (value) => {
return formatNumber(Math.round((value - 1) * 1000) / 10) + '%'
}
+const formatSemanticImpact = (value) => {
+ if (value === null || value === undefined || Number.isNaN(value)) return 'n/a'
+ if (Math.abs(value) < 0.005) return '0.00x'
+ const sign = value > 0 ? '+' : ''
+ return sign + formatNumber(Math.round(value * 100) / 100) + 'x'
+}
+
const interpretation = (row) => {
if (row.confidence === 'low_baseline_count') return {
label: 'Needs more baseline',
@@ -2061,7 +2090,7 @@ const interpretation = (row) => {
tone: 'neutral',
color: '#94a3b8',
}
- if (row.confidence === 'within_baseline_distribution') return {
+ if (row.confidence === 'within_robust_band' || row.confidence === 'within_baseline_distribution') return {
label: 'Within noise band',
detail: 'Current and baseline robust noise bands overlap.',
tone: 'neutral',
@@ -2192,8 +2221,8 @@ const visibleRows = (hasComparableBaseline
const comparisonTable = (rows) => {
if (rows.length === 0) return 'No measurement regressions detected.'
return [
- '| Group | Measurement | Baseline | Current | Change | Meaning | Gate | Evidence |',
- '| --- | --- | ---: | ---: | ---: | --- | --- | --- |',
+ '| Group | Measurement | Baseline | Current | Raw change | Impact | Meaning | Gate | Evidence |',
+ '| --- | --- | ---: | ---: | ---: | ---: | --- | --- | --- |',
...rows.map((row) => {
const unit = row.observation?.unit
const baselineRange = typeof row.baselineRobustLower === 'number' && typeof row.baselineRobustUpper === 'number' && row.baselineRobustLower !== row.baselineRobustUpper
@@ -2208,6 +2237,7 @@ const comparisonTable = (rows) => {
formatValue(row.baseline, unit) + baselineRange,
formatValue(row.current, unit),
formatDelta(row.delta, unit) + ' / ' + formatRatio(row.ratio),
+ formatSemanticImpact(row.semanticImpactScore),
meaning.label + '
' + meaning.detail + '',
formatGate(row),
(row.confidence || 'unknown') + '
baseline n=' + (row.baselineSources ?? 0) + ', current samples=' + (row.currentSamples ?? 1) + '',
@@ -2230,8 +2260,8 @@ const currentOnlyTable = (rows) => {
const allMeasurementsTable = (rows) => {
if (rows.length === 0) return 'No measurement regressions detected.'
return [
- '| Status | Gate | Target | Observation | Dimensions | Baseline | Current | Delta | Ratio |',
- '| --- | --- | --- | --- | --- | ---: | ---: | ---: | ---: |',
+ '| Status | Gate | Target | Observation | Dimensions | Baseline | Current | Delta | Ratio | Impact |',
+ '| --- | --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: |',
...rows.map((row) => {
const unit = row.observation?.unit
return '| ' + [
@@ -2244,6 +2274,7 @@ const allMeasurementsTable = (rows) => {
formatValue(row.current, unit),
formatDelta(row.delta, unit),
formatRatio(row.ratio),
+ formatSemanticImpact(row.semanticImpactScore),
].map(escapeCell).join(' | ') + ' |'
}),
].join('\n')
@@ -2259,16 +2290,16 @@ const truncate = (value, maxLength) => {
const renderPerfChangeSvg = (rows, theme = 'adaptive') => {
const chartRows = rows
.filter((row) => typeof row.current === 'number' && typeof row.baseline === 'number')
- .filter((row) => typeof row.ratio === 'number')
- .sort((left, right) => ((left.ratio || 1) - 1) - ((right.ratio || 1) - 1))
+ .filter((row) => typeof row.semanticImpactScore === 'number')
+ .sort((left, right) => (left.semanticImpactScore || 0) - (right.semanticImpactScore || 0))
.slice(0, visibleLimit)
if (chartRows.length === 0) return ''
- const percentages = chartRows.map((row) => ((row.ratio || 1) - 1) * 100)
- const minPct = Math.min(-1, ...percentages)
- const maxPct = Math.max(1, ...percentages)
- const lower = Math.floor(minPct)
- const upper = Math.ceil(maxPct)
+ const impactScores = chartRows.map((row) => row.semanticImpactScore || 0)
+ const minImpact = Math.min(-1, ...impactScores)
+ const maxImpact = Math.max(1, ...impactScores)
+ const lower = Math.floor(minImpact)
+ const upper = Math.ceil(maxImpact)
const span = upper - lower || 1
const width = 1040
const rowHeight = 46
@@ -2276,7 +2307,7 @@ const renderPerfChangeSvg = (rows, theme = 'adaptive') => {
const labelX = 230
const plotX = 252
const plotWidth = 320
- const percentX = 596
+ const impactX = 596
const nominalX = 672
const meaningX = 804
const topY = 92
@@ -2326,23 +2357,24 @@ const renderPerfChangeSvg = (rows, theme = 'adaptive') => {
'',
'',
'',
- 'Measurement change vs baseline',
- 'Bars show percent change; meaning explains whether the number is actionable.',
- 'lower',
- 'higher',
+ 'Actionable measurement impact',
+ '0 means no actionable PR impact; 1x reaches the warning budget.',
+ 'improved',
+ 'regressed',
+ 'impact',
'baseline -> current',
'meaning',
'',
]
for (const [index, row] of chartRows.entries()) {
- const pct = ((row.ratio || 1) - 1) * 100
+ const impact = row.semanticImpactScore || 0
const y = topY + index * rowHeight
- const valueWidth = Math.max(2, Math.abs(pct) / span * plotWidth)
- const x = pct < 0 ? zeroX - valueWidth : zeroX
+ const valueWidth = Math.max(2, Math.abs(impact) / span * plotWidth)
+ const x = impact < 0 ? zeroX - valueWidth : zeroX
const meaning = interpretation(row)
const color = meaning.color
- const formattedPct = (pct > 0 ? '+' : '') + formatNumber(Math.round(pct * 10) / 10) + '%'
+ const formattedImpact = formatSemanticImpact(impact)
const label = chartProbe(row)
const nominal = formatValue(row.baseline, row.observation?.unit).replaceAll(' ', '') + ' -> ' + formatValue(row.current, row.observation?.unit).replaceAll(' ', '')
const barOpacity = meaning.tone === 'neutral' ? '0.65' : '1'
@@ -2351,14 +2383,14 @@ const renderPerfChangeSvg = (rows, theme = 'adaptive') => {
'' + escapeXml(label) + '' + escapeXml(truncate(label, 28)) + '',
'',
'',
- '' + escapeXml(formattedPct) + '',
+ '' + escapeXml(formattedImpact) + '',
'' + escapeXml(nominal) + '' + escapeXml(truncate(nominal, 21)) + '',
'' + escapeXml(meaning.detail) + '' + escapeXml(truncate(meaning.label, 30)) + '',
)
}
svg.push(
- '0%',
+ '0',
'',
)
return svg.join('\n')
@@ -2392,6 +2424,7 @@ const currentRun = {
current: formatValue(row.current, row.observation?.unit),
delta: formatDelta(row.delta, row.observation?.unit),
ratio: formatRatio(row.ratio),
+ impact: formatSemanticImpact(row.semanticImpactScore),
})),
}
const hasComparableHistory = (run) => Array.isArray(run.visibleRows) && run.visibleRows.some((row) =>
@@ -2452,7 +2485,7 @@ const summaryLines = [
'- Protocol: ' + protocolLabel,
'',
hasComparableBaseline
- ? 'Chart: bars show percentage change. Gate decisions use configured budgets and robust current/baseline noise bands.'
+ ? 'Chart: bars show semantic impact. A value of 0 means the raw change is not actionable for this PR; raw percentage and nominal values stay in the table.'
: 'No compatible baseline was available, so this run shows current measurements only.',
'',
chartMarkdown,
diff --git a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
index 84101cd1b..a76233b0a 100644
--- a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
+++ b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
@@ -493,16 +493,16 @@ describe('ci workflow devenv perf helpers', () => {
'chart_dark_png_file="$comment_tmp_dir/perf-change-vs-baseline-dark.png"',
)
expect(ciWorkflowSource).toContain(
- 'Chart: bars show percentage change. Gate decisions use configured budgets and robust current/baseline noise bands.',
+ 'Chart: bars show semantic impact. A value of 0 means the raw change is not actionable for this PR; raw percentage and nominal values stay in the table.',
)
expect(ciWorkflowSource).toContain(
- '| Group | Measurement | Baseline | Current | Change | Meaning | Gate | Evidence |',
+ '| Group | Measurement | Baseline | Current | Raw change | Impact | Meaning | Gate | Evidence |',
)
expect(ciWorkflowSource).toContain("'- Readiness: ' + readinessLabel")
expect(ciWorkflowSource).toContain('renderPerfChangeSvg')
- expect(ciWorkflowSource).toContain('Perf change vs baseline')
+ expect(ciWorkflowSource).toContain('Actionable measurement impact')
expect(ciWorkflowSource).toContain(
- 'Bars show percent change; meaning explains whether the number is actionable.',
+ '0 means no actionable PR impact; 1x reaches the warning budget.',
)
expect(ciWorkflowSource).toContain('@media (prefers-color-scheme: dark)')
expect(ciWorkflowSource).toContain('.chart-bg { fill: #0d1117; }')
From c9b324e189244a70fd81ff857cbbb575ea61a446 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Tue, 19 May 2026 02:56:58 +0200
Subject: [PATCH 37/81] Clarify diagnostic measurement impact
---
.github/workflows/ci.yml | 34 +++++++++++++++++++++++--------
genie/ci-workflow/measurements.ts | 17 ++++++++++++----
2 files changed, 39 insertions(+), 12 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 2226e9d50..4010191a6 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -3233,7 +3233,7 @@ jobs:
) as $direction
| (
if $baseline <= 0 then null
- elif (policy_enabled($policy) != true) then 0
+ elif (policy_enabled($policy) != true) then null
elif $withinRobustBand then 0
elif ($delta | abs_value) <= $noise then 0
elif ($confidence == "threshold_exceeded" and $delta > 0) then ([0, ($currentRobustLower - $robustUpper), $delta] | max) / $warnBudget
@@ -3243,7 +3243,8 @@ jobs:
end
) as $semanticImpactScore
| (
- if $semanticImpactScore == null then "unknown"
+ if (policy_enabled($policy) != true) then "diagnostic"
+ elif $semanticImpactScore == null then "unknown"
elif $semanticImpactScore == 0 then "neutral"
elif $semanticImpactScore >= ($failBudget / $warnBudget) then "fail_boundary"
elif $semanticImpactScore >= 1 then "warn_boundary"
@@ -3609,6 +3610,13 @@ jobs:
return sign + formatNumber(Math.round(value * 100) / 100) + 'x'
}
+ const formatRowImpact = (row) => {
+ if (row.confidence === 'diagnostic' || row.gateReason === 'disabled' || row.semanticImpactKind === 'diagnostic') {
+ return 'diagnostic'
+ }
+ return formatSemanticImpact(row.semanticImpactScore)
+ }
+
const interpretation = (row) => {
if (row.confidence === 'low_baseline_count') return {
label: 'Needs more baseline',
@@ -3805,7 +3813,7 @@ jobs:
formatValue(row.baseline, unit) + baselineRange,
formatValue(row.current, unit),
formatDelta(row.delta, unit) + ' / ' + formatRatio(row.ratio),
- formatSemanticImpact(row.semanticImpactScore),
+ formatRowImpact(row),
meaning.label + '
' + meaning.detail + '',
formatGate(row),
(row.confidence || 'unknown') + '
baseline n=' + (row.baselineSources ?? 0) + ', current samples=' + (row.currentSamples ?? 1) + '',
@@ -3842,7 +3850,7 @@ jobs:
formatValue(row.current, unit),
formatDelta(row.delta, unit),
formatRatio(row.ratio),
- formatSemanticImpact(row.semanticImpactScore),
+ formatRowImpact(row),
].map(escapeCell).join(' | ') + ' |'
}),
].join('\n')
@@ -3858,6 +3866,7 @@ jobs:
const renderPerfChangeSvg = (rows, theme = 'adaptive') => {
const chartRows = rows
.filter((row) => typeof row.current === 'number' && typeof row.baseline === 'number')
+ .filter((row) => row.gateable === true)
.filter((row) => typeof row.semanticImpactScore === 'number')
.sort((left, right) => (left.semanticImpactScore || 0) - (right.semanticImpactScore || 0))
.slice(0, visibleLimit)
@@ -4870,7 +4879,7 @@ jobs:
) as $direction
| (
if $baseline <= 0 then null
- elif (policy_enabled($policy) != true) then 0
+ elif (policy_enabled($policy) != true) then null
elif $withinRobustBand then 0
elif ($delta | abs_value) <= $noise then 0
elif ($confidence == "threshold_exceeded" and $delta > 0) then ([0, ($currentRobustLower - $robustUpper), $delta] | max) / $warnBudget
@@ -4880,7 +4889,8 @@ jobs:
end
) as $semanticImpactScore
| (
- if $semanticImpactScore == null then "unknown"
+ if (policy_enabled($policy) != true) then "diagnostic"
+ elif $semanticImpactScore == null then "unknown"
elif $semanticImpactScore == 0 then "neutral"
elif $semanticImpactScore >= ($failBudget / $warnBudget) then "fail_boundary"
elif $semanticImpactScore >= 1 then "warn_boundary"
@@ -5246,6 +5256,13 @@ jobs:
return sign + formatNumber(Math.round(value * 100) / 100) + 'x'
}
+ const formatRowImpact = (row) => {
+ if (row.confidence === 'diagnostic' || row.gateReason === 'disabled' || row.semanticImpactKind === 'diagnostic') {
+ return 'diagnostic'
+ }
+ return formatSemanticImpact(row.semanticImpactScore)
+ }
+
const interpretation = (row) => {
if (row.confidence === 'low_baseline_count') return {
label: 'Needs more baseline',
@@ -5442,7 +5459,7 @@ jobs:
formatValue(row.baseline, unit) + baselineRange,
formatValue(row.current, unit),
formatDelta(row.delta, unit) + ' / ' + formatRatio(row.ratio),
- formatSemanticImpact(row.semanticImpactScore),
+ formatRowImpact(row),
meaning.label + '
' + meaning.detail + '',
formatGate(row),
(row.confidence || 'unknown') + '
baseline n=' + (row.baselineSources ?? 0) + ', current samples=' + (row.currentSamples ?? 1) + '',
@@ -5479,7 +5496,7 @@ jobs:
formatValue(row.current, unit),
formatDelta(row.delta, unit),
formatRatio(row.ratio),
- formatSemanticImpact(row.semanticImpactScore),
+ formatRowImpact(row),
].map(escapeCell).join(' | ') + ' |'
}),
].join('\n')
@@ -5495,6 +5512,7 @@ jobs:
const renderPerfChangeSvg = (rows, theme = 'adaptive') => {
const chartRows = rows
.filter((row) => typeof row.current === 'number' && typeof row.baseline === 'number')
+ .filter((row) => row.gateable === true)
.filter((row) => typeof row.semanticImpactScore === 'number')
.sort((left, right) => (left.semanticImpactScore || 0) - (right.semanticImpactScore || 0))
.slice(0, visibleLimit)
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index 783adb6f7..4720a8e79 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -1665,7 +1665,7 @@ jq -n \
) as $direction
| (
if $baseline <= 0 then null
- elif (policy_enabled($policy) != true) then 0
+ elif (policy_enabled($policy) != true) then null
elif $withinRobustBand then 0
elif ($delta | abs_value) <= $noise then 0
elif ($confidence == "threshold_exceeded" and $delta > 0) then ([0, ($currentRobustLower - $robustUpper), $delta] | max) / $warnBudget
@@ -1675,7 +1675,8 @@ jq -n \
end
) as $semanticImpactScore
| (
- if $semanticImpactScore == null then "unknown"
+ if (policy_enabled($policy) != true) then "diagnostic"
+ elif $semanticImpactScore == null then "unknown"
elif $semanticImpactScore == 0 then "neutral"
elif $semanticImpactScore >= ($failBudget / $warnBudget) then "fail_boundary"
elif $semanticImpactScore >= 1 then "warn_boundary"
@@ -2041,6 +2042,13 @@ const formatSemanticImpact = (value) => {
return sign + formatNumber(Math.round(value * 100) / 100) + 'x'
}
+const formatRowImpact = (row) => {
+ if (row.confidence === 'diagnostic' || row.gateReason === 'disabled' || row.semanticImpactKind === 'diagnostic') {
+ return 'diagnostic'
+ }
+ return formatSemanticImpact(row.semanticImpactScore)
+}
+
const interpretation = (row) => {
if (row.confidence === 'low_baseline_count') return {
label: 'Needs more baseline',
@@ -2237,7 +2245,7 @@ const comparisonTable = (rows) => {
formatValue(row.baseline, unit) + baselineRange,
formatValue(row.current, unit),
formatDelta(row.delta, unit) + ' / ' + formatRatio(row.ratio),
- formatSemanticImpact(row.semanticImpactScore),
+ formatRowImpact(row),
meaning.label + '
' + meaning.detail + '',
formatGate(row),
(row.confidence || 'unknown') + '
baseline n=' + (row.baselineSources ?? 0) + ', current samples=' + (row.currentSamples ?? 1) + '',
@@ -2274,7 +2282,7 @@ const allMeasurementsTable = (rows) => {
formatValue(row.current, unit),
formatDelta(row.delta, unit),
formatRatio(row.ratio),
- formatSemanticImpact(row.semanticImpactScore),
+ formatRowImpact(row),
].map(escapeCell).join(' | ') + ' |'
}),
].join('\n')
@@ -2290,6 +2298,7 @@ const truncate = (value, maxLength) => {
const renderPerfChangeSvg = (rows, theme = 'adaptive') => {
const chartRows = rows
.filter((row) => typeof row.current === 'number' && typeof row.baseline === 'number')
+ .filter((row) => row.gateable === true)
.filter((row) => typeof row.semanticImpactScore === 'number')
.sort((left, right) => (left.semanticImpactScore || 0) - (right.semanticImpactScore || 0))
.slice(0, visibleLimit)
From c3a9b9c21d3b7197f2e527237c4db536cfc00632 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Tue, 19 May 2026 07:33:01 +0200
Subject: [PATCH 38/81] Require paired evidence for wall-clock gates
---
.github/workflows/ci.yml | 83 +++++++++----
.github/workflows/ci.yml.genie.ts | 5 +-
context/ci-measurements.md | 111 ++++++++++++++++++
.../ci-measurement-comparison.test.sh | 16 +++
genie/ci-workflow/measurements.ts | 46 ++++++--
.../ci-workflow-helpers.unit.test.ts | 4 +-
6 files changed, 234 insertions(+), 31 deletions(-)
create mode 100644 context/ci-measurements.md
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 4010191a6..910d8f819 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2837,13 +2837,13 @@ jobs:
}
measure 'shell_eval_traced' 'Shell eval with OTEL trace' 'devenv shell' 'Evaluates the dev shell with native devenv JSON tracing enabled.' '$ARTIFACT_DIR/traces/shell_eval_traced.json' '0' '1' '{"enabled":false,"minBaselineSources":10,"minCurrentSamples":3,"warnRatio":1.25,"failRatio":1.5,"warnAbs":1.5,"failAbs":3,"noiseFloor":0.5,"statisticalToleranceRatio":0.2,"statisticalToleranceAbs":1}' '$DEVENV_SHELL_TRACE_COMMAND'
- measure 'shell_eval_warm' 'Warm shell eval' 'devenv shell' 'Evaluates a warm dev shell without reloading direnv state.' '' '1' '5' '{"enabled":true,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":1,"noiseFloor":0.1,"statisticalToleranceRatio":0.1,"statisticalToleranceAbs":0.25}' '$DEVENV_BIN' 'shell' '--no-reload' '--' 'true'
- measure 'tasks_list' 'devenv tasks list' 'devenv cli' 'Lists devenv tasks to measure task graph loading overhead.' '' '1' '9' '{"enabled":true,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.25,"failRatio":1.5,"warnAbs":0.05,"failAbs":0.15,"noiseFloor":0.03,"statisticalToleranceRatio":0.1,"statisticalToleranceAbs":0.03}' '$DEVENV_BIN' 'tasks' 'list'
- measure 'processes_help' 'devenv processes --help' 'devenv cli' 'Loads the devenv processes command help path.' '' '1' '9' '{"enabled":true,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.25,"failRatio":1.5,"warnAbs":0.05,"failAbs":0.15,"noiseFloor":0.03,"statisticalToleranceRatio":0.1,"statisticalToleranceAbs":0.03}' '$DEVENV_BIN' 'processes' '--help'
- measure 'task_pnpm_install' 'pnpm install task' 'workspace setup' 'Runs the cached pnpm install devenv task.' '' '1' '5' '{"enabled":true,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":1,"noiseFloor":0.1,"statisticalToleranceRatio":0.1,"statisticalToleranceAbs":0.25}' '$DEVENV_BIN' 'tasks' 'run' 'pnpm:install' '--mode' 'before' '--no-tui' '--show-output'
- measure 'task_genie_run' 'Genie run task' 'genie' 'Runs the normal devenv genie:run task including its declared dependencies.' '' '1' '5' '{"enabled":true,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":1,"noiseFloor":0.1,"statisticalToleranceRatio":0.1,"statisticalToleranceAbs":0.25}' '$DEVENV_BIN' 'tasks' 'run' 'genie:run' '--mode' 'before' '--no-tui' '--show-output'
- measure 'task_check_quick' 'Quick check task' 'quality gates' 'Runs the fast local quality gate through devenv.' '' '1' '5' '{"enabled":true,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":1,"noiseFloor":0.1,"statisticalToleranceRatio":0.1,"statisticalToleranceAbs":0.25}' '$DEVENV_BIN' 'tasks' 'run' 'check:quick' '--mode' 'before' '--no-tui' '--show-output'
- measure 'genie_check_direct' 'Genie check direct' 'genie' 'Runs Genie directly in check mode to isolate generator runtime from devenv task dependency overhead.' '' '1' '5' '{"enabled":true,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":1,"noiseFloor":0.1,"statisticalToleranceRatio":0.1,"statisticalToleranceAbs":0.25}' '$DEVENV_BIN' 'shell' '--no-reload' '--' 'bun' 'packages/@overeng/genie/bin/genie.tsx' '--output' 'ci-plain' '--check'
+ measure 'shell_eval_warm' 'Warm shell eval' 'devenv shell' 'Evaluates a warm dev shell without reloading direnv state.' '' '1' '5' '{"enabled":true,"comparisonMode":"paired","minPairedSamples":5,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":1,"noiseFloor":0.1,"statisticalToleranceRatio":0.1,"statisticalToleranceAbs":0.25}' '$DEVENV_BIN' 'shell' '--no-reload' '--' 'true'
+ measure 'tasks_list' 'devenv tasks list' 'devenv cli' 'Lists devenv tasks to measure task graph loading overhead.' '' '1' '9' '{"enabled":true,"comparisonMode":"paired","minPairedSamples":7,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.25,"failRatio":1.5,"warnAbs":0.05,"failAbs":0.15,"noiseFloor":0.03,"statisticalToleranceRatio":0.1,"statisticalToleranceAbs":0.03}' '$DEVENV_BIN' 'tasks' 'list'
+ measure 'processes_help' 'devenv processes --help' 'devenv cli' 'Loads the devenv processes command help path.' '' '1' '9' '{"enabled":true,"comparisonMode":"paired","minPairedSamples":7,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.25,"failRatio":1.5,"warnAbs":0.05,"failAbs":0.15,"noiseFloor":0.03,"statisticalToleranceRatio":0.1,"statisticalToleranceAbs":0.03}' '$DEVENV_BIN' 'processes' '--help'
+ measure 'task_pnpm_install' 'pnpm install task' 'workspace setup' 'Runs the cached pnpm install devenv task.' '' '1' '5' '{"enabled":true,"comparisonMode":"paired","minPairedSamples":5,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":1,"noiseFloor":0.1,"statisticalToleranceRatio":0.1,"statisticalToleranceAbs":0.25}' '$DEVENV_BIN' 'tasks' 'run' 'pnpm:install' '--mode' 'before' '--no-tui' '--show-output'
+ measure 'task_genie_run' 'Genie run task' 'genie' 'Runs the normal devenv genie:run task including its declared dependencies.' '' '1' '5' '{"enabled":true,"comparisonMode":"paired","minPairedSamples":5,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":1,"noiseFloor":0.1,"statisticalToleranceRatio":0.1,"statisticalToleranceAbs":0.25}' '$DEVENV_BIN' 'tasks' 'run' 'genie:run' '--mode' 'before' '--no-tui' '--show-output'
+ measure 'task_check_quick' 'Quick check task' 'quality gates' 'Runs the fast local quality gate through devenv.' '' '1' '5' '{"enabled":true,"comparisonMode":"paired","minPairedSamples":5,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":1,"noiseFloor":0.1,"statisticalToleranceRatio":0.1,"statisticalToleranceAbs":0.25}' '$DEVENV_BIN' 'tasks' 'run' 'check:quick' '--mode' 'before' '--no-tui' '--show-output'
+ measure 'genie_check_direct' 'Genie check direct' 'genie' 'Runs Genie directly in check mode to isolate generator runtime from devenv task dependency overhead.' '' '1' '5' '{"enabled":true,"comparisonMode":"paired","minPairedSamples":5,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":1,"noiseFloor":0.1,"statisticalToleranceRatio":0.1,"statisticalToleranceAbs":0.25}' '$DEVENV_BIN' 'shell' '--no-reload' '--' 'bun' 'packages/@overeng/genie/bin/genie.tsx' '--output' 'ci-plain' '--check'
printf ']\n' >>"$ARTIFACT_DIR/timings.json"
@@ -2929,6 +2929,7 @@ jobs:
label: .label,
group: .group,
description: .description,
+ measurementKind: (if (.gatePolicy.enabled == false) then "diagnostic" else "wall-clock" end),
name: ("devenv." + .id + ".duration"),
unit: "seconds",
value: (.durationMs / 1000),
@@ -2993,7 +2994,7 @@ jobs:
CI_MEASUREMENT_CURRENT_DIR: tmp/devenv-perf-ci
CI_MEASUREMENT_BASELINE_DIR: tmp/devenv-perf-ci/baseline
CI_MEASUREMENT_COMPARISON_FILE: tmp/devenv-perf-ci/measurement-comparison.json
- CI_MEASUREMENT_REGRESSION_MODE: fail
+ CI_MEASUREMENT_REGRESSION_MODE: warn
CI_MEASUREMENT_PR_COMMENT_ENABLED: 'true'
CI_MEASUREMENT_PR_COMMENT_TITLE: Devenv Performance
CI_MEASUREMENT_PR_COMMENT_MAX_ROWS: '8'
@@ -3103,6 +3104,7 @@ jobs:
| {
target: ($items[0].target // {}),
observation: ($items[-1].observation // {}),
+ measurementKind: ($items[-1].observation.measurementKind // null),
value: $median,
min: ($values | min),
max: ($values | max),
@@ -3112,6 +3114,7 @@ jobs:
mad: ($values | map(. - $median | if . < 0 then -. else . end) | median),
sourceCount: ($items | length),
sampleCount: $sampleCount,
+ pairedSampleCount: ($items | map(.observation.statistics.pairedSampleCount // .observation.comparison.pairedSampleCount // 0) | add // 0),
generatedAt: ($items[-1].generatedAt // null)
};
@@ -3139,8 +3142,10 @@ jobs:
| noise_floor($metric; $unit) as $noise
| $b + {
enabled:true,
- minBaselineSources:(if $metric == "nix.closure.nar_size" or $metric == "nix.closure.bucket.nar_size" or $metric == "nix.closure.path_count" then 3 else 10 end),
+ comparisonMode:(if $metric == "nix.closure.nar_size" or $metric == "nix.closure.bucket.nar_size" or $metric == "nix.closure.path_count" or $unit != "seconds" then "budget" else "historical" end),
+ minBaselineSources:(if $metric == "nix.closure.nar_size" or $metric == "nix.closure.bucket.nar_size" or $metric == "nix.closure.path_count" or $unit != "seconds" then 1 else 10 end),
minCurrentSamples:(if $unit == "seconds" then 3 else 1 end),
+ minPairedSamples:(if $unit == "seconds" then 5 else 0 end),
noiseFloor:$noise
};
def observation_policy($obs):
@@ -3148,8 +3153,9 @@ jobs:
def policy_enabled($policy):
if ($policy | has("enabled")) then $policy.enabled else true end;
- def classify($metric; $unit; $policy; $current; $currentP25; $currentP75; $currentMad; $baseline; $baselineMin; $baselineMax; $baselineP25; $baselineP75; $baselineP95; $baselineMad; $currentSamples; $baselineSources):
+ def classify($metric; $unit; $measurementKind; $policy; $current; $currentP25; $currentP75; $currentMad; $baseline; $baselineMin; $baselineMax; $baselineP25; $baselineP75; $baselineP95; $baselineMad; $currentSamples; $baselineSources; $pairedSamples):
$policy as $b
+ | ($policy.comparisonMode // (if $measurementKind == "deterministic" or $unit != "seconds" then "budget" elif $measurementKind == "diagnostic" then "diagnostic" else "historical" end)) as $comparisonMode
| ($policy.noiseFloor // noise_floor($metric; $unit)) as $noise
| ($current - $baseline) as $delta
| (if $baseline > 0 then ($current / $baseline) else null end) as $ratio
@@ -3197,12 +3203,14 @@ jobs:
and $baseline > 0
and $baselineSources >= ($policy.minBaselineSources // 1)
and $currentSamples >= ($policy.minCurrentSamples // 1)
+ and (if $comparisonMode == "paired" then $pairedSamples >= ($policy.minPairedSamples // 1) else true end)
) as $gateable
| (
if (policy_enabled($policy) != true) then "disabled"
elif $baseline <= 0 then "missing_baseline"
elif $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count"
+ elif $comparisonMode == "paired" and $pairedSamples < ($policy.minPairedSamples // 1) then "low_paired_sample_count"
else "eligible"
end
) as $gateReason
@@ -3212,7 +3220,8 @@ jobs:
elif ($delta | abs_value) <= $noise then "noise_floor"
elif $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count"
- elif ($thresholdStatus != "pass" and $withinRobustBand) then "within_robust_band"
+ elif $comparisonMode == "paired" and $pairedSamples < ($policy.minPairedSamples // 1) then "low_paired_sample_count"
+ elif ($comparisonMode == "historical" and $thresholdStatus != "pass" and $withinRobustBand) then "within_robust_band"
elif $thresholdStatus == "pass" then "within_budget"
else "threshold_exceeded"
end
@@ -3252,7 +3261,7 @@ jobs:
else "improvement"
end
) as $semanticImpactKind
- | {status:$status,current:$current,baseline:$baseline,delta:$delta,ratio:$ratio,budget:$b,gatePolicy:$policy,gateable:$gateable,gateReason:$gateReason,confidence:$confidence,direction:$direction,semanticImpactScore:$semanticImpactScore,semanticImpactKind:$semanticImpactKind,semanticWarnBudget:$warnBudget,semanticFailBudget:$failBudget,baselineRobustLower:$robustLower,baselineRobustUpper:$robustUpper,baselineRobustTolerance:$robustTolerance,currentRobustLower:$currentRobustLower,currentRobustUpper:$currentRobustUpper,currentRobustTolerance:$currentRobustTolerance,withinBaselineRange:$withinBaselineRange};
+ | {status:$status,current:$current,baseline:$baseline,delta:$delta,ratio:$ratio,budget:$b,gatePolicy:$policy,comparisonMode:$comparisonMode,gateable:$gateable,gateReason:$gateReason,confidence:$confidence,direction:$direction,semanticImpactScore:$semanticImpactScore,semanticImpactKind:$semanticImpactKind,semanticWarnBudget:$warnBudget,semanticFailBudget:$failBudget,baselineRobustLower:$robustLower,baselineRobustUpper:$robustUpper,baselineRobustTolerance:$robustTolerance,currentRobustLower:$currentRobustLower,currentRobustUpper:$currentRobustUpper,currentRobustTolerance:$currentRobustTolerance,withinBaselineRange:$withinBaselineRange,pairedSamples:$pairedSamples};
(observations_by_key($current[0]) | with_entries(.value = observation_stats(.value))) as $currentObs
| (observations_by_key($baseline[0]) | with_entries(.value = observation_stats(.value))) as $baselineObs
@@ -3284,6 +3293,7 @@ jobs:
classify(
$currentValue.observation.name;
$currentValue.observation.unit;
+ ($currentValue.observation.measurementKind // $currentValue.measurementKind);
($currentValue.observation | observation_policy(.));
$currentValue.value;
$currentValue.p25;
@@ -3297,7 +3307,8 @@ jobs:
$baselineValue.p95;
$baselineValue.mad;
$currentValue.sampleCount;
- $baselineValue.sourceCount
+ $baselineValue.sourceCount;
+ $currentValue.pairedSampleCount
) + {
target: $currentValue.target,
observation: $currentValue.observation,
@@ -3323,7 +3334,8 @@ jobs:
(if (.gatePolicy | has("enabled")) then .gatePolicy.enabled else true end)
and (.gateReason == "missing_baseline"
or .gateReason == "low_baseline_count"
- or .gateReason == "low_current_sample_count")
+ or .gateReason == "low_current_sample_count"
+ or .gateReason == "low_paired_sample_count")
) then "partial"
else "pass"
end
@@ -3335,7 +3347,8 @@ jobs:
gateableCount: (map(select(.gateable == true)) | length),
missingBaselineCount: (map(select(.gateReason == "missing_baseline")) | length),
lowBaselineCount: (map(select(.gateReason == "low_baseline_count")) | length),
- lowCurrentSampleCount: (map(select(.gateReason == "low_current_sample_count")) | length)
+ lowCurrentSampleCount: (map(select(.gateReason == "low_current_sample_count")) | length),
+ lowPairedSampleCount: (map(select(.gateReason == "low_paired_sample_count")) | length)
}
| . + {
nonGateableCount: (.enabledCount - .gateableCount),
@@ -3630,6 +3643,12 @@ jobs:
tone: 'neutral',
color: '#94a3b8',
}
+ if (row.confidence === 'low_paired_sample_count') return {
+ label: 'Needs paired evidence',
+ detail: 'Wall-clock gates require same-run base/head samples before they can block merges.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
if (row.confidence === 'diagnostic') return {
label: 'Diagnostic only',
detail: 'Shown for investigation, but intentionally excluded from gating.',
@@ -4551,6 +4570,7 @@ jobs:
group,
path: scopePath,
description: 'Tracked non-binary source lines in the configured scope.',
+ measurementKind: 'deterministic',
name: 'source.lines',
unit: 'lines',
value: lineCount,
@@ -4564,6 +4584,7 @@ jobs:
group,
path: scopePath,
description: 'Tracked non-binary source files in the configured scope.',
+ measurementKind: 'deterministic',
name: 'source.files',
unit: 'count',
value: measuredFileCount,
@@ -4749,6 +4770,7 @@ jobs:
| {
target: ($items[0].target // {}),
observation: ($items[-1].observation // {}),
+ measurementKind: ($items[-1].observation.measurementKind // null),
value: $median,
min: ($values | min),
max: ($values | max),
@@ -4758,6 +4780,7 @@ jobs:
mad: ($values | map(. - $median | if . < 0 then -. else . end) | median),
sourceCount: ($items | length),
sampleCount: $sampleCount,
+ pairedSampleCount: ($items | map(.observation.statistics.pairedSampleCount // .observation.comparison.pairedSampleCount // 0) | add // 0),
generatedAt: ($items[-1].generatedAt // null)
};
@@ -4785,8 +4808,10 @@ jobs:
| noise_floor($metric; $unit) as $noise
| $b + {
enabled:true,
- minBaselineSources:(if $metric == "nix.closure.nar_size" or $metric == "nix.closure.bucket.nar_size" or $metric == "nix.closure.path_count" then 3 else 10 end),
+ comparisonMode:(if $metric == "nix.closure.nar_size" or $metric == "nix.closure.bucket.nar_size" or $metric == "nix.closure.path_count" or $unit != "seconds" then "budget" else "historical" end),
+ minBaselineSources:(if $metric == "nix.closure.nar_size" or $metric == "nix.closure.bucket.nar_size" or $metric == "nix.closure.path_count" or $unit != "seconds" then 1 else 10 end),
minCurrentSamples:(if $unit == "seconds" then 3 else 1 end),
+ minPairedSamples:(if $unit == "seconds" then 5 else 0 end),
noiseFloor:$noise
};
def observation_policy($obs):
@@ -4794,8 +4819,9 @@ jobs:
def policy_enabled($policy):
if ($policy | has("enabled")) then $policy.enabled else true end;
- def classify($metric; $unit; $policy; $current; $currentP25; $currentP75; $currentMad; $baseline; $baselineMin; $baselineMax; $baselineP25; $baselineP75; $baselineP95; $baselineMad; $currentSamples; $baselineSources):
+ def classify($metric; $unit; $measurementKind; $policy; $current; $currentP25; $currentP75; $currentMad; $baseline; $baselineMin; $baselineMax; $baselineP25; $baselineP75; $baselineP95; $baselineMad; $currentSamples; $baselineSources; $pairedSamples):
$policy as $b
+ | ($policy.comparisonMode // (if $measurementKind == "deterministic" or $unit != "seconds" then "budget" elif $measurementKind == "diagnostic" then "diagnostic" else "historical" end)) as $comparisonMode
| ($policy.noiseFloor // noise_floor($metric; $unit)) as $noise
| ($current - $baseline) as $delta
| (if $baseline > 0 then ($current / $baseline) else null end) as $ratio
@@ -4843,12 +4869,14 @@ jobs:
and $baseline > 0
and $baselineSources >= ($policy.minBaselineSources // 1)
and $currentSamples >= ($policy.minCurrentSamples // 1)
+ and (if $comparisonMode == "paired" then $pairedSamples >= ($policy.minPairedSamples // 1) else true end)
) as $gateable
| (
if (policy_enabled($policy) != true) then "disabled"
elif $baseline <= 0 then "missing_baseline"
elif $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count"
+ elif $comparisonMode == "paired" and $pairedSamples < ($policy.minPairedSamples // 1) then "low_paired_sample_count"
else "eligible"
end
) as $gateReason
@@ -4858,7 +4886,8 @@ jobs:
elif ($delta | abs_value) <= $noise then "noise_floor"
elif $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count"
- elif ($thresholdStatus != "pass" and $withinRobustBand) then "within_robust_band"
+ elif $comparisonMode == "paired" and $pairedSamples < ($policy.minPairedSamples // 1) then "low_paired_sample_count"
+ elif ($comparisonMode == "historical" and $thresholdStatus != "pass" and $withinRobustBand) then "within_robust_band"
elif $thresholdStatus == "pass" then "within_budget"
else "threshold_exceeded"
end
@@ -4898,7 +4927,7 @@ jobs:
else "improvement"
end
) as $semanticImpactKind
- | {status:$status,current:$current,baseline:$baseline,delta:$delta,ratio:$ratio,budget:$b,gatePolicy:$policy,gateable:$gateable,gateReason:$gateReason,confidence:$confidence,direction:$direction,semanticImpactScore:$semanticImpactScore,semanticImpactKind:$semanticImpactKind,semanticWarnBudget:$warnBudget,semanticFailBudget:$failBudget,baselineRobustLower:$robustLower,baselineRobustUpper:$robustUpper,baselineRobustTolerance:$robustTolerance,currentRobustLower:$currentRobustLower,currentRobustUpper:$currentRobustUpper,currentRobustTolerance:$currentRobustTolerance,withinBaselineRange:$withinBaselineRange};
+ | {status:$status,current:$current,baseline:$baseline,delta:$delta,ratio:$ratio,budget:$b,gatePolicy:$policy,comparisonMode:$comparisonMode,gateable:$gateable,gateReason:$gateReason,confidence:$confidence,direction:$direction,semanticImpactScore:$semanticImpactScore,semanticImpactKind:$semanticImpactKind,semanticWarnBudget:$warnBudget,semanticFailBudget:$failBudget,baselineRobustLower:$robustLower,baselineRobustUpper:$robustUpper,baselineRobustTolerance:$robustTolerance,currentRobustLower:$currentRobustLower,currentRobustUpper:$currentRobustUpper,currentRobustTolerance:$currentRobustTolerance,withinBaselineRange:$withinBaselineRange,pairedSamples:$pairedSamples};
(observations_by_key($current[0]) | with_entries(.value = observation_stats(.value))) as $currentObs
| (observations_by_key($baseline[0]) | with_entries(.value = observation_stats(.value))) as $baselineObs
@@ -4930,6 +4959,7 @@ jobs:
classify(
$currentValue.observation.name;
$currentValue.observation.unit;
+ ($currentValue.observation.measurementKind // $currentValue.measurementKind);
($currentValue.observation | observation_policy(.));
$currentValue.value;
$currentValue.p25;
@@ -4943,7 +4973,8 @@ jobs:
$baselineValue.p95;
$baselineValue.mad;
$currentValue.sampleCount;
- $baselineValue.sourceCount
+ $baselineValue.sourceCount;
+ $currentValue.pairedSampleCount
) + {
target: $currentValue.target,
observation: $currentValue.observation,
@@ -4969,7 +5000,8 @@ jobs:
(if (.gatePolicy | has("enabled")) then .gatePolicy.enabled else true end)
and (.gateReason == "missing_baseline"
or .gateReason == "low_baseline_count"
- or .gateReason == "low_current_sample_count")
+ or .gateReason == "low_current_sample_count"
+ or .gateReason == "low_paired_sample_count")
) then "partial"
else "pass"
end
@@ -4981,7 +5013,8 @@ jobs:
gateableCount: (map(select(.gateable == true)) | length),
missingBaselineCount: (map(select(.gateReason == "missing_baseline")) | length),
lowBaselineCount: (map(select(.gateReason == "low_baseline_count")) | length),
- lowCurrentSampleCount: (map(select(.gateReason == "low_current_sample_count")) | length)
+ lowCurrentSampleCount: (map(select(.gateReason == "low_current_sample_count")) | length),
+ lowPairedSampleCount: (map(select(.gateReason == "low_paired_sample_count")) | length)
}
| . + {
nonGateableCount: (.enabledCount - .gateableCount),
@@ -5276,6 +5309,12 @@ jobs:
tone: 'neutral',
color: '#94a3b8',
}
+ if (row.confidence === 'low_paired_sample_count') return {
+ label: 'Needs paired evidence',
+ detail: 'Wall-clock gates require same-run base/head samples before they can block merges.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
if (row.confidence === 'diagnostic') return {
label: 'Diagnostic only',
detail: 'Shown for investigation, but intentionally excluded from gating.',
diff --git a/.github/workflows/ci.yml.genie.ts b/.github/workflows/ci.yml.genie.ts
index f17f90e1f..1fd33e14f 100644
--- a/.github/workflows/ci.yml.genie.ts
+++ b/.github/workflows/ci.yml.genie.ts
@@ -319,7 +319,10 @@ const extraJobs: Record = {
})),
],
baselineMaxRuns: 20,
- regressionMode: 'fail',
+ // Wall-clock measurements are advisory until they have paired same-run
+ // base/head evidence. Deterministic measurements such as closure sizes
+ // can still use budget-style gates in consuming repos.
+ regressionMode: 'warn',
env: ciMeasurementSubjectEnv,
setupSteps: baseSteps,
taskProbes: [
diff --git a/context/ci-measurements.md b/context/ci-measurements.md
new file mode 100644
index 000000000..4c0fb4999
--- /dev/null
+++ b/context/ci-measurements.md
@@ -0,0 +1,111 @@
+# CI Measurements
+
+This document specifies the shared CI measurement architecture used by generated workflows.
+
+## Status
+
+Active.
+
+## Measurement Classes
+
+| Class | Examples | Primary Question | Gate Model |
+| --- | --- | --- | --- |
+| `deterministic` | Nix closure size, source lines, file counts | Did a structural quantity exceed its budget? | Budget/diff against a comparable baseline. |
+| `wall-clock` | Devenv shell eval, task runtime, CLI command latency | Did this PR make this operation slower on the same runner conditions? | Paired same-run base/head samples before merge blocking. |
+| `diagnostic` | OTEL-traced shell eval, host context, trace breakdowns | Where did time go? | Never merge-blocking; explains measurements. |
+
+The class is part of the observation contract through `measurementKind`.
+The comparison policy is part of the gate contract through `comparisonMode`.
+
+## Observation Contract
+
+Every observation has a stable `id`, human-readable `label`, semantic `group`/`path`,
+numeric `value`, `unit`, `measurementKind`, and a gate `policy`.
+
+```json
+{
+ "id": "devenv.shell_eval_warm.duration",
+ "label": "Warm shell eval",
+ "measurementKind": "wall-clock",
+ "unit": "seconds",
+ "value": 6.067,
+ "policy": {
+ "enabled": true,
+ "comparisonMode": "paired",
+ "minPairedSamples": 5,
+ "minCurrentSamples": 5
+ }
+}
+```
+
+## Gate Semantics
+
+Deterministic observations use `comparisonMode: "budget"`.
+They require a comparable baseline and then evaluate configured absolute and
+relative budgets. Historical variance is not treated as statistical evidence.
+
+Wall-clock observations use `comparisonMode: "paired"` for enforced gates.
+They need same-run base/head evidence before they can block a merge. Historical
+baselines remain useful for trend context, but they do not prove PR causality.
+
+Historical wall-clock comparison may be used as an advisory transition mode.
+It can warn, visualize trends, and guide investigation, but it must not be the
+required merge gate for noisy runner-dependent timings.
+
+Diagnostic observations set `enabled: false` in their gate policy or declare
+`measurementKind: "diagnostic"`. They appear in reports, but their impact is
+rendered as `diagnostic` and they are excluded from actionable impact charts.
+
+## Data Flow
+
+```text
+probe execution
+ -> measurements.json artifact
+ -> comparison engine
+ -> PR summary/comment + SVG asset
+ -> optional branch-protection gate
+```
+
+The artifact is the source of truth. OTEL traces and host context are evidence
+attachments, not the canonical numeric store. PR comments are projections of
+the artifact and can be regenerated.
+
+## Wall-Clock Soundness
+
+Wall-clock timings on CI runners are noisy, often non-normal, and affected by
+load, caches, CPU frequency, storage, network fetches, and process scheduling.
+For merge-blocking use, same-run paired measurement is required:
+
+```text
+base warmup
+head warmup
+base sample 1
+head sample 1
+head sample 2
+base sample 2
+...
+```
+
+The comparison operates on per-pair deltas. A wall-clock row becomes gateable
+only when the configured minimum paired sample count is present. Until then,
+the row is partial/advisory even if the historical raw delta is large.
+
+## Deterministic Measurements
+
+Nix closure size and source shape are not statistical performance probes. They
+should use explicit budgets and semantic buckets. A closure-size regression is
+actionable because the same installable and lock graph should produce a stable
+closure. Source-shape growth is an architecture signal and should remain
+advisory unless a repo defines an explicit owner-approved budget.
+
+## Visualization
+
+Reports must distinguish raw movement from actionable evidence.
+
+- Raw delta and percentage are always shown.
+- Actionable impact is only shown for gateable rows.
+- Diagnostic rows render as `diagnostic`, not `0.00x`.
+- Non-gateable paired wall-clock rows render as needing paired evidence.
+
+This prevents a large historical wall-clock delta from looking like a proven
+PR regression when the measurement lacks causal evidence.
diff --git a/genie/ci-scripts/ci-measurement-comparison.test.sh b/genie/ci-scripts/ci-measurement-comparison.test.sh
index 02b4ac0fd..93eca6a9e 100755
--- a/genie/ci-scripts/ci-measurement-comparison.test.sh
+++ b/genie/ci-scripts/ci-measurement-comparison.test.sh
@@ -66,6 +66,7 @@ run_compare() {
}
policy='{"enabled":true,"minBaselineSources":1,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":0.5,"noiseFloor":0.1}'
+paired_policy='{"enabled":true,"comparisonMode":"paired","minBaselineSources":1,"minCurrentSamples":5,"minPairedSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":0.5,"noiseFloor":0.1}'
emit_compare_script
rm -rf "$tmp_dir/current" "$tmp_dir/baseline"
@@ -106,6 +107,21 @@ if [ "$actual_status" != "fail" ] || [ "$actual_row" != "fail" ] || [ "$actual_e
exit 1
fi
+rm -rf "$tmp_dir/current" "$tmp_dir/baseline"
+write_measurement "$tmp_dir/current/measurements.json" 13 devenv-perf-warm-median-v2 "$paired_policy"
+write_measurement "$tmp_dir/baseline/run-1/measurements.json" 10 devenv-perf-warm-median-v2 "$paired_policy"
+run_compare
+actual_status="$(jq -r '.status' "$tmp_dir/comparison.json")"
+actual_row="$(jq -r '.comparisons[] | .status' "$tmp_dir/comparison.json")"
+actual_gate="$(jq -r '.comparisons[] | .gateReason' "$tmp_dir/comparison.json")"
+actual_confidence="$(jq -r '.comparisons[] | .confidence' "$tmp_dir/comparison.json")"
+actual_enforceable="$(jq -r '.readiness.enforceable' "$tmp_dir/comparison.json")"
+actual_low_paired="$(jq -r '.readiness.lowPairedSampleCount' "$tmp_dir/comparison.json")"
+if [ "$actual_status" != "partial" ] || [ "$actual_row" != "pass" ] || [ "$actual_gate" != "low_paired_sample_count" ] || [ "$actual_confidence" != "low_paired_sample_count" ] || [ "$actual_enforceable" != "false" ] || [ "$actual_low_paired" != "1" ]; then
+ echo "expected paired wall-clock policy without paired evidence to be partial/non-enforceable; got status=$actual_status row=$actual_row gate=$actual_gate confidence=$actual_confidence enforceable=$actual_enforceable lowPaired=$actual_low_paired" >&2
+ exit 1
+fi
+
rm -rf "$tmp_dir/current" "$tmp_dir/baseline"
write_measurement "$tmp_dir/current/run-1/measurements.json" 5.1 devenv-perf-warm-median-v2 "$policy"
write_measurement "$tmp_dir/current/run-2/measurements.json" 5.2 devenv-perf-warm-median-v2 "$policy"
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index 4720a8e79..848386a65 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -23,8 +23,10 @@ export type CiMeasurementDescriptor = {
export type CiMeasurementGatePolicy = {
readonly enabled?: boolean
+ readonly comparisonMode?: 'budget' | 'historical' | 'paired'
readonly minBaselineSources?: number
readonly minCurrentSamples?: number
+ readonly minPairedSamples?: number
readonly noiseFloor?: number
readonly statisticalToleranceRatio?: number
readonly statisticalToleranceAbs?: number
@@ -48,6 +50,7 @@ export type CiMeasurementObservation = {
readonly group?: string
readonly path?: readonly string[]
readonly description?: string
+ readonly measurementKind?: 'deterministic' | 'wall-clock' | 'diagnostic' | (string & {})
readonly name: string
readonly unit: CiMeasurementUnit
readonly value: number
@@ -62,6 +65,7 @@ export type CiMeasurementObservation = {
readonly p25?: number
readonly p75?: number
readonly p95?: number
+ readonly pairedSampleCount?: number
}
}
@@ -309,6 +313,8 @@ const defaultDevenvPerfGatePolicy = (probeId: string): CiMeasurementGatePolicy =
if (probeId === 'tasks_list' || probeId === 'processes_help') {
return {
enabled: true,
+ comparisonMode: 'paired',
+ minPairedSamples: 7,
minBaselineSources: 10,
minCurrentSamples: 5,
warnRatio: 1.25,
@@ -322,6 +328,8 @@ const defaultDevenvPerfGatePolicy = (probeId: string): CiMeasurementGatePolicy =
}
return {
enabled: true,
+ comparisonMode: 'paired',
+ minPairedSamples: 5,
minBaselineSources: 10,
minCurrentSamples: 5,
warnRatio: 1.1,
@@ -731,6 +739,7 @@ jq -n \
label: .label,
group: .group,
description: .description,
+ measurementKind: (if (.gatePolicy.enabled == false) then "diagnostic" else "wall-clock" end),
name: ("devenv." + .id + ".duration"),
unit: "seconds",
value: (.durationMs / 1000),
@@ -1139,6 +1148,7 @@ jq -n \
label: (($bucket.label // $bucket.name) + " closure size"),
group: "nix closure buckets",
description: ("NAR size contributed by closure paths matching " + $bucket.pathRegex),
+ measurementKind: "deterministic",
unit: "bytes",
value: (
$closurePaths
@@ -1178,6 +1188,7 @@ jq -n \
group: "nix closure",
description: "Total NAR size for all paths in the resolved Nix closure.",
name: "nix.closure.nar_size",
+ measurementKind: "deterministic",
unit: "bytes",
value: $totalNarSize,
dimensions: { bucket: "total" }
@@ -1188,6 +1199,7 @@ jq -n \
group: "nix closure",
description: "Number of store paths in the resolved Nix closure.",
name: "nix.closure.path_count",
+ measurementKind: "deterministic",
unit: "count",
value: $pathCount,
dimensions: { bucket: "total" }
@@ -1324,6 +1336,7 @@ for (const scope of scopes) {
group,
path: scopePath,
description: 'Tracked non-binary source lines in the configured scope.',
+ measurementKind: 'deterministic',
name: 'source.lines',
unit: 'lines',
value: lineCount,
@@ -1337,6 +1350,7 @@ for (const scope of scopes) {
group,
path: scopePath,
description: 'Tracked non-binary source files in the configured scope.',
+ measurementKind: 'deterministic',
name: 'source.files',
unit: 'count',
value: measuredFileCount,
@@ -1535,6 +1549,7 @@ jq -n \
| {
target: ($items[0].target // {}),
observation: ($items[-1].observation // {}),
+ measurementKind: ($items[-1].observation.measurementKind // null),
value: $median,
min: ($values | min),
max: ($values | max),
@@ -1544,6 +1559,7 @@ jq -n \
mad: ($values | map(. - $median | if . < 0 then -. else . end) | median),
sourceCount: ($items | length),
sampleCount: $sampleCount,
+ pairedSampleCount: ($items | map(.observation.statistics.pairedSampleCount // .observation.comparison.pairedSampleCount // 0) | add // 0),
generatedAt: ($items[-1].generatedAt // null)
};
@@ -1571,8 +1587,10 @@ jq -n \
| noise_floor($metric; $unit) as $noise
| $b + {
enabled:true,
- minBaselineSources:(if $metric == "nix.closure.nar_size" or $metric == "nix.closure.bucket.nar_size" or $metric == "nix.closure.path_count" then 3 else 10 end),
+ comparisonMode:(if $metric == "nix.closure.nar_size" or $metric == "nix.closure.bucket.nar_size" or $metric == "nix.closure.path_count" or $unit != "seconds" then "budget" else "historical" end),
+ minBaselineSources:(if $metric == "nix.closure.nar_size" or $metric == "nix.closure.bucket.nar_size" or $metric == "nix.closure.path_count" or $unit != "seconds" then 1 else 10 end),
minCurrentSamples:(if $unit == "seconds" then 3 else 1 end),
+ minPairedSamples:(if $unit == "seconds" then 5 else 0 end),
noiseFloor:$noise
};
def observation_policy($obs):
@@ -1580,8 +1598,9 @@ jq -n \
def policy_enabled($policy):
if ($policy | has("enabled")) then $policy.enabled else true end;
- def classify($metric; $unit; $policy; $current; $currentP25; $currentP75; $currentMad; $baseline; $baselineMin; $baselineMax; $baselineP25; $baselineP75; $baselineP95; $baselineMad; $currentSamples; $baselineSources):
+ def classify($metric; $unit; $measurementKind; $policy; $current; $currentP25; $currentP75; $currentMad; $baseline; $baselineMin; $baselineMax; $baselineP25; $baselineP75; $baselineP95; $baselineMad; $currentSamples; $baselineSources; $pairedSamples):
$policy as $b
+ | ($policy.comparisonMode // (if $measurementKind == "deterministic" or $unit != "seconds" then "budget" elif $measurementKind == "diagnostic" then "diagnostic" else "historical" end)) as $comparisonMode
| ($policy.noiseFloor // noise_floor($metric; $unit)) as $noise
| ($current - $baseline) as $delta
| (if $baseline > 0 then ($current / $baseline) else null end) as $ratio
@@ -1629,12 +1648,14 @@ jq -n \
and $baseline > 0
and $baselineSources >= ($policy.minBaselineSources // 1)
and $currentSamples >= ($policy.minCurrentSamples // 1)
+ and (if $comparisonMode == "paired" then $pairedSamples >= ($policy.minPairedSamples // 1) else true end)
) as $gateable
| (
if (policy_enabled($policy) != true) then "disabled"
elif $baseline <= 0 then "missing_baseline"
elif $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count"
+ elif $comparisonMode == "paired" and $pairedSamples < ($policy.minPairedSamples // 1) then "low_paired_sample_count"
else "eligible"
end
) as $gateReason
@@ -1644,7 +1665,8 @@ jq -n \
elif ($delta | abs_value) <= $noise then "noise_floor"
elif $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count"
- elif ($thresholdStatus != "pass" and $withinRobustBand) then "within_robust_band"
+ elif $comparisonMode == "paired" and $pairedSamples < ($policy.minPairedSamples // 1) then "low_paired_sample_count"
+ elif ($comparisonMode == "historical" and $thresholdStatus != "pass" and $withinRobustBand) then "within_robust_band"
elif $thresholdStatus == "pass" then "within_budget"
else "threshold_exceeded"
end
@@ -1684,7 +1706,7 @@ jq -n \
else "improvement"
end
) as $semanticImpactKind
- | {status:$status,current:$current,baseline:$baseline,delta:$delta,ratio:$ratio,budget:$b,gatePolicy:$policy,gateable:$gateable,gateReason:$gateReason,confidence:$confidence,direction:$direction,semanticImpactScore:$semanticImpactScore,semanticImpactKind:$semanticImpactKind,semanticWarnBudget:$warnBudget,semanticFailBudget:$failBudget,baselineRobustLower:$robustLower,baselineRobustUpper:$robustUpper,baselineRobustTolerance:$robustTolerance,currentRobustLower:$currentRobustLower,currentRobustUpper:$currentRobustUpper,currentRobustTolerance:$currentRobustTolerance,withinBaselineRange:$withinBaselineRange};
+ | {status:$status,current:$current,baseline:$baseline,delta:$delta,ratio:$ratio,budget:$b,gatePolicy:$policy,comparisonMode:$comparisonMode,gateable:$gateable,gateReason:$gateReason,confidence:$confidence,direction:$direction,semanticImpactScore:$semanticImpactScore,semanticImpactKind:$semanticImpactKind,semanticWarnBudget:$warnBudget,semanticFailBudget:$failBudget,baselineRobustLower:$robustLower,baselineRobustUpper:$robustUpper,baselineRobustTolerance:$robustTolerance,currentRobustLower:$currentRobustLower,currentRobustUpper:$currentRobustUpper,currentRobustTolerance:$currentRobustTolerance,withinBaselineRange:$withinBaselineRange,pairedSamples:$pairedSamples};
(observations_by_key($current[0]) | with_entries(.value = observation_stats(.value))) as $currentObs
| (observations_by_key($baseline[0]) | with_entries(.value = observation_stats(.value))) as $baselineObs
@@ -1716,6 +1738,7 @@ jq -n \
classify(
$currentValue.observation.name;
$currentValue.observation.unit;
+ ($currentValue.observation.measurementKind // $currentValue.measurementKind);
($currentValue.observation | observation_policy(.));
$currentValue.value;
$currentValue.p25;
@@ -1729,7 +1752,8 @@ jq -n \
$baselineValue.p95;
$baselineValue.mad;
$currentValue.sampleCount;
- $baselineValue.sourceCount
+ $baselineValue.sourceCount;
+ $currentValue.pairedSampleCount
) + {
target: $currentValue.target,
observation: $currentValue.observation,
@@ -1755,7 +1779,8 @@ jq -n \
(if (.gatePolicy | has("enabled")) then .gatePolicy.enabled else true end)
and (.gateReason == "missing_baseline"
or .gateReason == "low_baseline_count"
- or .gateReason == "low_current_sample_count")
+ or .gateReason == "low_current_sample_count"
+ or .gateReason == "low_paired_sample_count")
) then "partial"
else "pass"
end
@@ -1767,7 +1792,8 @@ jq -n \
gateableCount: (map(select(.gateable == true)) | length),
missingBaselineCount: (map(select(.gateReason == "missing_baseline")) | length),
lowBaselineCount: (map(select(.gateReason == "low_baseline_count")) | length),
- lowCurrentSampleCount: (map(select(.gateReason == "low_current_sample_count")) | length)
+ lowCurrentSampleCount: (map(select(.gateReason == "low_current_sample_count")) | length),
+ lowPairedSampleCount: (map(select(.gateReason == "low_paired_sample_count")) | length)
}
| . + {
nonGateableCount: (.enabledCount - .gateableCount),
@@ -2062,6 +2088,12 @@ const interpretation = (row) => {
tone: 'neutral',
color: '#94a3b8',
}
+ if (row.confidence === 'low_paired_sample_count') return {
+ label: 'Needs paired evidence',
+ detail: 'Wall-clock gates require same-run base/head samples before they can block merges.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
if (row.confidence === 'diagnostic') return {
label: 'Diagnostic only',
detail: 'Shown for investigation, but intentionally excluded from gating.',
diff --git a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
index a76233b0a..ce49cc955 100644
--- a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
+++ b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
@@ -415,6 +415,7 @@ describe('ci workflow devenv perf helpers', () => {
expect(generatedCiWorkflowYamlSource).toContain('baselineSources')
expect(generatedCiWorkflowYamlSource).toContain('low_baseline_count')
expect(generatedCiWorkflowYamlSource).toContain('low_current_sample_count')
+ expect(generatedCiWorkflowYamlSource).toContain('low_paired_sample_count')
expect(generatedCiWorkflowYamlSource).toContain('readiness:$readiness')
expect(generatedCiWorkflowYamlSource).toContain(
'enforceable: (.enabledCount == .gateableCount)',
@@ -430,10 +431,11 @@ describe('ci workflow devenv perf helpers', () => {
'if ($gateable and $confidence == "threshold_exceeded") then $thresholdStatus',
)
expect(generatedCiWorkflowYamlSource).toContain(
- 'elif ($thresholdStatus != "pass" and $withinRobustBand) then "within_robust_band"',
+ 'elif ($comparisonMode == "historical" and $thresholdStatus != "pass" and $withinRobustBand) then "within_robust_band"',
)
expect(ciWorkflowSource).toContain("label: 'Needs more baseline'")
expect(ciWorkflowSource).toContain("label: 'Needs repeat'")
+ expect(ciWorkflowSource).toContain("label: 'Needs paired evidence'")
expect(ciWorkflowSource).toContain("label: 'Too small to matter'")
expect(ciWorkflowSource).toContain("label: 'Within noise band'")
expect(ciWorkflowSource).toContain("label: 'Meaningfully lower'")
From 9c668e7129c3cb7ded3b5079cb88c81200611b72 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Tue, 19 May 2026 07:37:18 +0200
Subject: [PATCH 39/81] Format measurement architecture doc
---
context/ci-measurements.md | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/context/ci-measurements.md b/context/ci-measurements.md
index 4c0fb4999..4011eb7d7 100644
--- a/context/ci-measurements.md
+++ b/context/ci-measurements.md
@@ -8,11 +8,11 @@ Active.
## Measurement Classes
-| Class | Examples | Primary Question | Gate Model |
-| --- | --- | --- | --- |
-| `deterministic` | Nix closure size, source lines, file counts | Did a structural quantity exceed its budget? | Budget/diff against a comparable baseline. |
-| `wall-clock` | Devenv shell eval, task runtime, CLI command latency | Did this PR make this operation slower on the same runner conditions? | Paired same-run base/head samples before merge blocking. |
-| `diagnostic` | OTEL-traced shell eval, host context, trace breakdowns | Where did time go? | Never merge-blocking; explains measurements. |
+| Class | Examples | Primary Question | Gate Model |
+| --------------- | ------------------------------------------------------ | --------------------------------------------------------------------- | -------------------------------------------------------- |
+| `deterministic` | Nix closure size, source lines, file counts | Did a structural quantity exceed its budget? | Budget/diff against a comparable baseline. |
+| `wall-clock` | Devenv shell eval, task runtime, CLI command latency | Did this PR make this operation slower on the same runner conditions? | Paired same-run base/head samples before merge blocking. |
+| `diagnostic` | OTEL-traced shell eval, host context, trace breakdowns | Where did time go? | Never merge-blocking; explains measurements. |
The class is part of the observation contract through `measurementKind`.
The comparison policy is part of the gate contract through `comparisonMode`.
From 52004dde2a733dfdb425e6d63201478828a88634 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Tue, 19 May 2026 07:46:56 +0200
Subject: [PATCH 40/81] Produce paired wall-clock measurement evidence
---
.github/workflows/ci.yml | 290 +++++++++++++++---
context/ci-measurements.md | 5 +
.../ci-measurement-comparison.test.sh | 17 +
genie/ci-workflow/measurements.ts | 246 +++++++++++++--
4 files changed, 476 insertions(+), 82 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 910d8f819..7ec7f9269 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2625,6 +2625,43 @@ jobs:
run: |
set -euo pipefail
+ ARTIFACT_DIR="$(mkdir -p "$ARTIFACT_DIR" && cd "$ARTIFACT_DIR" && pwd -P)"
+ CI_MEASUREMENT_HEAD_DIR="${CI_MEASUREMENT_HEAD_DIR:-$PWD}"
+ CI_MEASUREMENT_BASE_DIR="${CI_MEASUREMENT_BASE_DIR:-${RUNNER_TEMP:-/tmp}/ci-measurement-base}"
+ CI_MEASUREMENT_PAIRED_ENABLED=0
+
+ prepare_paired_base_worktree() { # Check out the PR base commit as a detached worktree at $CI_MEASUREMENT_BASE_DIR; sets CI_MEASUREMENT_PAIRED_ENABLED=1 only when fetch and worktree add both succeed.
+ if [ "${GITHUB_EVENT_NAME:-}" != "pull_request" ]; then # pairing is only attempted for pull_request events
+ return 0
+ fi
+ if [ -n "${CI_MEASUREMENT_ALLOW_PROBE_FAILURES:-}" ]; then # any non-empty value skips pairing
+ return 0
+ fi
+ if [ ! -f "${GITHUB_EVENT_PATH:-}" ]; then # no event payload file to read the base SHA from
+ return 0
+ fi
+
+ local base_sha # declared separately from the assignment so `local` does not mask jq's exit status
+ base_sha="$(jq -r '.pull_request.base.sha // empty' "$GITHUB_EVENT_PATH")" # `// empty` gives an empty string when the field is missing or null
+ if [ -z "$base_sha" ]; then
+ echo "::notice::paired wall-clock baseline unavailable: pull_request.base.sha missing"
+ return 0
+ fi
+
+ rm -rf "$CI_MEASUREMENT_BASE_DIR" # clear whatever is at the target path before adding the worktree
+ git worktree prune >/dev/null 2>&1 || true # best effort: a prune failure is ignored
+ if git fetch --no-tags --depth=1 origin "$base_sha" \
+ && git worktree add --detach "$CI_MEASUREMENT_BASE_DIR" "$base_sha" >/dev/null; then # only stdout is discarded; git's stderr still reaches the job log
+ CI_MEASUREMENT_PAIRED_ENABLED=1
+ echo "::notice::paired wall-clock baseline prepared at $CI_MEASUREMENT_BASE_DIR ($base_sha)"
+ else
+ echo "::warning::paired wall-clock baseline unavailable: failed to prepare base worktree $base_sha"
+ CI_MEASUREMENT_PAIRED_ENABLED=0 # leave pairing disabled; the failure is reported as a warning, not an error
+ fi
+ }
+
+ prepare_paired_base_worktree
+
mkdir -p "$ARTIFACT_DIR/traces"
{
@@ -2684,8 +2721,22 @@ jobs:
--arg trace "$trace" \
--argjson gatePolicy "$gate_policy" \
'($samples[0] // []) as $sampleList
- | ($sampleList | map(select(.phase != "warmup" and .status == 0) | .durationMs)) as $successfulDurations
- | ($sampleList | map(select(.phase == "warmup"))) as $warmupSamples
+ | ($sampleList | map(select((.subject // "head") == "head" and .phase != "warmup" and .status == 0) | .durationMs)) as $successfulDurations
+ | ($sampleList | map(select((.subject // "head") == "head" and .phase == "warmup"))) as $warmupSamples
+ | ($sampleList | map(select((.subject // "head") == "head" and .phase == "measured" and .status == 0 and .pairIndex != null))) as $headSamples
+ | ($sampleList | map(select(.subject == "base" and .phase == "measured" and .status == 0 and .pairIndex != null))) as $baseSamples
+ | (
+ $headSamples
+ | map(. as $head | $baseSamples[]? | select(.pairIndex == $head.pairIndex) | {
+ pairIndex: $head.pairIndex,
+ currentDurationMs: $head.durationMs,
+ baselineDurationMs: .durationMs,
+ deltaMs: ($head.durationMs - .durationMs)
+ })
+ ) as $pairedSamples
+ | ($pairedSamples | map(.currentDurationMs)) as $pairedCurrentDurations
+ | ($pairedSamples | map(.baselineDurationMs)) as $pairedBaselineDurations
+ | ($pairedSamples | map(.deltaMs)) as $pairedDeltaDurations
| {
id:$id,
name:$id,
@@ -2701,11 +2752,19 @@ jobs:
statistics: {
sampleCount: ($sampleList | length),
warmupCount: ($warmupSamples | length),
- measuredSampleCount: (($sampleList | length) - ($warmupSamples | length)),
+ measuredSampleCount: (
+ $sampleList
+ | map(select((.subject // "head") == "head" and .phase != "warmup"))
+ | length
+ ),
successfulSampleCount: ($successfulDurations | length),
minDurationMs: ($successfulDurations | min),
maxDurationMs: ($successfulDurations | max),
- medianDurationMs: $durationMs
+ medianDurationMs: $durationMs,
+ pairedSampleCount: ($pairedSamples | length),
+ pairedCurrentMedianDurationMs: (if ($pairedCurrentDurations | length) == 0 then null else ($pairedCurrentDurations | sort | .[(length - 1) / 2 | floor]) end),
+ pairedBaselineMedianDurationMs: (if ($pairedBaselineDurations | length) == 0 then null else ($pairedBaselineDurations | sort | .[(length - 1) / 2 | floor]) end),
+ pairedDeltaMedianDurationMs: (if ($pairedDeltaDurations | length) == 0 then null else ($pairedDeltaDurations | sort | .[(length - 1) / 2 | floor]) end)
},
samples:$sampleList
}' \
@@ -2760,8 +2819,6 @@ jobs:
fi
fi
- started="$(date +%s%3N)"
- set +e
expanded=()
for arg in "$@"; do
case "$arg" in
@@ -2783,7 +2840,43 @@ jobs:
*) expanded+=("$arg") ;;
esac
done
- "${expanded[@]}" >"$sample_stdout" 2>"$sample_stderr"
+
+ local base_ran_before_head=0 base_stdout base_stderr base_started base_ended base_status base_duration_ms
+ if [ "$phase" = "measured" ] && [ "$CI_MEASUREMENT_PAIRED_ENABLED" -eq 1 ] && [ $((measured_index % 2)) -eq 0 ]; then
+ base_ran_before_head=1
+ base_stdout="$ARTIFACT_DIR/$id.$sample_index.base.stdout"
+ base_stderr="$ARTIFACT_DIR/$id.$sample_index.base.stderr"
+ base_started="$(date +%s%3N)"
+ set +e
+ (cd "$CI_MEASUREMENT_BASE_DIR" && "${expanded[@]}") >"$base_stdout" 2>"$base_stderr"
+ base_status=$?
+ set -e
+ base_ended="$(date +%s%3N)"
+ base_duration_ms=$((base_ended - base_started))
+
+ if [ "$sample_first" -eq 0 ]; then
+ printf ',' >>"$samples_file"
+ fi
+ sample_first=0
+ jq -cn \
+ --argjson index "$sample_index" \
+ --arg measuredIndex "$measured_index" \
+ --argjson status "$base_status" \
+ --argjson durationMs "$base_duration_ms" \
+ --arg stdout "$base_stdout" \
+ --arg stderr "$base_stderr" \
+ '{index:$index,measuredIndex:($measuredIndex | tonumber),pairIndex:($measuredIndex | tonumber),subject:"base",phase:"measured",status:$status,durationMs:$durationMs,stdout:$stdout,stderr:$stderr,trace:null,order:"base-head"}' \
+ >>"$samples_file"
+
+ if [ "$base_status" -ne 0 ]; then
+ echo "::warning::$id paired baseline sample $measured_index failed after ${base_duration_ms}ms; this pair is excluded from wall-clock gating"
+ tail -40 "$base_stderr" || true
+ fi
+ fi
+
+ started="$(date +%s%3N)"
+ set +e
+ (cd "$CI_MEASUREMENT_HEAD_DIR" && "${expanded[@]}") >"$sample_stdout" 2>"$sample_stderr"
status=$?
set -e
ended="$(date +%s%3N)"
@@ -2802,9 +2895,38 @@ jobs:
--arg stdout "$sample_stdout" \
--arg stderr "$sample_stderr" \
--arg trace "$sample_trace" \
- '{index:$index,measuredIndex:(if $measuredIndex == "" then null else ($measuredIndex | tonumber) end),phase:$phase,status:$status,durationMs:$durationMs,stdout:$stdout,stderr:$stderr,trace:(if $trace == "" then null else $trace end)}' \
+ --arg order "$(if [ "$phase" = "measured" ] && [ "$base_ran_before_head" -eq 1 ]; then printf base-head; else printf head-base; fi)" \
+ '{index:$index,measuredIndex:(if $measuredIndex == "" then null else ($measuredIndex | tonumber) end),pairIndex:(if $measuredIndex == "" then null else ($measuredIndex | tonumber) end),subject:"head",phase:$phase,status:$status,durationMs:$durationMs,stdout:$stdout,stderr:$stderr,trace:(if $trace == "" then null else $trace end),order:(if $phase == "measured" then $order else null end)}' \
>>"$samples_file"
+ if [ "$phase" = "measured" ] && [ "$status" -eq 0 ] && [ "$CI_MEASUREMENT_PAIRED_ENABLED" -eq 1 ] && [ "$base_ran_before_head" -eq 0 ]; then
+ base_stdout="$ARTIFACT_DIR/$id.$sample_index.base.stdout"
+ base_stderr="$ARTIFACT_DIR/$id.$sample_index.base.stderr"
+ base_started="$(date +%s%3N)"
+ set +e
+ (cd "$CI_MEASUREMENT_BASE_DIR" && "${expanded[@]}") >"$base_stdout" 2>"$base_stderr"
+ base_status=$?
+ set -e
+ base_ended="$(date +%s%3N)"
+ base_duration_ms=$((base_ended - base_started))
+
+ printf ',' >>"$samples_file"
+ jq -cn \
+ --argjson index "$sample_index" \
+ --arg measuredIndex "$measured_index" \
+ --argjson status "$base_status" \
+ --argjson durationMs "$base_duration_ms" \
+ --arg stdout "$base_stdout" \
+ --arg stderr "$base_stderr" \
+ '{index:$index,measuredIndex:($measuredIndex | tonumber),pairIndex:($measuredIndex | tonumber),subject:"base",phase:"measured",status:$status,durationMs:$durationMs,stdout:$stdout,stderr:$stderr,trace:null,order:"head-base"}' \
+ >>"$samples_file"
+
+ if [ "$base_status" -ne 0 ]; then
+ echo "::warning::$id paired baseline sample $measured_index failed after ${base_duration_ms}ms; this pair is excluded from wall-clock gating"
+ tail -40 "$base_stderr" || true
+ fi
+ fi
+
stdout="$sample_stdout"
stderr="$sample_stderr"
trace_file="$sample_trace"
@@ -2815,8 +2937,8 @@ jobs:
done
printf ']\n' >>"$samples_file"
- status="$(jq -r 'map(.status) | max // 0' "$samples_file")"
- duration_ms="$(jq -r 'map(select(.phase != "warmup" and .status == 0) | .durationMs) as $values | if ($values | length) == 0 then (map(.durationMs) | max // 0) else ($values | sort | .[(length - 1) / 2 | floor]) end' "$samples_file")"
+ status="$(jq -r 'map(select((.subject // "head") == "head") | .status) | max // 0' "$samples_file")"
+ duration_ms="$(jq -r 'map(select((.subject // "head") == "head" and .phase != "warmup" and .status == 0) | .durationMs) as $values | if ($values | length) == 0 then (map(select((.subject // "head") == "head") | .durationMs) | max // 0) else ($values | sort | .[(length - 1) / 2 | floor]) end' "$samples_file")"
cp "$stdout" "$ARTIFACT_DIR/$id.stdout" 2>/dev/null || true
cp "$stderr" "$ARTIFACT_DIR/$id.stderr" 2>/dev/null || true
@@ -2932,16 +3054,45 @@ jobs:
measurementKind: (if (.gatePolicy.enabled == false) then "diagnostic" else "wall-clock" end),
name: ("devenv." + .id + ".duration"),
unit: "seconds",
- value: (.durationMs / 1000),
- policy: .gatePolicy,
- statistics: {
+ value: (.durationMs / 1000),
+ policy: .gatePolicy,
+ comparison: {
+ mode: (.gatePolicy.comparisonMode // "historical"),
+ pairedSampleCount: (.statistics.pairedSampleCount // 0),
+ baseline: (
+ if (.statistics.pairedBaselineMedianDurationMs // null) == null
+ then null
+ else (.statistics.pairedBaselineMedianDurationMs / 1000)
+ end
+ )
+ },
+ statistics: {
sampleCount: (.statistics.sampleCount // 1),
warmupCount: (.statistics.warmupCount // 0),
measuredSampleCount: (.statistics.measuredSampleCount // (.statistics.sampleCount // 1)),
successfulSampleCount: (.statistics.successfulSampleCount // (if .status == 0 then 1 else 0 end)),
min: ((.statistics.minDurationMs // .durationMs) / 1000),
max: ((.statistics.maxDurationMs // .durationMs) / 1000),
- median: ((.statistics.medianDurationMs // .durationMs) / 1000)
+ median: ((.statistics.medianDurationMs // .durationMs) / 1000),
+ pairedSampleCount: (.statistics.pairedSampleCount // 0),
+ pairedCurrentMedian: (
+ if (.statistics.pairedCurrentMedianDurationMs // null) == null
+ then null
+ else (.statistics.pairedCurrentMedianDurationMs / 1000)
+ end
+ ),
+ pairedBaselineMedian: (
+ if (.statistics.pairedBaselineMedianDurationMs // null) == null
+ then null
+ else (.statistics.pairedBaselineMedianDurationMs / 1000)
+ end
+ ),
+ pairedDeltaMedian: (
+ if (.statistics.pairedDeltaMedianDurationMs // null) == null
+ then null
+ else (.statistics.pairedDeltaMedianDurationMs / 1000)
+ end
+ )
},
dimensions: {
probe: .id,
@@ -2950,6 +3101,13 @@ jobs:
sampleCount: (.statistics.sampleCount // 1),
warmupCount: (.statistics.warmupCount // 0),
measuredSampleCount: (.statistics.measuredSampleCount // (.statistics.sampleCount // 1)),
+ pairedSampleCount: (.statistics.pairedSampleCount // 0),
+ pairedOrderProtocol: (
+ if (.statistics.pairedSampleCount // 0) > 0
+ then "alternating-head-base"
+ else null
+ end
+ ),
measurementProtocol: "devenv-perf-warm-median-v2",
aggregation: "median",
phase: "warm",
@@ -3099,6 +3257,7 @@ jobs:
def observation_stats($items):
($items | map(.observation.value)) as $values
+ | ($items | map(.observation.comparison.baseline // empty)) as $pairedBaselineValues
| ($items | map(.observation.statistics.measuredSampleCount // .observation.statistics.sampleCount // 1) | add // ($items | length)) as $sampleCount
| ($values | median) as $median
| {
@@ -3115,6 +3274,7 @@ jobs:
sourceCount: ($items | length),
sampleCount: $sampleCount,
pairedSampleCount: ($items | map(.observation.statistics.pairedSampleCount // .observation.comparison.pairedSampleCount // 0) | add // 0),
+ pairedBaselineValue: (if ($pairedBaselineValues | length) == 0 then null else ($pairedBaselineValues | median) end),
generatedAt: ($items[-1].generatedAt // null)
};
@@ -3272,10 +3432,23 @@ jobs:
.key as $key
| .value as $currentValue
| ($baselineObs[$key] // null) as $baselineValue
+ | ($currentValue.observation | observation_policy(.)) as $policy
+ | ($policy.comparisonMode // (if ($currentValue.observation.measurementKind // $currentValue.measurementKind) == "deterministic" or ($currentValue.observation.unit // "") != "seconds" then "budget" elif ($currentValue.observation.measurementKind // $currentValue.measurementKind) == "diagnostic" then "diagnostic" else "historical" end)) as $comparisonMode
+ | ($currentValue.pairedBaselineValue // null) as $pairedBaselineValue
+ | (if $comparisonMode == "paired" and $pairedBaselineValue != null then {
+ value: $pairedBaselineValue,
+ min: $pairedBaselineValue,
+ max: $pairedBaselineValue,
+ p25: $pairedBaselineValue,
+ p75: $pairedBaselineValue,
+ p95: $pairedBaselineValue,
+ mad: 0,
+ sourceCount: $currentValue.pairedSampleCount
+ } else $baselineValue end) as $effectiveBaselineValue
| {
key: $key,
value: (
- if $baselineValue == null then
+ if $effectiveBaselineValue == null then
{
status: "missing_baseline",
target: $currentValue.target,
@@ -3283,7 +3456,8 @@ jobs:
current: $currentValue.value,
currentSamples: $currentValue.sampleCount,
baselineSources: 0,
- gatePolicy: ($currentValue.observation | observation_policy(.)),
+ gatePolicy: $policy,
+ comparisonMode: $comparisonMode,
gateable: false,
gateReason: "missing_baseline",
confidence: "missing_baseline",
@@ -3294,32 +3468,32 @@ jobs:
$currentValue.observation.name;
$currentValue.observation.unit;
($currentValue.observation.measurementKind // $currentValue.measurementKind);
- ($currentValue.observation | observation_policy(.));
+ $policy;
$currentValue.value;
$currentValue.p25;
$currentValue.p75;
$currentValue.mad;
- $baselineValue.value;
- $baselineValue.min;
- $baselineValue.max;
- $baselineValue.p25;
- $baselineValue.p75;
- $baselineValue.p95;
- $baselineValue.mad;
+ $effectiveBaselineValue.value;
+ $effectiveBaselineValue.min;
+ $effectiveBaselineValue.max;
+ $effectiveBaselineValue.p25;
+ $effectiveBaselineValue.p75;
+ $effectiveBaselineValue.p95;
+ $effectiveBaselineValue.mad;
$currentValue.sampleCount;
- $baselineValue.sourceCount;
+ $effectiveBaselineValue.sourceCount;
$currentValue.pairedSampleCount
) + {
target: $currentValue.target,
observation: $currentValue.observation,
currentSamples: $currentValue.sampleCount,
- baselineSources: $baselineValue.sourceCount,
- baselineMin: $baselineValue.min,
- baselineMax: $baselineValue.max,
- baselineP25: $baselineValue.p25,
- baselineP75: $baselineValue.p75,
- baselineP95: $baselineValue.p95
- ,baselineMad: $baselineValue.mad
+ baselineSources: $effectiveBaselineValue.sourceCount,
+ baselineMin: $effectiveBaselineValue.min,
+ baselineMax: $effectiveBaselineValue.max,
+ baselineP25: $effectiveBaselineValue.p25,
+ baselineP75: $effectiveBaselineValue.p75,
+ baselineP95: $effectiveBaselineValue.p95
+ ,baselineMad: $effectiveBaselineValue.mad
}
end
)
@@ -4765,6 +4939,7 @@ jobs:
def observation_stats($items):
($items | map(.observation.value)) as $values
+ | ($items | map(.observation.comparison.baseline // empty)) as $pairedBaselineValues
| ($items | map(.observation.statistics.measuredSampleCount // .observation.statistics.sampleCount // 1) | add // ($items | length)) as $sampleCount
| ($values | median) as $median
| {
@@ -4781,6 +4956,7 @@ jobs:
sourceCount: ($items | length),
sampleCount: $sampleCount,
pairedSampleCount: ($items | map(.observation.statistics.pairedSampleCount // .observation.comparison.pairedSampleCount // 0) | add // 0),
+ pairedBaselineValue: (if ($pairedBaselineValues | length) == 0 then null else ($pairedBaselineValues | median) end),
generatedAt: ($items[-1].generatedAt // null)
};
@@ -4938,10 +5114,23 @@ jobs:
.key as $key
| .value as $currentValue
| ($baselineObs[$key] // null) as $baselineValue
+ | ($currentValue.observation | observation_policy(.)) as $policy
+ | ($policy.comparisonMode // (if ($currentValue.observation.measurementKind // $currentValue.measurementKind) == "deterministic" or ($currentValue.observation.unit // "") != "seconds" then "budget" elif ($currentValue.observation.measurementKind // $currentValue.measurementKind) == "diagnostic" then "diagnostic" else "historical" end)) as $comparisonMode
+ | ($currentValue.pairedBaselineValue // null) as $pairedBaselineValue
+ | (if $comparisonMode == "paired" and $pairedBaselineValue != null then {
+ value: $pairedBaselineValue,
+ min: $pairedBaselineValue,
+ max: $pairedBaselineValue,
+ p25: $pairedBaselineValue,
+ p75: $pairedBaselineValue,
+ p95: $pairedBaselineValue,
+ mad: 0,
+ sourceCount: $currentValue.pairedSampleCount
+ } else $baselineValue end) as $effectiveBaselineValue
| {
key: $key,
value: (
- if $baselineValue == null then
+ if $effectiveBaselineValue == null then
{
status: "missing_baseline",
target: $currentValue.target,
@@ -4949,7 +5138,8 @@ jobs:
current: $currentValue.value,
currentSamples: $currentValue.sampleCount,
baselineSources: 0,
- gatePolicy: ($currentValue.observation | observation_policy(.)),
+ gatePolicy: $policy,
+ comparisonMode: $comparisonMode,
gateable: false,
gateReason: "missing_baseline",
confidence: "missing_baseline",
@@ -4960,32 +5150,32 @@ jobs:
$currentValue.observation.name;
$currentValue.observation.unit;
($currentValue.observation.measurementKind // $currentValue.measurementKind);
- ($currentValue.observation | observation_policy(.));
+ $policy;
$currentValue.value;
$currentValue.p25;
$currentValue.p75;
$currentValue.mad;
- $baselineValue.value;
- $baselineValue.min;
- $baselineValue.max;
- $baselineValue.p25;
- $baselineValue.p75;
- $baselineValue.p95;
- $baselineValue.mad;
+ $effectiveBaselineValue.value;
+ $effectiveBaselineValue.min;
+ $effectiveBaselineValue.max;
+ $effectiveBaselineValue.p25;
+ $effectiveBaselineValue.p75;
+ $effectiveBaselineValue.p95;
+ $effectiveBaselineValue.mad;
$currentValue.sampleCount;
- $baselineValue.sourceCount;
+ $effectiveBaselineValue.sourceCount;
$currentValue.pairedSampleCount
) + {
target: $currentValue.target,
observation: $currentValue.observation,
currentSamples: $currentValue.sampleCount,
- baselineSources: $baselineValue.sourceCount,
- baselineMin: $baselineValue.min,
- baselineMax: $baselineValue.max,
- baselineP25: $baselineValue.p25,
- baselineP75: $baselineValue.p75,
- baselineP95: $baselineValue.p95
- ,baselineMad: $baselineValue.mad
+ baselineSources: $effectiveBaselineValue.sourceCount,
+ baselineMin: $effectiveBaselineValue.min,
+ baselineMax: $effectiveBaselineValue.max,
+ baselineP25: $effectiveBaselineValue.p25,
+ baselineP75: $effectiveBaselineValue.p75,
+ baselineP95: $effectiveBaselineValue.p95
+ ,baselineMad: $effectiveBaselineValue.mad
}
end
)
diff --git a/context/ci-measurements.md b/context/ci-measurements.md
index 4011eb7d7..95a179c92 100644
--- a/context/ci-measurements.md
+++ b/context/ci-measurements.md
@@ -47,6 +47,11 @@ relative budgets. Historical variance is not treated as statistical evidence.
Wall-clock observations use `comparisonMode: "paired"` for enforced gates.
They need same-run base/head evidence before they can block a merge. Historical
baselines remain useful for trend context, but they do not prove PR causality.
+For PR runs, the wall-clock producer checks out the PR base commit in a
+separate git worktree and alternates the order within each measured pair
+(`head -> base`, then `base -> head`) to reduce cache and time-drift bias. The
+current artifact stores the paired baseline median and paired sample count,
+and the comparison engine gates `paired` observations on that embedded baseline.
Historical wall-clock comparison may be used as an advisory transition mode.
It can warn, visualize trends, and guide investigation, but it must not be the
diff --git a/genie/ci-scripts/ci-measurement-comparison.test.sh b/genie/ci-scripts/ci-measurement-comparison.test.sh
index 93eca6a9e..5f73bdb06 100755
--- a/genie/ci-scripts/ci-measurement-comparison.test.sh
+++ b/genie/ci-scripts/ci-measurement-comparison.test.sh
@@ -122,6 +122,23 @@ if [ "$actual_status" != "partial" ] || [ "$actual_row" != "pass" ] || [ "$actua
exit 1
fi
+rm -rf "$tmp_dir/current" "$tmp_dir/baseline"
+write_measurement "$tmp_dir/current/measurements.json" 13 devenv-perf-warm-median-v2 "$paired_policy"
+jq '.observations[0].comparison = { mode: "paired", baseline: 12.95, pairedSampleCount: 5 } | .observations[0].statistics.pairedSampleCount = 5' \
+ "$tmp_dir/current/measurements.json" >"$tmp_dir/current/measurements.updated.json"
+mv "$tmp_dir/current/measurements.updated.json" "$tmp_dir/current/measurements.json"
+write_measurement "$tmp_dir/baseline/run-1/measurements.json" 10 devenv-perf-warm-median-v2 "$paired_policy"
+run_compare
+actual_status="$(jq -r '.status' "$tmp_dir/comparison.json")"
+actual_row="$(jq -r '.comparisons[] | .status' "$tmp_dir/comparison.json")"
+actual_gate="$(jq -r '.comparisons[] | .gateReason' "$tmp_dir/comparison.json")"
+actual_baseline="$(jq -r '.comparisons[] | .baseline' "$tmp_dir/comparison.json")"
+actual_enforceable="$(jq -r '.readiness.enforceable' "$tmp_dir/comparison.json")"
+if [ "$actual_status" != "pass" ] || [ "$actual_row" != "pass" ] || [ "$actual_gate" != "eligible" ] || [ "$actual_baseline" != "12.95" ] || [ "$actual_enforceable" != "true" ]; then
+ echo "expected paired current artifact baseline to override historical baseline; got status=$actual_status row=$actual_row gate=$actual_gate baseline=$actual_baseline enforceable=$actual_enforceable" >&2
+ exit 1
+fi
+
rm -rf "$tmp_dir/current" "$tmp_dir/baseline"
write_measurement "$tmp_dir/current/run-1/measurements.json" 5.1 devenv-perf-warm-median-v2 "$policy"
write_measurement "$tmp_dir/current/run-2/measurements.json" 5.2 devenv-perf-warm-median-v2 "$policy"
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index 848386a65..6cb4bb3f0 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -56,6 +56,11 @@ export type CiMeasurementObservation = {
readonly value: number
readonly dimensions?: Record
readonly policy?: CiMeasurementGatePolicy
+ readonly comparison?: {
+ readonly mode?: 'budget' | 'historical' | 'paired' | (string & {})
+ readonly baseline?: number
+ readonly pairedSampleCount?: number
+ }
readonly statistics?: {
readonly sampleCount?: number
readonly measuredSampleCount?: number
@@ -66,6 +71,9 @@ export type CiMeasurementObservation = {
readonly p75?: number
readonly p95?: number
readonly pairedSampleCount?: number
+ readonly pairedBaselineMedian?: number
+ readonly pairedCurrentMedian?: number
+ readonly pairedDeltaMedian?: number
}
}
@@ -442,6 +450,43 @@ const renderDevenvPerfScript = (
return String.raw`set -euo pipefail
+ARTIFACT_DIR="$(mkdir -p "$ARTIFACT_DIR" && cd "$ARTIFACT_DIR" && pwd -P)"
+CI_MEASUREMENT_HEAD_DIR="${dollar}{CI_MEASUREMENT_HEAD_DIR:-$PWD}"
+CI_MEASUREMENT_BASE_DIR="${dollar}{CI_MEASUREMENT_BASE_DIR:-${dollar}{RUNNER_TEMP:-/tmp}/ci-measurement-base}"
+CI_MEASUREMENT_PAIRED_ENABLED=0
+
+prepare_paired_base_worktree() {
+ if [ "${dollar}{GITHUB_EVENT_NAME:-}" != "pull_request" ]; then
+ return 0
+ fi
+ if [ -n "${dollar}{CI_MEASUREMENT_ALLOW_PROBE_FAILURES:-}" ]; then
+ return 0
+ fi
+ if [ ! -f "${dollar}{GITHUB_EVENT_PATH:-}" ]; then
+ return 0
+ fi
+
+ local base_sha
+ base_sha="$(jq -r '.pull_request.base.sha // empty' "$GITHUB_EVENT_PATH")"
+ if [ -z "$base_sha" ]; then
+ echo "::notice::paired wall-clock baseline unavailable: pull_request.base.sha missing"
+ return 0
+ fi
+
+ rm -rf "$CI_MEASUREMENT_BASE_DIR"
+ git worktree prune >/dev/null 2>&1 || true
+ if git fetch --no-tags --depth=1 origin "$base_sha" \
+ && git worktree add --detach "$CI_MEASUREMENT_BASE_DIR" "$base_sha" >/dev/null; then
+ CI_MEASUREMENT_PAIRED_ENABLED=1
+ echo "::notice::paired wall-clock baseline prepared at $CI_MEASUREMENT_BASE_DIR ($base_sha)"
+ else
+ echo "::warning::paired wall-clock baseline unavailable: failed to prepare base worktree $base_sha"
+ CI_MEASUREMENT_PAIRED_ENABLED=0
+ fi
+}
+
+prepare_paired_base_worktree
+
mkdir -p "$ARTIFACT_DIR/traces"
{
@@ -501,8 +546,22 @@ json_append_timing() {
--arg trace "$trace" \
--argjson gatePolicy "$gate_policy" \
'($samples[0] // []) as $sampleList
- | ($sampleList | map(select(.phase != "warmup" and .status == 0) | .durationMs)) as $successfulDurations
- | ($sampleList | map(select(.phase == "warmup"))) as $warmupSamples
+ | ($sampleList | map(select((.subject // "head") == "head" and .phase != "warmup" and .status == 0) | .durationMs)) as $successfulDurations
+ | ($sampleList | map(select((.subject // "head") == "head" and .phase == "warmup"))) as $warmupSamples
+ | ($sampleList | map(select((.subject // "head") == "head" and .phase == "measured" and .status == 0 and .pairIndex != null))) as $headSamples
+ | ($sampleList | map(select(.subject == "base" and .phase == "measured" and .status == 0 and .pairIndex != null))) as $baseSamples
+ | (
+ $headSamples
+ | map(. as $head | $baseSamples[]? | select(.pairIndex == $head.pairIndex) | {
+ pairIndex: $head.pairIndex,
+ currentDurationMs: $head.durationMs,
+ baselineDurationMs: .durationMs,
+ deltaMs: ($head.durationMs - .durationMs)
+ })
+ ) as $pairedSamples
+ | ($pairedSamples | map(.currentDurationMs)) as $pairedCurrentDurations
+ | ($pairedSamples | map(.baselineDurationMs)) as $pairedBaselineDurations
+ | ($pairedSamples | map(.deltaMs)) as $pairedDeltaDurations
| {
id:$id,
name:$id,
@@ -518,11 +577,19 @@ json_append_timing() {
statistics: {
sampleCount: ($sampleList | length),
warmupCount: ($warmupSamples | length),
- measuredSampleCount: (($sampleList | length) - ($warmupSamples | length)),
+ measuredSampleCount: (
+ $sampleList
+ | map(select((.subject // "head") == "head" and .phase != "warmup"))
+ | length
+ ),
successfulSampleCount: ($successfulDurations | length),
minDurationMs: ($successfulDurations | min),
maxDurationMs: ($successfulDurations | max),
- medianDurationMs: $durationMs
+ medianDurationMs: $durationMs,
+ pairedSampleCount: ($pairedSamples | length),
+ pairedCurrentMedianDurationMs: (if ($pairedCurrentDurations | length) == 0 then null else ($pairedCurrentDurations | sort | .[(length - 1) / 2 | floor]) end),
+ pairedBaselineMedianDurationMs: (if ($pairedBaselineDurations | length) == 0 then null else ($pairedBaselineDurations | sort | .[(length - 1) / 2 | floor]) end),
+ pairedDeltaMedianDurationMs: (if ($pairedDeltaDurations | length) == 0 then null else ($pairedDeltaDurations | sort | .[(length - 1) / 2 | floor]) end)
},
samples:$sampleList
}' \
@@ -577,8 +644,6 @@ measure() {
fi
fi
- started="$(date +%s%3N)"
- set +e
expanded=()
for arg in "$@"; do
case "$arg" in
@@ -600,7 +665,43 @@ measure() {
*) expanded+=("$arg") ;;
esac
done
- "${dollar}{expanded[@]}" >"$sample_stdout" 2>"$sample_stderr"
+
+ local base_ran_before_head=0 base_stdout base_stderr base_started base_ended base_status base_duration_ms
+ if [ "$phase" = "measured" ] && [ "$CI_MEASUREMENT_PAIRED_ENABLED" -eq 1 ] && [ $((measured_index % 2)) -eq 0 ]; then
+ base_ran_before_head=1
+ base_stdout="$ARTIFACT_DIR/$id.$sample_index.base.stdout"
+ base_stderr="$ARTIFACT_DIR/$id.$sample_index.base.stderr"
+ base_started="$(date +%s%3N)"
+ set +e
+ (cd "$CI_MEASUREMENT_BASE_DIR" && "${dollar}{expanded[@]}") >"$base_stdout" 2>"$base_stderr"
+ base_status=$?
+ set -e
+ base_ended="$(date +%s%3N)"
+ base_duration_ms=$((base_ended - base_started))
+
+ if [ "$sample_first" -eq 0 ]; then
+ printf ',' >>"$samples_file"
+ fi
+ sample_first=0
+ jq -cn \
+ --argjson index "$sample_index" \
+ --arg measuredIndex "$measured_index" \
+ --argjson status "$base_status" \
+ --argjson durationMs "$base_duration_ms" \
+ --arg stdout "$base_stdout" \
+ --arg stderr "$base_stderr" \
+ '{index:$index,measuredIndex:($measuredIndex | tonumber),pairIndex:($measuredIndex | tonumber),subject:"base",phase:"measured",status:$status,durationMs:$durationMs,stdout:$stdout,stderr:$stderr,trace:null,order:"base-head"}' \
+ >>"$samples_file"
+
+ if [ "$base_status" -ne 0 ]; then
+ echo "::warning::$id paired baseline sample $measured_index failed after ${dollar}{base_duration_ms}ms; this pair is excluded from wall-clock gating"
+ tail -40 "$base_stderr" || true
+ fi
+ fi
+
+ started="$(date +%s%3N)"
+ set +e
+ (cd "$CI_MEASUREMENT_HEAD_DIR" && "${dollar}{expanded[@]}") >"$sample_stdout" 2>"$sample_stderr"
status=$?
set -e
ended="$(date +%s%3N)"
@@ -619,9 +720,38 @@ measure() {
--arg stdout "$sample_stdout" \
--arg stderr "$sample_stderr" \
--arg trace "$sample_trace" \
- '{index:$index,measuredIndex:(if $measuredIndex == "" then null else ($measuredIndex | tonumber) end),phase:$phase,status:$status,durationMs:$durationMs,stdout:$stdout,stderr:$stderr,trace:(if $trace == "" then null else $trace end)}' \
+ --arg order "$(if [ "$phase" = "measured" ] && [ "$base_ran_before_head" -eq 1 ]; then printf base-head; else printf head-base; fi)" \
+ '{index:$index,measuredIndex:(if $measuredIndex == "" then null else ($measuredIndex | tonumber) end),pairIndex:(if $measuredIndex == "" then null else ($measuredIndex | tonumber) end),subject:"head",phase:$phase,status:$status,durationMs:$durationMs,stdout:$stdout,stderr:$stderr,trace:(if $trace == "" then null else $trace end),order:(if $phase == "measured" then $order else null end)}' \
>>"$samples_file"
+ if [ "$phase" = "measured" ] && [ "$status" -eq 0 ] && [ "$CI_MEASUREMENT_PAIRED_ENABLED" -eq 1 ] && [ "$base_ran_before_head" -eq 0 ]; then
+ base_stdout="$ARTIFACT_DIR/$id.$sample_index.base.stdout"
+ base_stderr="$ARTIFACT_DIR/$id.$sample_index.base.stderr"
+ base_started="$(date +%s%3N)"
+ set +e
+ (cd "$CI_MEASUREMENT_BASE_DIR" && "${dollar}{expanded[@]}") >"$base_stdout" 2>"$base_stderr"
+ base_status=$?
+ set -e
+ base_ended="$(date +%s%3N)"
+ base_duration_ms=$((base_ended - base_started))
+
+ printf ',' >>"$samples_file"
+ jq -cn \
+ --argjson index "$sample_index" \
+ --arg measuredIndex "$measured_index" \
+ --argjson status "$base_status" \
+ --argjson durationMs "$base_duration_ms" \
+ --arg stdout "$base_stdout" \
+ --arg stderr "$base_stderr" \
+ '{index:$index,measuredIndex:($measuredIndex | tonumber),pairIndex:($measuredIndex | tonumber),subject:"base",phase:"measured",status:$status,durationMs:$durationMs,stdout:$stdout,stderr:$stderr,trace:null,order:"head-base"}' \
+ >>"$samples_file"
+
+ if [ "$base_status" -ne 0 ]; then
+ echo "::warning::$id paired baseline sample $measured_index failed after ${dollar}{base_duration_ms}ms; this pair is excluded from wall-clock gating"
+ tail -40 "$base_stderr" || true
+ fi
+ fi
+
stdout="$sample_stdout"
stderr="$sample_stderr"
trace_file="$sample_trace"
@@ -632,8 +762,8 @@ measure() {
done
printf ']\n' >>"$samples_file"
- status="$(jq -r 'map(.status) | max // 0' "$samples_file")"
- duration_ms="$(jq -r 'map(select(.phase != "warmup" and .status == 0) | .durationMs) as $values | if ($values | length) == 0 then (map(.durationMs) | max // 0) else ($values | sort | .[(length - 1) / 2 | floor]) end' "$samples_file")"
+ status="$(jq -r 'map(select((.subject // "head") == "head") | .status) | max // 0' "$samples_file")"
+ duration_ms="$(jq -r 'map(select((.subject // "head") == "head" and .phase != "warmup" and .status == 0) | .durationMs) as $values | if ($values | length) == 0 then (map(select((.subject // "head") == "head") | .durationMs) | max // 0) else ($values | sort | .[(length - 1) / 2 | floor]) end' "$samples_file")"
cp "$stdout" "$ARTIFACT_DIR/$id.stdout" 2>/dev/null || true
cp "$stderr" "$ARTIFACT_DIR/$id.stderr" 2>/dev/null || true
@@ -742,16 +872,45 @@ jq -n \
measurementKind: (if (.gatePolicy.enabled == false) then "diagnostic" else "wall-clock" end),
name: ("devenv." + .id + ".duration"),
unit: "seconds",
- value: (.durationMs / 1000),
- policy: .gatePolicy,
- statistics: {
+ value: (.durationMs / 1000),
+ policy: .gatePolicy,
+ comparison: {
+ mode: (.gatePolicy.comparisonMode // "historical"),
+ pairedSampleCount: (.statistics.pairedSampleCount // 0),
+ baseline: (
+ if (.statistics.pairedBaselineMedianDurationMs // null) == null
+ then null
+ else (.statistics.pairedBaselineMedianDurationMs / 1000)
+ end
+ )
+ },
+ statistics: {
sampleCount: (.statistics.sampleCount // 1),
warmupCount: (.statistics.warmupCount // 0),
measuredSampleCount: (.statistics.measuredSampleCount // (.statistics.sampleCount // 1)),
successfulSampleCount: (.statistics.successfulSampleCount // (if .status == 0 then 1 else 0 end)),
min: ((.statistics.minDurationMs // .durationMs) / 1000),
max: ((.statistics.maxDurationMs // .durationMs) / 1000),
- median: ((.statistics.medianDurationMs // .durationMs) / 1000)
+ median: ((.statistics.medianDurationMs // .durationMs) / 1000),
+ pairedSampleCount: (.statistics.pairedSampleCount // 0),
+ pairedCurrentMedian: (
+ if (.statistics.pairedCurrentMedianDurationMs // null) == null
+ then null
+ else (.statistics.pairedCurrentMedianDurationMs / 1000)
+ end
+ ),
+ pairedBaselineMedian: (
+ if (.statistics.pairedBaselineMedianDurationMs // null) == null
+ then null
+ else (.statistics.pairedBaselineMedianDurationMs / 1000)
+ end
+ ),
+ pairedDeltaMedian: (
+ if (.statistics.pairedDeltaMedianDurationMs // null) == null
+ then null
+ else (.statistics.pairedDeltaMedianDurationMs / 1000)
+ end
+ )
},
dimensions: {
probe: .id,
@@ -760,6 +919,13 @@ jq -n \
sampleCount: (.statistics.sampleCount // 1),
warmupCount: (.statistics.warmupCount // 0),
measuredSampleCount: (.statistics.measuredSampleCount // (.statistics.sampleCount // 1)),
+ pairedSampleCount: (.statistics.pairedSampleCount // 0),
+ pairedOrderProtocol: (
+ if (.statistics.pairedSampleCount // 0) > 0
+ then "alternating-head-base"
+ else null
+ end
+ ),
measurementProtocol: "devenv-perf-warm-median-v2",
aggregation: "median",
phase: "warm",
@@ -1544,6 +1710,7 @@ jq -n \
def observation_stats($items):
($items | map(.observation.value)) as $values
+ | ($items | map(.observation.comparison.baseline // empty)) as $pairedBaselineValues
| ($items | map(.observation.statistics.measuredSampleCount // .observation.statistics.sampleCount // 1) | add // ($items | length)) as $sampleCount
| ($values | median) as $median
| {
@@ -1560,6 +1727,7 @@ jq -n \
sourceCount: ($items | length),
sampleCount: $sampleCount,
pairedSampleCount: ($items | map(.observation.statistics.pairedSampleCount // .observation.comparison.pairedSampleCount // 0) | add // 0),
+ pairedBaselineValue: (if ($pairedBaselineValues | length) == 0 then null else ($pairedBaselineValues | median) end),
generatedAt: ($items[-1].generatedAt // null)
};
@@ -1717,10 +1885,23 @@ jq -n \
.key as $key
| .value as $currentValue
| ($baselineObs[$key] // null) as $baselineValue
+ | ($currentValue.observation | observation_policy(.)) as $policy
+ | ($policy.comparisonMode // (if ($currentValue.observation.measurementKind // $currentValue.measurementKind) == "deterministic" or ($currentValue.observation.unit // "") != "seconds" then "budget" elif ($currentValue.observation.measurementKind // $currentValue.measurementKind) == "diagnostic" then "diagnostic" else "historical" end)) as $comparisonMode
+ | ($currentValue.pairedBaselineValue // null) as $pairedBaselineValue
+ | (if $comparisonMode == "paired" and $pairedBaselineValue != null then {
+ value: $pairedBaselineValue,
+ min: $pairedBaselineValue,
+ max: $pairedBaselineValue,
+ p25: $pairedBaselineValue,
+ p75: $pairedBaselineValue,
+ p95: $pairedBaselineValue,
+ mad: 0,
+ sourceCount: $currentValue.pairedSampleCount
+ } else $baselineValue end) as $effectiveBaselineValue
| {
key: $key,
value: (
- if $baselineValue == null then
+ if $effectiveBaselineValue == null then
{
status: "missing_baseline",
target: $currentValue.target,
@@ -1728,7 +1909,8 @@ jq -n \
current: $currentValue.value,
currentSamples: $currentValue.sampleCount,
baselineSources: 0,
- gatePolicy: ($currentValue.observation | observation_policy(.)),
+ gatePolicy: $policy,
+ comparisonMode: $comparisonMode,
gateable: false,
gateReason: "missing_baseline",
confidence: "missing_baseline",
@@ -1739,32 +1921,32 @@ jq -n \
$currentValue.observation.name;
$currentValue.observation.unit;
($currentValue.observation.measurementKind // $currentValue.measurementKind);
- ($currentValue.observation | observation_policy(.));
+ $policy;
$currentValue.value;
$currentValue.p25;
$currentValue.p75;
$currentValue.mad;
- $baselineValue.value;
- $baselineValue.min;
- $baselineValue.max;
- $baselineValue.p25;
- $baselineValue.p75;
- $baselineValue.p95;
- $baselineValue.mad;
+ $effectiveBaselineValue.value;
+ $effectiveBaselineValue.min;
+ $effectiveBaselineValue.max;
+ $effectiveBaselineValue.p25;
+ $effectiveBaselineValue.p75;
+ $effectiveBaselineValue.p95;
+ $effectiveBaselineValue.mad;
$currentValue.sampleCount;
- $baselineValue.sourceCount;
+ $effectiveBaselineValue.sourceCount;
$currentValue.pairedSampleCount
) + {
target: $currentValue.target,
observation: $currentValue.observation,
currentSamples: $currentValue.sampleCount,
- baselineSources: $baselineValue.sourceCount,
- baselineMin: $baselineValue.min,
- baselineMax: $baselineValue.max,
- baselineP25: $baselineValue.p25,
- baselineP75: $baselineValue.p75,
- baselineP95: $baselineValue.p95
- ,baselineMad: $baselineValue.mad
+ baselineSources: $effectiveBaselineValue.sourceCount,
+ baselineMin: $effectiveBaselineValue.min,
+ baselineMax: $effectiveBaselineValue.max,
+ baselineP25: $effectiveBaselineValue.p25,
+ baselineP75: $effectiveBaselineValue.p75,
+ baselineP95: $effectiveBaselineValue.p95
+ ,baselineMad: $effectiveBaselineValue.mad
}
end
)
From 32da40c1e6e4171097d59acb2aff1ec9dc4352bd Mon Sep 17 00:00:00 2001
From: schickling-assistant
 <261620128+schickling-assistant@users.noreply.github.com>
Date: Tue, 19 May 2026 08:03:18 +0200
Subject: [PATCH 41/81] Make wall-clock measurement gates uncertainty-aware
---
.github/workflows/ci.yml | 208 ++++++++++++++++--
context/ci-measurements.md | 31 ++-
.../ci-measurement-comparison.test.sh | 51 ++++-
genie/ci-workflow/measurements.ts | 151 +++++++++++--
4 files changed, 397 insertions(+), 44 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 7ec7f9269..766b11662 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2629,6 +2629,7 @@ jobs:
CI_MEASUREMENT_HEAD_DIR="${CI_MEASUREMENT_HEAD_DIR:-$PWD}"
CI_MEASUREMENT_BASE_DIR="${CI_MEASUREMENT_BASE_DIR:-${RUNNER_TEMP:-/tmp}/ci-measurement-base}"
CI_MEASUREMENT_PAIRED_ENABLED=0
+ CI_MEASUREMENT_ORDER_SEED="${CI_MEASUREMENT_ORDER_SEED:-${GITHUB_RUN_ID:-local}-${GITHUB_RUN_ATTEMPT:-0}-${GITHUB_SHA:-unknown}}"
prepare_paired_base_worktree() {
if [ "${GITHUB_EVENT_NAME:-}" != "pull_request" ]; then
@@ -2720,7 +2721,20 @@ jobs:
--arg stderr "$stderr" \
--arg trace "$trace" \
--argjson gatePolicy "$gate_policy" \
- '($samples[0] // []) as $sampleList
+ 'def median:
+ sort as $sorted
+ | ($sorted | length) as $count
+ | if $count == 0 then null
+ elif ($count % 2) == 1 then $sorted[($count / 2 | floor)]
+ else (($sorted[($count / 2 - 1)] + $sorted[($count / 2)]) / 2)
+ end;
+ def percentile($p):
+ sort as $sorted
+ | ($sorted | length) as $count
+ | if $count == 0 then null
+ else $sorted[(($p * ($count - 1)) | floor)]
+ end;
+ ($samples[0] // []) as $sampleList
| ($sampleList | map(select((.subject // "head") == "head" and .phase != "warmup" and .status == 0) | .durationMs)) as $successfulDurations
| ($sampleList | map(select((.subject // "head") == "head" and .phase == "warmup"))) as $warmupSamples
| ($sampleList | map(select((.subject // "head") == "head" and .phase == "measured" and .status == 0 and .pairIndex != null))) as $headSamples
@@ -2737,6 +2751,7 @@ jobs:
| ($pairedSamples | map(.currentDurationMs)) as $pairedCurrentDurations
| ($pairedSamples | map(.baselineDurationMs)) as $pairedBaselineDurations
| ($pairedSamples | map(.deltaMs)) as $pairedDeltaDurations
+ | ($pairedDeltaDurations | median) as $pairedDeltaMedian
| {
id:$id,
name:$id,
@@ -2762,9 +2777,18 @@ jobs:
maxDurationMs: ($successfulDurations | max),
medianDurationMs: $durationMs,
pairedSampleCount: ($pairedSamples | length),
- pairedCurrentMedianDurationMs: (if ($pairedCurrentDurations | length) == 0 then null else ($pairedCurrentDurations | sort | .[(length - 1) / 2 | floor]) end),
- pairedBaselineMedianDurationMs: (if ($pairedBaselineDurations | length) == 0 then null else ($pairedBaselineDurations | sort | .[(length - 1) / 2 | floor]) end),
- pairedDeltaMedianDurationMs: (if ($pairedDeltaDurations | length) == 0 then null else ($pairedDeltaDurations | sort | .[(length - 1) / 2 | floor]) end)
+ pairedCurrentMedianDurationMs: ($pairedCurrentDurations | median),
+ pairedBaselineMedianDurationMs: ($pairedBaselineDurations | median),
+ pairedDeltaMedianDurationMs: $pairedDeltaMedian,
+ pairedDeltaMinDurationMs: ($pairedDeltaDurations | min),
+ pairedDeltaMaxDurationMs: ($pairedDeltaDurations | max),
+ pairedDeltaP25DurationMs: ($pairedDeltaDurations | percentile(0.25)),
+ pairedDeltaP75DurationMs: ($pairedDeltaDurations | percentile(0.75)),
+ pairedDeltaMadDurationMs: (
+ if $pairedDeltaMedian == null then null
+ else ($pairedDeltaDurations | map(. - $pairedDeltaMedian | if . < 0 then -. else . end) | median)
+ end
+ )
},
samples:$sampleList
}' \
@@ -2800,6 +2824,8 @@ jobs:
printf '[' >"$samples_file"
local sample_first=1
local sample_index measured_index total_repetitions phase sample_stdout sample_stderr sample_trace expanded
+ local order_offset
+ order_offset="$(printf '%s' "$CI_MEASUREMENT_ORDER_SEED:$id" | cksum | awk '{ print $1 % 2 }')"
total_repetitions=$((warmup_repetitions + repetitions))
for sample_index in $(seq 1 "$total_repetitions"); do
if [ "$sample_index" -le "$warmup_repetitions" ]; then
@@ -2842,7 +2868,7 @@ jobs:
done
local base_ran_before_head=0 base_stdout base_stderr base_started base_ended base_status base_duration_ms
- if [ "$phase" = "measured" ] && [ "$CI_MEASUREMENT_PAIRED_ENABLED" -eq 1 ] && [ $((measured_index % 2)) -eq 0 ]; then
+ if [ "$phase" = "measured" ] && [ "$CI_MEASUREMENT_PAIRED_ENABLED" -eq 1 ] && [ $(((measured_index + order_offset) % 2)) -eq 0 ]; then
base_ran_before_head=1
base_stdout="$ARTIFACT_DIR/$id.$sample_index.base.stdout"
base_stderr="$ARTIFACT_DIR/$id.$sample_index.base.stderr"
@@ -2865,7 +2891,8 @@ jobs:
--argjson durationMs "$base_duration_ms" \
--arg stdout "$base_stdout" \
--arg stderr "$base_stderr" \
- '{index:$index,measuredIndex:($measuredIndex | tonumber),pairIndex:($measuredIndex | tonumber),subject:"base",phase:"measured",status:$status,durationMs:$durationMs,stdout:$stdout,stderr:$stderr,trace:null,order:"base-head"}' \
+ --arg orderSeed "$CI_MEASUREMENT_ORDER_SEED" \
+ '{index:$index,measuredIndex:($measuredIndex | tonumber),pairIndex:($measuredIndex | tonumber),subject:"base",phase:"measured",status:$status,durationMs:$durationMs,stdout:$stdout,stderr:$stderr,trace:null,order:"base-head",orderSeed:$orderSeed}' \
>>"$samples_file"
if [ "$base_status" -ne 0 ]; then
@@ -2896,7 +2923,8 @@ jobs:
--arg stderr "$sample_stderr" \
--arg trace "$sample_trace" \
--arg order "$(if [ "$phase" = "measured" ] && [ "$base_ran_before_head" -eq 1 ]; then printf base-head; else printf head-base; fi)" \
- '{index:$index,measuredIndex:(if $measuredIndex == "" then null else ($measuredIndex | tonumber) end),pairIndex:(if $measuredIndex == "" then null else ($measuredIndex | tonumber) end),subject:"head",phase:$phase,status:$status,durationMs:$durationMs,stdout:$stdout,stderr:$stderr,trace:(if $trace == "" then null else $trace end),order:(if $phase == "measured" then $order else null end)}' \
+ --arg orderSeed "$CI_MEASUREMENT_ORDER_SEED" \
+ '{index:$index,measuredIndex:(if $measuredIndex == "" then null else ($measuredIndex | tonumber) end),pairIndex:(if $measuredIndex == "" then null else ($measuredIndex | tonumber) end),subject:"head",phase:$phase,status:$status,durationMs:$durationMs,stdout:$stdout,stderr:$stderr,trace:(if $trace == "" then null else $trace end),order:(if $phase == "measured" then $order else null end),orderSeed:(if $phase == "measured" then $orderSeed else null end)}' \
>>"$samples_file"
if [ "$phase" = "measured" ] && [ "$status" -eq 0 ] && [ "$CI_MEASUREMENT_PAIRED_ENABLED" -eq 1 ] && [ "$base_ran_before_head" -eq 0 ]; then
@@ -2918,7 +2946,8 @@ jobs:
--argjson durationMs "$base_duration_ms" \
--arg stdout "$base_stdout" \
--arg stderr "$base_stderr" \
- '{index:$index,measuredIndex:($measuredIndex | tonumber),pairIndex:($measuredIndex | tonumber),subject:"base",phase:"measured",status:$status,durationMs:$durationMs,stdout:$stdout,stderr:$stderr,trace:null,order:"head-base"}' \
+ --arg orderSeed "$CI_MEASUREMENT_ORDER_SEED" \
+ '{index:$index,measuredIndex:($measuredIndex | tonumber),pairIndex:($measuredIndex | tonumber),subject:"base",phase:"measured",status:$status,durationMs:$durationMs,stdout:$stdout,stderr:$stderr,trace:null,order:"head-base",orderSeed:$orderSeed}' \
>>"$samples_file"
if [ "$base_status" -ne 0 ]; then
@@ -3015,6 +3044,7 @@ jobs:
--arg traceId "${TRACE_ID:-}" \
--arg devenvRev "${DEVENV_REV:-unknown}" \
--arg otelServiceName "${OTEL_SERVICE_NAME:-unknown}" \
+ --arg orderSeed "$CI_MEASUREMENT_ORDER_SEED" \
--arg targetSystem "${DEVENV_SYSTEM:-${RUNNER_OS:-unknown}}" \
'{
schemaVersion: $schemaVersion,
@@ -3092,6 +3122,36 @@ jobs:
then null
else (.statistics.pairedDeltaMedianDurationMs / 1000)
end
+ ),
+ pairedDeltaMin: (
+ if (.statistics.pairedDeltaMinDurationMs // null) == null
+ then null
+ else (.statistics.pairedDeltaMinDurationMs / 1000)
+ end
+ ),
+ pairedDeltaMax: (
+ if (.statistics.pairedDeltaMaxDurationMs // null) == null
+ then null
+ else (.statistics.pairedDeltaMaxDurationMs / 1000)
+ end
+ ),
+ pairedDeltaP25: (
+ if (.statistics.pairedDeltaP25DurationMs // null) == null
+ then null
+ else (.statistics.pairedDeltaP25DurationMs / 1000)
+ end
+ ),
+ pairedDeltaP75: (
+ if (.statistics.pairedDeltaP75DurationMs // null) == null
+ then null
+ else (.statistics.pairedDeltaP75DurationMs / 1000)
+ end
+ ),
+ pairedDeltaMad: (
+ if (.statistics.pairedDeltaMadDurationMs // null) == null
+ then null
+ else (.statistics.pairedDeltaMadDurationMs / 1000)
+ end
)
},
dimensions: {
@@ -3104,7 +3164,13 @@ jobs:
pairedSampleCount: (.statistics.pairedSampleCount // 0),
pairedOrderProtocol: (
if (.statistics.pairedSampleCount // 0) > 0
- then "alternating-head-base"
+ then "balanced-seeded-alternating-v1"
+ else null
+ end
+ ),
+ pairedOrderSeed: (
+ if (.statistics.pairedSampleCount // 0) > 0
+ then $orderSeed
else null
end
),
@@ -3258,6 +3324,10 @@ jobs:
def observation_stats($items):
($items | map(.observation.value)) as $values
| ($items | map(.observation.comparison.baseline // empty)) as $pairedBaselineValues
+ | ($items | map(.observation.statistics.pairedDeltaMedian // empty)) as $pairedDeltaMedianValues
+ | ($items | map(.observation.statistics.pairedDeltaP25 // empty)) as $pairedDeltaP25Values
+ | ($items | map(.observation.statistics.pairedDeltaP75 // empty)) as $pairedDeltaP75Values
+ | ($items | map(.observation.statistics.pairedDeltaMad // empty)) as $pairedDeltaMadValues
| ($items | map(.observation.statistics.measuredSampleCount // .observation.statistics.sampleCount // 1) | add // ($items | length)) as $sampleCount
| ($values | median) as $median
| {
@@ -3275,6 +3345,10 @@ jobs:
sampleCount: $sampleCount,
pairedSampleCount: ($items | map(.observation.statistics.pairedSampleCount // .observation.comparison.pairedSampleCount // 0) | add // 0),
pairedBaselineValue: (if ($pairedBaselineValues | length) == 0 then null else ($pairedBaselineValues | median) end),
+ pairedDeltaMedianValue: (if ($pairedDeltaMedianValues | length) == 0 then null else ($pairedDeltaMedianValues | median) end),
+ pairedDeltaP25Value: (if ($pairedDeltaP25Values | length) == 0 then null else ($pairedDeltaP25Values | median) end),
+ pairedDeltaP75Value: (if ($pairedDeltaP75Values | length) == 0 then null else ($pairedDeltaP75Values | median) end),
+ pairedDeltaMadValue: (if ($pairedDeltaMadValues | length) == 0 then null else ($pairedDeltaMadValues | median) end),
generatedAt: ($items[-1].generatedAt // null)
};
@@ -3313,14 +3387,16 @@ jobs:
def policy_enabled($policy):
if ($policy | has("enabled")) then $policy.enabled else true end;
- def classify($metric; $unit; $measurementKind; $policy; $current; $currentP25; $currentP75; $currentMad; $baseline; $baselineMin; $baselineMax; $baselineP25; $baselineP75; $baselineP95; $baselineMad; $currentSamples; $baselineSources; $pairedSamples):
+ def classify($metric; $unit; $measurementKind; $policy; $current; $currentP25; $currentP75; $currentMad; $baseline; $baselineMin; $baselineMax; $baselineP25; $baselineP75; $baselineP95; $baselineMad; $currentSamples; $baselineSources; $pairedSamples; $pairedDeltaMedian; $pairedDeltaP25; $pairedDeltaP75; $pairedDeltaMad):
$policy as $b
| ($policy.comparisonMode // (if $measurementKind == "deterministic" or $unit != "seconds" then "budget" elif $measurementKind == "diagnostic" then "diagnostic" else "historical" end)) as $comparisonMode
| ($policy.noiseFloor // noise_floor($metric; $unit)) as $noise
| ($current - $baseline) as $delta
+ | (if $comparisonMode == "paired" and $pairedDeltaMedian != null then $pairedDeltaMedian else $delta end) as $evidenceDelta
| (if $baseline > 0 then ($current / $baseline) else null end) as $ratio
| (($baselineP75 // $baseline) - ($baselineP25 // $baseline)) as $iqr
| (($currentP75 // $current) - ($currentP25 // $current)) as $currentIqr
+ | (($pairedDeltaP75 // $evidenceDelta) - ($pairedDeltaP25 // $evidenceDelta)) as $pairedDeltaIqr
| ([
$noise,
(($policy.statisticalToleranceAbs // 0) | tonumber),
@@ -3335,10 +3411,19 @@ jobs:
(($currentMad // 0) * 3),
(($currentIqr // 0) * 1.5)
] | max) else 0 end) as $currentRobustTolerance
+ | ([
+ $noise,
+ (($policy.statisticalToleranceAbs // 0) | tonumber),
+ (if $baseline > 0 then ($baseline * (($policy.statisticalToleranceRatio // 0) | tonumber)) else 0 end),
+ (($pairedDeltaMad // 0) * 3),
+ (($pairedDeltaIqr // 0) * 1.5)
+ ] | max) as $pairedDeltaTolerance
| ($baseline + $robustTolerance) as $robustUpper
| ($baseline - $robustTolerance) as $robustLower
| ($current + $currentRobustTolerance) as $currentRobustUpper
| ($current - $currentRobustTolerance) as $currentRobustLower
+ | ($evidenceDelta + $pairedDeltaTolerance) as $evidenceDeltaUpper
+ | ($evidenceDelta - $pairedDeltaTolerance) as $evidenceDeltaLower
| ([($b.warnAbs // 0), (if $baseline > 0 then ($baseline * (($b.warnRatio // 1) - 1)) else 0 end), $noise, 0.000000001] | max) as $warnBudget
| ([($b.failAbs // 0), (if $baseline > 0 then ($baseline * (($b.failRatio // 1) - 1)) else 0 end), $noise, 0.000000001] | max) as $failBudget
| (
@@ -3353,6 +3438,9 @@ jobs:
) as $withinBaselineRange
| (
if $baseline <= 0 then "unknown"
+ elif $comparisonMode == "paired" and $evidenceDeltaLower > $failBudget then "fail"
+ elif $comparisonMode == "paired" and $evidenceDeltaLower > $warnBudget then "warn"
+ elif $comparisonMode == "paired" then "pass"
elif ($delta > $b.failAbs and $current > ($baseline * $b.failRatio)) then "fail"
elif ($delta > $b.warnAbs and $current > ($baseline * $b.warnRatio)) then "warn"
else "pass"
@@ -3364,6 +3452,7 @@ jobs:
and $baselineSources >= ($policy.minBaselineSources // 1)
and $currentSamples >= ($policy.minCurrentSamples // 1)
and (if $comparisonMode == "paired" then $pairedSamples >= ($policy.minPairedSamples // 1) else true end)
+ and (if $comparisonMode == "paired" then $pairedDeltaMedian != null else true end)
) as $gateable
| (
if (policy_enabled($policy) != true) then "disabled"
@@ -3371,6 +3460,7 @@ jobs:
elif $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count"
elif $comparisonMode == "paired" and $pairedSamples < ($policy.minPairedSamples // 1) then "low_paired_sample_count"
+ elif $comparisonMode == "paired" and $pairedDeltaMedian == null then "missing_paired_delta"
else "eligible"
end
) as $gateReason
@@ -3381,6 +3471,8 @@ jobs:
elif $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count"
elif $comparisonMode == "paired" and $pairedSamples < ($policy.minPairedSamples // 1) then "low_paired_sample_count"
+ elif $comparisonMode == "paired" and $pairedDeltaMedian == null then "missing_paired_delta"
+ elif $comparisonMode == "paired" and $thresholdStatus == "pass" and $evidenceDelta > $warnBudget then "paired_uncertain"
elif ($comparisonMode == "historical" and $thresholdStatus != "pass" and $withinRobustBand) then "within_robust_band"
elif $thresholdStatus == "pass" then "within_budget"
else "threshold_exceeded"
@@ -3394,6 +3486,10 @@ jobs:
) as $status
| (
if $baseline <= 0 then "unknown"
+ elif $comparisonMode == "paired" and ($evidenceDelta | abs_value) <= $noise then "unchanged"
+ elif $comparisonMode == "paired" and $evidenceDeltaLower <= 0 and $evidenceDeltaUpper >= 0 then "unchanged"
+ elif $comparisonMode == "paired" and $evidenceDelta < 0 then "improved"
+ elif $comparisonMode == "paired" then "regressed"
elif ($delta | abs_value) <= $noise then "unchanged"
elif $withinRobustBand then "unchanged"
elif $delta < 0 then "improved"
@@ -3403,6 +3499,10 @@ jobs:
| (
if $baseline <= 0 then null
elif (policy_enabled($policy) != true) then null
+ elif $comparisonMode == "paired" and ($evidenceDeltaLower <= 0 and $evidenceDeltaUpper >= 0) then 0
+ elif $comparisonMode == "paired" and ($evidenceDelta | abs_value) <= $noise then 0
+ elif $comparisonMode == "paired" and $evidenceDelta > 0 then ([0, $evidenceDeltaLower] | max) / $warnBudget
+ elif $comparisonMode == "paired" then -(([0, (-$evidenceDeltaUpper)] | max) / $warnBudget)
elif $withinRobustBand then 0
elif ($delta | abs_value) <= $noise then 0
elif ($confidence == "threshold_exceeded" and $delta > 0) then ([0, ($currentRobustLower - $robustUpper), $delta] | max) / $warnBudget
@@ -3421,7 +3521,7 @@ jobs:
else "improvement"
end
) as $semanticImpactKind
- | {status:$status,current:$current,baseline:$baseline,delta:$delta,ratio:$ratio,budget:$b,gatePolicy:$policy,comparisonMode:$comparisonMode,gateable:$gateable,gateReason:$gateReason,confidence:$confidence,direction:$direction,semanticImpactScore:$semanticImpactScore,semanticImpactKind:$semanticImpactKind,semanticWarnBudget:$warnBudget,semanticFailBudget:$failBudget,baselineRobustLower:$robustLower,baselineRobustUpper:$robustUpper,baselineRobustTolerance:$robustTolerance,currentRobustLower:$currentRobustLower,currentRobustUpper:$currentRobustUpper,currentRobustTolerance:$currentRobustTolerance,withinBaselineRange:$withinBaselineRange,pairedSamples:$pairedSamples};
+ | {status:$status,current:$current,baseline:$baseline,delta:$delta,ratio:$ratio,budget:$b,gatePolicy:$policy,comparisonMode:$comparisonMode,gateable:$gateable,gateReason:$gateReason,confidence:$confidence,direction:$direction,semanticImpactScore:$semanticImpactScore,semanticImpactKind:$semanticImpactKind,semanticWarnBudget:$warnBudget,semanticFailBudget:$failBudget,baselineRobustLower:$robustLower,baselineRobustUpper:$robustUpper,baselineRobustTolerance:$robustTolerance,currentRobustLower:$currentRobustLower,currentRobustUpper:$currentRobustUpper,currentRobustTolerance:$currentRobustTolerance,withinBaselineRange:$withinBaselineRange,pairedSamples:$pairedSamples,evidenceDelta:$evidenceDelta,evidenceDeltaLower:$evidenceDeltaLower,evidenceDeltaUpper:$evidenceDeltaUpper,evidenceDeltaTolerance:$pairedDeltaTolerance};
(observations_by_key($current[0]) | with_entries(.value = observation_stats(.value))) as $currentObs
| (observations_by_key($baseline[0]) | with_entries(.value = observation_stats(.value))) as $baselineObs
@@ -3482,7 +3582,11 @@ jobs:
$effectiveBaselineValue.mad;
$currentValue.sampleCount;
$effectiveBaselineValue.sourceCount;
- $currentValue.pairedSampleCount
+ $currentValue.pairedSampleCount;
+ $currentValue.pairedDeltaMedianValue;
+ $currentValue.pairedDeltaP25Value;
+ $currentValue.pairedDeltaP75Value;
+ $currentValue.pairedDeltaMadValue
) + {
target: $currentValue.target,
observation: $currentValue.observation,
@@ -3509,7 +3613,8 @@ jobs:
and (.gateReason == "missing_baseline"
or .gateReason == "low_baseline_count"
or .gateReason == "low_current_sample_count"
- or .gateReason == "low_paired_sample_count")
+ or .gateReason == "low_paired_sample_count"
+ or .gateReason == "missing_paired_delta")
) then "partial"
else "pass"
end
@@ -3522,7 +3627,8 @@ jobs:
missingBaselineCount: (map(select(.gateReason == "missing_baseline")) | length),
lowBaselineCount: (map(select(.gateReason == "low_baseline_count")) | length),
lowCurrentSampleCount: (map(select(.gateReason == "low_current_sample_count")) | length),
- lowPairedSampleCount: (map(select(.gateReason == "low_paired_sample_count")) | length)
+ lowPairedSampleCount: (map(select(.gateReason == "low_paired_sample_count")) | length),
+ missingPairedDeltaCount: (map(select(.gateReason == "missing_paired_delta")) | length)
}
| . + {
nonGateableCount: (.enabledCount - .gateableCount),
@@ -3823,6 +3929,18 @@ jobs:
tone: 'neutral',
color: '#94a3b8',
}
+ if (row.confidence === 'missing_paired_delta') return {
+ label: 'Needs paired delta stats',
+ detail: 'Wall-clock gates require per-pair delta statistics, not only paired medians.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
+ if (row.confidence === 'paired_uncertain') return {
+ label: 'Uncertain wall-clock movement',
+ detail: 'The paired median moved, but the paired delta band still crosses the configured budget.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
if (row.confidence === 'diagnostic') return {
label: 'Diagnostic only',
detail: 'Shown for investigation, but intentionally excluded from gating.',
@@ -4940,6 +5058,10 @@ jobs:
def observation_stats($items):
($items | map(.observation.value)) as $values
| ($items | map(.observation.comparison.baseline // empty)) as $pairedBaselineValues
+ | ($items | map(.observation.statistics.pairedDeltaMedian // empty)) as $pairedDeltaMedianValues
+ | ($items | map(.observation.statistics.pairedDeltaP25 // empty)) as $pairedDeltaP25Values
+ | ($items | map(.observation.statistics.pairedDeltaP75 // empty)) as $pairedDeltaP75Values
+ | ($items | map(.observation.statistics.pairedDeltaMad // empty)) as $pairedDeltaMadValues
| ($items | map(.observation.statistics.measuredSampleCount // .observation.statistics.sampleCount // 1) | add // ($items | length)) as $sampleCount
| ($values | median) as $median
| {
@@ -4957,6 +5079,10 @@ jobs:
sampleCount: $sampleCount,
pairedSampleCount: ($items | map(.observation.statistics.pairedSampleCount // .observation.comparison.pairedSampleCount // 0) | add // 0),
pairedBaselineValue: (if ($pairedBaselineValues | length) == 0 then null else ($pairedBaselineValues | median) end),
+ pairedDeltaMedianValue: (if ($pairedDeltaMedianValues | length) == 0 then null else ($pairedDeltaMedianValues | median) end),
+ pairedDeltaP25Value: (if ($pairedDeltaP25Values | length) == 0 then null else ($pairedDeltaP25Values | median) end),
+ pairedDeltaP75Value: (if ($pairedDeltaP75Values | length) == 0 then null else ($pairedDeltaP75Values | median) end),
+ pairedDeltaMadValue: (if ($pairedDeltaMadValues | length) == 0 then null else ($pairedDeltaMadValues | median) end),
generatedAt: ($items[-1].generatedAt // null)
};
@@ -4995,14 +5121,16 @@ jobs:
def policy_enabled($policy):
if ($policy | has("enabled")) then $policy.enabled else true end;
- def classify($metric; $unit; $measurementKind; $policy; $current; $currentP25; $currentP75; $currentMad; $baseline; $baselineMin; $baselineMax; $baselineP25; $baselineP75; $baselineP95; $baselineMad; $currentSamples; $baselineSources; $pairedSamples):
+ def classify($metric; $unit; $measurementKind; $policy; $current; $currentP25; $currentP75; $currentMad; $baseline; $baselineMin; $baselineMax; $baselineP25; $baselineP75; $baselineP95; $baselineMad; $currentSamples; $baselineSources; $pairedSamples; $pairedDeltaMedian; $pairedDeltaP25; $pairedDeltaP75; $pairedDeltaMad):
$policy as $b
| ($policy.comparisonMode // (if $measurementKind == "deterministic" or $unit != "seconds" then "budget" elif $measurementKind == "diagnostic" then "diagnostic" else "historical" end)) as $comparisonMode
| ($policy.noiseFloor // noise_floor($metric; $unit)) as $noise
| ($current - $baseline) as $delta
+ | (if $comparisonMode == "paired" and $pairedDeltaMedian != null then $pairedDeltaMedian else $delta end) as $evidenceDelta
| (if $baseline > 0 then ($current / $baseline) else null end) as $ratio
| (($baselineP75 // $baseline) - ($baselineP25 // $baseline)) as $iqr
| (($currentP75 // $current) - ($currentP25 // $current)) as $currentIqr
+ | (($pairedDeltaP75 // $evidenceDelta) - ($pairedDeltaP25 // $evidenceDelta)) as $pairedDeltaIqr
| ([
$noise,
(($policy.statisticalToleranceAbs // 0) | tonumber),
@@ -5017,10 +5145,19 @@ jobs:
(($currentMad // 0) * 3),
(($currentIqr // 0) * 1.5)
] | max) else 0 end) as $currentRobustTolerance
+ | ([
+ $noise,
+ (($policy.statisticalToleranceAbs // 0) | tonumber),
+ (if $baseline > 0 then ($baseline * (($policy.statisticalToleranceRatio // 0) | tonumber)) else 0 end),
+ (($pairedDeltaMad // 0) * 3),
+ (($pairedDeltaIqr // 0) * 1.5)
+ ] | max) as $pairedDeltaTolerance
| ($baseline + $robustTolerance) as $robustUpper
| ($baseline - $robustTolerance) as $robustLower
| ($current + $currentRobustTolerance) as $currentRobustUpper
| ($current - $currentRobustTolerance) as $currentRobustLower
+ | ($evidenceDelta + $pairedDeltaTolerance) as $evidenceDeltaUpper
+ | ($evidenceDelta - $pairedDeltaTolerance) as $evidenceDeltaLower
| ([($b.warnAbs // 0), (if $baseline > 0 then ($baseline * (($b.warnRatio // 1) - 1)) else 0 end), $noise, 0.000000001] | max) as $warnBudget
| ([($b.failAbs // 0), (if $baseline > 0 then ($baseline * (($b.failRatio // 1) - 1)) else 0 end), $noise, 0.000000001] | max) as $failBudget
| (
@@ -5035,6 +5172,9 @@ jobs:
) as $withinBaselineRange
| (
if $baseline <= 0 then "unknown"
+ elif $comparisonMode == "paired" and $evidenceDeltaLower > $failBudget then "fail"
+ elif $comparisonMode == "paired" and $evidenceDeltaLower > $warnBudget then "warn"
+ elif $comparisonMode == "paired" then "pass"
elif ($delta > $b.failAbs and $current > ($baseline * $b.failRatio)) then "fail"
elif ($delta > $b.warnAbs and $current > ($baseline * $b.warnRatio)) then "warn"
else "pass"
@@ -5046,6 +5186,7 @@ jobs:
and $baselineSources >= ($policy.minBaselineSources // 1)
and $currentSamples >= ($policy.minCurrentSamples // 1)
and (if $comparisonMode == "paired" then $pairedSamples >= ($policy.minPairedSamples // 1) else true end)
+ and (if $comparisonMode == "paired" then $pairedDeltaMedian != null else true end)
) as $gateable
| (
if (policy_enabled($policy) != true) then "disabled"
@@ -5053,6 +5194,7 @@ jobs:
elif $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count"
elif $comparisonMode == "paired" and $pairedSamples < ($policy.minPairedSamples // 1) then "low_paired_sample_count"
+ elif $comparisonMode == "paired" and $pairedDeltaMedian == null then "missing_paired_delta"
else "eligible"
end
) as $gateReason
@@ -5063,6 +5205,8 @@ jobs:
elif $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count"
elif $comparisonMode == "paired" and $pairedSamples < ($policy.minPairedSamples // 1) then "low_paired_sample_count"
+ elif $comparisonMode == "paired" and $pairedDeltaMedian == null then "missing_paired_delta"
+ elif $comparisonMode == "paired" and $thresholdStatus == "pass" and $evidenceDelta > $warnBudget then "paired_uncertain"
elif ($comparisonMode == "historical" and $thresholdStatus != "pass" and $withinRobustBand) then "within_robust_band"
elif $thresholdStatus == "pass" then "within_budget"
else "threshold_exceeded"
@@ -5076,6 +5220,10 @@ jobs:
) as $status
| (
if $baseline <= 0 then "unknown"
+ elif $comparisonMode == "paired" and ($evidenceDelta | abs_value) <= $noise then "unchanged"
+ elif $comparisonMode == "paired" and $evidenceDeltaLower <= 0 and $evidenceDeltaUpper >= 0 then "unchanged"
+ elif $comparisonMode == "paired" and $evidenceDelta < 0 then "improved"
+ elif $comparisonMode == "paired" then "regressed"
elif ($delta | abs_value) <= $noise then "unchanged"
elif $withinRobustBand then "unchanged"
elif $delta < 0 then "improved"
@@ -5085,6 +5233,10 @@ jobs:
| (
if $baseline <= 0 then null
elif (policy_enabled($policy) != true) then null
+ elif $comparisonMode == "paired" and ($evidenceDeltaLower <= 0 and $evidenceDeltaUpper >= 0) then 0
+ elif $comparisonMode == "paired" and ($evidenceDelta | abs_value) <= $noise then 0
+ elif $comparisonMode == "paired" and $evidenceDelta > 0 then ([0, $evidenceDeltaLower] | max) / $warnBudget
+ elif $comparisonMode == "paired" then -(([0, (-$evidenceDeltaUpper)] | max) / $warnBudget)
elif $withinRobustBand then 0
elif ($delta | abs_value) <= $noise then 0
elif ($confidence == "threshold_exceeded" and $delta > 0) then ([0, ($currentRobustLower - $robustUpper), $delta] | max) / $warnBudget
@@ -5103,7 +5255,7 @@ jobs:
else "improvement"
end
) as $semanticImpactKind
- | {status:$status,current:$current,baseline:$baseline,delta:$delta,ratio:$ratio,budget:$b,gatePolicy:$policy,comparisonMode:$comparisonMode,gateable:$gateable,gateReason:$gateReason,confidence:$confidence,direction:$direction,semanticImpactScore:$semanticImpactScore,semanticImpactKind:$semanticImpactKind,semanticWarnBudget:$warnBudget,semanticFailBudget:$failBudget,baselineRobustLower:$robustLower,baselineRobustUpper:$robustUpper,baselineRobustTolerance:$robustTolerance,currentRobustLower:$currentRobustLower,currentRobustUpper:$currentRobustUpper,currentRobustTolerance:$currentRobustTolerance,withinBaselineRange:$withinBaselineRange,pairedSamples:$pairedSamples};
+ | {status:$status,current:$current,baseline:$baseline,delta:$delta,ratio:$ratio,budget:$b,gatePolicy:$policy,comparisonMode:$comparisonMode,gateable:$gateable,gateReason:$gateReason,confidence:$confidence,direction:$direction,semanticImpactScore:$semanticImpactScore,semanticImpactKind:$semanticImpactKind,semanticWarnBudget:$warnBudget,semanticFailBudget:$failBudget,baselineRobustLower:$robustLower,baselineRobustUpper:$robustUpper,baselineRobustTolerance:$robustTolerance,currentRobustLower:$currentRobustLower,currentRobustUpper:$currentRobustUpper,currentRobustTolerance:$currentRobustTolerance,withinBaselineRange:$withinBaselineRange,pairedSamples:$pairedSamples,evidenceDelta:$evidenceDelta,evidenceDeltaLower:$evidenceDeltaLower,evidenceDeltaUpper:$evidenceDeltaUpper,evidenceDeltaTolerance:$pairedDeltaTolerance};
(observations_by_key($current[0]) | with_entries(.value = observation_stats(.value))) as $currentObs
| (observations_by_key($baseline[0]) | with_entries(.value = observation_stats(.value))) as $baselineObs
@@ -5164,7 +5316,11 @@ jobs:
$effectiveBaselineValue.mad;
$currentValue.sampleCount;
$effectiveBaselineValue.sourceCount;
- $currentValue.pairedSampleCount
+ $currentValue.pairedSampleCount;
+ $currentValue.pairedDeltaMedianValue;
+ $currentValue.pairedDeltaP25Value;
+ $currentValue.pairedDeltaP75Value;
+ $currentValue.pairedDeltaMadValue
) + {
target: $currentValue.target,
observation: $currentValue.observation,
@@ -5191,7 +5347,8 @@ jobs:
and (.gateReason == "missing_baseline"
or .gateReason == "low_baseline_count"
or .gateReason == "low_current_sample_count"
- or .gateReason == "low_paired_sample_count")
+ or .gateReason == "low_paired_sample_count"
+ or .gateReason == "missing_paired_delta")
) then "partial"
else "pass"
end
@@ -5204,7 +5361,8 @@ jobs:
missingBaselineCount: (map(select(.gateReason == "missing_baseline")) | length),
lowBaselineCount: (map(select(.gateReason == "low_baseline_count")) | length),
lowCurrentSampleCount: (map(select(.gateReason == "low_current_sample_count")) | length),
- lowPairedSampleCount: (map(select(.gateReason == "low_paired_sample_count")) | length)
+ lowPairedSampleCount: (map(select(.gateReason == "low_paired_sample_count")) | length),
+ missingPairedDeltaCount: (map(select(.gateReason == "missing_paired_delta")) | length)
}
| . + {
nonGateableCount: (.enabledCount - .gateableCount),
@@ -5505,6 +5663,18 @@ jobs:
tone: 'neutral',
color: '#94a3b8',
}
+ if (row.confidence === 'missing_paired_delta') return {
+ label: 'Needs paired delta stats',
+ detail: 'Wall-clock gates require per-pair delta statistics, not only paired medians.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
+ if (row.confidence === 'paired_uncertain') return {
+ label: 'Uncertain wall-clock movement',
+ detail: 'The paired median moved, but the paired delta band still crosses the configured budget.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
if (row.confidence === 'diagnostic') return {
label: 'Diagnostic only',
detail: 'Shown for investigation, but intentionally excluded from gating.',
diff --git a/context/ci-measurements.md b/context/ci-measurements.md
index 95a179c92..34d6eaf93 100644
--- a/context/ci-measurements.md
+++ b/context/ci-measurements.md
@@ -49,9 +49,18 @@ They need same-run base/head evidence before they can block a merge. Historical
baselines remain useful for trend context, but they do not prove PR causality.
For PR runs, the wall-clock producer checks out the PR base commit in a sibling
worktree and alternates measured pair order (`head -> base`, then
-`base -> head`) to reduce cache and time drift bias. The current artifact
-stores the paired baseline median and paired sample count, and the comparison
-engine uses that embedded paired baseline for the gate.
+`base -> head`) from a recorded seed to reduce cache and time drift bias
+without making order a hidden variable. The current artifact stores the paired
+baseline median, paired sample count, and per-pair delta statistics, and the
+comparison engine uses that embedded paired baseline and deltas for the gate.
+
+The gate evaluates per-pair deltas, not only the difference between medians.
+A paired wall-clock row is actionable only when the lower bound of the paired
+delta evidence band (median delta minus its tolerance) exceeds the configured
+warning or failure budget. If the median delta exceeds the warning budget but
+the lower bound does not, the row renders as `paired_uncertain` and does not
+block. This follows the principle used by continuous benchmark tools: a point
+estimate without uncertainty is not enough evidence for a regression.
Historical wall-clock comparison may be used as an advisory transition mode.
It can warn, visualize trends, and guide investigation, but it must not be the
@@ -84,10 +93,8 @@ For merge-blocking use, same-run paired measurement is required:
```text
base warmup
head warmup
-base sample 1
-head sample 1
-head sample 2
-base sample 2
+sample pair 1: seeded order chooses base/head or head/base
+sample pair 2: opposite order
...
```
@@ -97,11 +104,13 @@ the row is partial/advisory even if the historical raw delta is large.
## Deterministic Measurements
-Nix closure size and source shape are not statistical performance probes. They
+Nix closure size, source shape, code complexity, lines of code, and file counts
+are deterministic or near-deterministic structural measurements. They are not
+wall-clock performance probes and must not use paired timing statistics. They
should use explicit budgets and semantic buckets. A closure-size regression is
actionable because the same installable and lock graph should produce a stable
-closure. Source-shape growth is an architecture signal and should remain
-advisory unless a repo defines an explicit owner-approved budget.
+closure. Source-shape or complexity growth is an architecture signal and should
+remain advisory unless a repo defines an explicit owner-approved budget.
## Visualization
@@ -111,6 +120,8 @@ Reports must distinguish raw movement from actionable evidence.
- Actionable impact is only shown for gateable rows.
- Diagnostic rows render as `diagnostic`, not `0.00x`.
- Non-gateable paired wall-clock rows render as needing paired evidence.
+- Noisy paired wall-clock rows render as uncertain, with neutral actionable
+ impact, even when the raw percentage delta is large.
This prevents a large historical wall-clock delta from looking like a proven
PR regression when the measurement lacks causal evidence.
diff --git a/genie/ci-scripts/ci-measurement-comparison.test.sh b/genie/ci-scripts/ci-measurement-comparison.test.sh
index 5f73bdb06..335cd8c1f 100755
--- a/genie/ci-scripts/ci-measurement-comparison.test.sh
+++ b/genie/ci-scripts/ci-measurement-comparison.test.sh
@@ -124,7 +124,12 @@ fi
rm -rf "$tmp_dir/current" "$tmp_dir/baseline"
write_measurement "$tmp_dir/current/measurements.json" 13 devenv-perf-warm-median-v2 "$paired_policy"
-jq '.observations[0].comparison = { mode: "paired", baseline: 12.95, pairedSampleCount: 5 } | .observations[0].statistics.pairedSampleCount = 5' \
+jq '.observations[0].comparison = { mode: "paired", baseline: 12.95, pairedSampleCount: 5 }
+ | .observations[0].statistics.pairedSampleCount = 5
+ | .observations[0].statistics.pairedDeltaMedian = 0.05
+ | .observations[0].statistics.pairedDeltaP25 = 0.04
+ | .observations[0].statistics.pairedDeltaP75 = 0.06
+ | .observations[0].statistics.pairedDeltaMad = 0.01' \
"$tmp_dir/current/measurements.json" >"$tmp_dir/current/measurements.updated.json"
mv "$tmp_dir/current/measurements.updated.json" "$tmp_dir/current/measurements.json"
write_measurement "$tmp_dir/baseline/run-1/measurements.json" 10 devenv-perf-warm-median-v2 "$paired_policy"
@@ -139,6 +144,50 @@ if [ "$actual_status" != "pass" ] || [ "$actual_row" != "pass" ] || [ "$actual_g
exit 1
fi
+rm -rf "$tmp_dir/current" "$tmp_dir/baseline"
+write_measurement "$tmp_dir/current/measurements.json" 13 devenv-perf-warm-median-v2 "$paired_policy"
+jq '.observations[0].comparison = { mode: "paired", baseline: 10, pairedSampleCount: 5 }
+ | .observations[0].statistics.pairedSampleCount = 5
+ | .observations[0].statistics.pairedDeltaMedian = 1.2
+ | .observations[0].statistics.pairedDeltaP25 = -1
+ | .observations[0].statistics.pairedDeltaP75 = 3
+ | .observations[0].statistics.pairedDeltaMad = 1' \
+ "$tmp_dir/current/measurements.json" >"$tmp_dir/current/measurements.updated.json"
+mv "$tmp_dir/current/measurements.updated.json" "$tmp_dir/current/measurements.json"
+write_measurement "$tmp_dir/baseline/run-1/measurements.json" 10 devenv-perf-warm-median-v2 "$paired_policy"
+run_compare
+actual_status="$(jq -r '.status' "$tmp_dir/comparison.json")"
+actual_row="$(jq -r '.comparisons[] | .status' "$tmp_dir/comparison.json")"
+actual_confidence="$(jq -r '.comparisons[] | .confidence' "$tmp_dir/comparison.json")"
+actual_impact="$(jq -r '.comparisons[] | .semanticImpactScore' "$tmp_dir/comparison.json")"
+actual_lower="$(jq -r '.comparisons[] | .evidenceDeltaLower' "$tmp_dir/comparison.json")"
+if [ "$actual_status" != "pass" ] || [ "$actual_row" != "pass" ] || [ "$actual_confidence" != "paired_uncertain" ] || [ "$actual_impact" != "0" ] || ! awk "BEGIN { exit !($actual_lower < 0) }"; then
+ echo "expected noisy paired delta to stay pass/uncertain; got status=$actual_status row=$actual_row confidence=$actual_confidence impact=$actual_impact lower=$actual_lower" >&2
+ exit 1
+fi
+
+rm -rf "$tmp_dir/current" "$tmp_dir/baseline"
+write_measurement "$tmp_dir/current/measurements.json" 13 devenv-perf-warm-median-v2 "$paired_policy"
+jq '.observations[0].comparison = { mode: "paired", baseline: 10, pairedSampleCount: 5 }
+ | .observations[0].statistics.pairedSampleCount = 5
+ | .observations[0].statistics.pairedDeltaMedian = 3.2
+ | .observations[0].statistics.pairedDeltaP25 = 3.15
+ | .observations[0].statistics.pairedDeltaP75 = 3.25
+ | .observations[0].statistics.pairedDeltaMad = 0.03' \
+ "$tmp_dir/current/measurements.json" >"$tmp_dir/current/measurements.updated.json"
+mv "$tmp_dir/current/measurements.updated.json" "$tmp_dir/current/measurements.json"
+write_measurement "$tmp_dir/baseline/run-1/measurements.json" 10 devenv-perf-warm-median-v2 "$paired_policy"
+run_compare
+actual_status="$(jq -r '.status' "$tmp_dir/comparison.json")"
+actual_row="$(jq -r '.comparisons[] | .status' "$tmp_dir/comparison.json")"
+actual_confidence="$(jq -r '.comparisons[] | .confidence' "$tmp_dir/comparison.json")"
+actual_impact="$(jq -r '.comparisons[] | .semanticImpactScore' "$tmp_dir/comparison.json")"
+actual_lower="$(jq -r '.comparisons[] | .evidenceDeltaLower' "$tmp_dir/comparison.json")"
+if [ "$actual_status" != "fail" ] || [ "$actual_row" != "fail" ] || [ "$actual_confidence" != "threshold_exceeded" ] || ! awk "BEGIN { exit !($actual_impact > 1) }" || ! awk "BEGIN { exit !($actual_lower > 2) }"; then
+ echo "expected stable paired delta over fail budget to fail; got status=$actual_status row=$actual_row confidence=$actual_confidence impact=$actual_impact lower=$actual_lower" >&2
+ exit 1
+fi
+
rm -rf "$tmp_dir/current" "$tmp_dir/baseline"
write_measurement "$tmp_dir/current/run-1/measurements.json" 5.1 devenv-perf-warm-median-v2 "$policy"
write_measurement "$tmp_dir/current/run-2/measurements.json" 5.2 devenv-perf-warm-median-v2 "$policy"
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index 6cb4bb3f0..de7784f7b 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -74,6 +74,11 @@ export type CiMeasurementObservation = {
readonly pairedBaselineMedian?: number
readonly pairedCurrentMedian?: number
readonly pairedDeltaMedian?: number
+ readonly pairedDeltaMin?: number
+ readonly pairedDeltaMax?: number
+ readonly pairedDeltaP25?: number
+ readonly pairedDeltaP75?: number
+ readonly pairedDeltaMad?: number
}
}
@@ -454,6 +459,7 @@ ARTIFACT_DIR="$(mkdir -p "$ARTIFACT_DIR" && cd "$ARTIFACT_DIR" && pwd -P)"
CI_MEASUREMENT_HEAD_DIR="${dollar}{CI_MEASUREMENT_HEAD_DIR:-$PWD}"
CI_MEASUREMENT_BASE_DIR="${dollar}{CI_MEASUREMENT_BASE_DIR:-${dollar}{RUNNER_TEMP:-/tmp}/ci-measurement-base}"
CI_MEASUREMENT_PAIRED_ENABLED=0
+CI_MEASUREMENT_ORDER_SEED="${dollar}{CI_MEASUREMENT_ORDER_SEED:-${dollar}{GITHUB_RUN_ID:-local}-${dollar}{GITHUB_RUN_ATTEMPT:-0}-${dollar}{GITHUB_SHA:-unknown}}"
prepare_paired_base_worktree() {
if [ "${dollar}{GITHUB_EVENT_NAME:-}" != "pull_request" ]; then
@@ -545,7 +551,20 @@ json_append_timing() {
--arg stderr "$stderr" \
--arg trace "$trace" \
--argjson gatePolicy "$gate_policy" \
- '($samples[0] // []) as $sampleList
+ 'def median:
+ sort as $sorted
+ | ($sorted | length) as $count
+ | if $count == 0 then null
+ elif ($count % 2) == 1 then $sorted[($count / 2 | floor)]
+ else (($sorted[($count / 2 - 1)] + $sorted[($count / 2)]) / 2)
+ end;
+ def percentile($p):
+ sort as $sorted
+ | ($sorted | length) as $count
+ | if $count == 0 then null
+ else $sorted[(($p * ($count - 1)) | floor)]
+ end;
+ ($samples[0] // []) as $sampleList
| ($sampleList | map(select((.subject // "head") == "head" and .phase != "warmup" and .status == 0) | .durationMs)) as $successfulDurations
| ($sampleList | map(select((.subject // "head") == "head" and .phase == "warmup"))) as $warmupSamples
| ($sampleList | map(select((.subject // "head") == "head" and .phase == "measured" and .status == 0 and .pairIndex != null))) as $headSamples
@@ -562,6 +581,7 @@ json_append_timing() {
| ($pairedSamples | map(.currentDurationMs)) as $pairedCurrentDurations
| ($pairedSamples | map(.baselineDurationMs)) as $pairedBaselineDurations
| ($pairedSamples | map(.deltaMs)) as $pairedDeltaDurations
+ | ($pairedDeltaDurations | median) as $pairedDeltaMedian
| {
id:$id,
name:$id,
@@ -587,9 +607,18 @@ json_append_timing() {
maxDurationMs: ($successfulDurations | max),
medianDurationMs: $durationMs,
pairedSampleCount: ($pairedSamples | length),
- pairedCurrentMedianDurationMs: (if ($pairedCurrentDurations | length) == 0 then null else ($pairedCurrentDurations | sort | .[(length - 1) / 2 | floor]) end),
- pairedBaselineMedianDurationMs: (if ($pairedBaselineDurations | length) == 0 then null else ($pairedBaselineDurations | sort | .[(length - 1) / 2 | floor]) end),
- pairedDeltaMedianDurationMs: (if ($pairedDeltaDurations | length) == 0 then null else ($pairedDeltaDurations | sort | .[(length - 1) / 2 | floor]) end)
+ pairedCurrentMedianDurationMs: ($pairedCurrentDurations | median),
+ pairedBaselineMedianDurationMs: ($pairedBaselineDurations | median),
+ pairedDeltaMedianDurationMs: $pairedDeltaMedian,
+ pairedDeltaMinDurationMs: ($pairedDeltaDurations | min),
+ pairedDeltaMaxDurationMs: ($pairedDeltaDurations | max),
+ pairedDeltaP25DurationMs: ($pairedDeltaDurations | percentile(0.25)),
+ pairedDeltaP75DurationMs: ($pairedDeltaDurations | percentile(0.75)),
+ pairedDeltaMadDurationMs: (
+ if $pairedDeltaMedian == null then null
+ else ($pairedDeltaDurations | map(. - $pairedDeltaMedian | if . < 0 then -. else . end) | median)
+ end
+ )
},
samples:$sampleList
}' \
@@ -625,6 +654,8 @@ measure() {
printf '[' >"$samples_file"
local sample_first=1
local sample_index measured_index total_repetitions phase sample_stdout sample_stderr sample_trace expanded
+ local order_offset
+ order_offset="$(printf '%s' "$CI_MEASUREMENT_ORDER_SEED:$id" | cksum | awk '{ print $1 % 2 }')"
total_repetitions=$((warmup_repetitions + repetitions))
for sample_index in $(seq 1 "$total_repetitions"); do
if [ "$sample_index" -le "$warmup_repetitions" ]; then
@@ -667,7 +698,7 @@ measure() {
done
local base_ran_before_head=0 base_stdout base_stderr base_started base_ended base_status base_duration_ms
- if [ "$phase" = "measured" ] && [ "$CI_MEASUREMENT_PAIRED_ENABLED" -eq 1 ] && [ $((measured_index % 2)) -eq 0 ]; then
+ if [ "$phase" = "measured" ] && [ "$CI_MEASUREMENT_PAIRED_ENABLED" -eq 1 ] && [ $(((measured_index + order_offset) % 2)) -eq 0 ]; then
base_ran_before_head=1
base_stdout="$ARTIFACT_DIR/$id.$sample_index.base.stdout"
base_stderr="$ARTIFACT_DIR/$id.$sample_index.base.stderr"
@@ -690,7 +721,8 @@ measure() {
--argjson durationMs "$base_duration_ms" \
--arg stdout "$base_stdout" \
--arg stderr "$base_stderr" \
- '{index:$index,measuredIndex:($measuredIndex | tonumber),pairIndex:($measuredIndex | tonumber),subject:"base",phase:"measured",status:$status,durationMs:$durationMs,stdout:$stdout,stderr:$stderr,trace:null,order:"base-head"}' \
+ --arg orderSeed "$CI_MEASUREMENT_ORDER_SEED" \
+ '{index:$index,measuredIndex:($measuredIndex | tonumber),pairIndex:($measuredIndex | tonumber),subject:"base",phase:"measured",status:$status,durationMs:$durationMs,stdout:$stdout,stderr:$stderr,trace:null,order:"base-head",orderSeed:$orderSeed}' \
>>"$samples_file"
if [ "$base_status" -ne 0 ]; then
@@ -721,7 +753,8 @@ measure() {
--arg stderr "$sample_stderr" \
--arg trace "$sample_trace" \
--arg order "$(if [ "$phase" = "measured" ] && [ "$base_ran_before_head" -eq 1 ]; then printf base-head; else printf head-base; fi)" \
- '{index:$index,measuredIndex:(if $measuredIndex == "" then null else ($measuredIndex | tonumber) end),pairIndex:(if $measuredIndex == "" then null else ($measuredIndex | tonumber) end),subject:"head",phase:$phase,status:$status,durationMs:$durationMs,stdout:$stdout,stderr:$stderr,trace:(if $trace == "" then null else $trace end),order:(if $phase == "measured" then $order else null end)}' \
+ --arg orderSeed "$CI_MEASUREMENT_ORDER_SEED" \
+ '{index:$index,measuredIndex:(if $measuredIndex == "" then null else ($measuredIndex | tonumber) end),pairIndex:(if $measuredIndex == "" then null else ($measuredIndex | tonumber) end),subject:"head",phase:$phase,status:$status,durationMs:$durationMs,stdout:$stdout,stderr:$stderr,trace:(if $trace == "" then null else $trace end),order:(if $phase == "measured" then $order else null end),orderSeed:(if $phase == "measured" then $orderSeed else null end)}' \
>>"$samples_file"
if [ "$phase" = "measured" ] && [ "$status" -eq 0 ] && [ "$CI_MEASUREMENT_PAIRED_ENABLED" -eq 1 ] && [ "$base_ran_before_head" -eq 0 ]; then
@@ -743,7 +776,8 @@ measure() {
--argjson durationMs "$base_duration_ms" \
--arg stdout "$base_stdout" \
--arg stderr "$base_stderr" \
- '{index:$index,measuredIndex:($measuredIndex | tonumber),pairIndex:($measuredIndex | tonumber),subject:"base",phase:"measured",status:$status,durationMs:$durationMs,stdout:$stdout,stderr:$stderr,trace:null,order:"head-base"}' \
+ --arg orderSeed "$CI_MEASUREMENT_ORDER_SEED" \
+ '{index:$index,measuredIndex:($measuredIndex | tonumber),pairIndex:($measuredIndex | tonumber),subject:"base",phase:"measured",status:$status,durationMs:$durationMs,stdout:$stdout,stderr:$stderr,trace:null,order:"head-base",orderSeed:$orderSeed}' \
>>"$samples_file"
if [ "$base_status" -ne 0 ]; then
@@ -833,6 +867,7 @@ jq -n \
--arg traceId "${dollar}{TRACE_ID:-}" \
--arg devenvRev "${dollar}{DEVENV_REV:-unknown}" \
--arg otelServiceName "${dollar}{OTEL_SERVICE_NAME:-unknown}" \
+ --arg orderSeed "$CI_MEASUREMENT_ORDER_SEED" \
--arg targetSystem "${dollar}{DEVENV_SYSTEM:-${dollar}{RUNNER_OS:-unknown}}" \
'{
schemaVersion: $schemaVersion,
@@ -910,6 +945,36 @@ jq -n \
then null
else (.statistics.pairedDeltaMedianDurationMs / 1000)
end
+ ),
+ pairedDeltaMin: (
+ if (.statistics.pairedDeltaMinDurationMs // null) == null
+ then null
+ else (.statistics.pairedDeltaMinDurationMs / 1000)
+ end
+ ),
+ pairedDeltaMax: (
+ if (.statistics.pairedDeltaMaxDurationMs // null) == null
+ then null
+ else (.statistics.pairedDeltaMaxDurationMs / 1000)
+ end
+ ),
+ pairedDeltaP25: (
+ if (.statistics.pairedDeltaP25DurationMs // null) == null
+ then null
+ else (.statistics.pairedDeltaP25DurationMs / 1000)
+ end
+ ),
+ pairedDeltaP75: (
+ if (.statistics.pairedDeltaP75DurationMs // null) == null
+ then null
+ else (.statistics.pairedDeltaP75DurationMs / 1000)
+ end
+ ),
+ pairedDeltaMad: (
+ if (.statistics.pairedDeltaMadDurationMs // null) == null
+ then null
+ else (.statistics.pairedDeltaMadDurationMs / 1000)
+ end
)
},
dimensions: {
@@ -922,7 +987,13 @@ jq -n \
pairedSampleCount: (.statistics.pairedSampleCount // 0),
pairedOrderProtocol: (
if (.statistics.pairedSampleCount // 0) > 0
- then "alternating-head-base"
+ then "balanced-seeded-alternating-v1"
+ else null
+ end
+ ),
+ pairedOrderSeed: (
+ if (.statistics.pairedSampleCount // 0) > 0
+ then $orderSeed
else null
end
),
@@ -1711,6 +1782,10 @@ jq -n \
def observation_stats($items):
($items | map(.observation.value)) as $values
| ($items | map(.observation.comparison.baseline // empty)) as $pairedBaselineValues
+ | ($items | map(.observation.statistics.pairedDeltaMedian // empty)) as $pairedDeltaMedianValues
+ | ($items | map(.observation.statistics.pairedDeltaP25 // empty)) as $pairedDeltaP25Values
+ | ($items | map(.observation.statistics.pairedDeltaP75 // empty)) as $pairedDeltaP75Values
+ | ($items | map(.observation.statistics.pairedDeltaMad // empty)) as $pairedDeltaMadValues
| ($items | map(.observation.statistics.measuredSampleCount // .observation.statistics.sampleCount // 1) | add // ($items | length)) as $sampleCount
| ($values | median) as $median
| {
@@ -1728,6 +1803,10 @@ jq -n \
sampleCount: $sampleCount,
pairedSampleCount: ($items | map(.observation.statistics.pairedSampleCount // .observation.comparison.pairedSampleCount // 0) | add // 0),
pairedBaselineValue: (if ($pairedBaselineValues | length) == 0 then null else ($pairedBaselineValues | median) end),
+ pairedDeltaMedianValue: (if ($pairedDeltaMedianValues | length) == 0 then null else ($pairedDeltaMedianValues | median) end),
+ pairedDeltaP25Value: (if ($pairedDeltaP25Values | length) == 0 then null else ($pairedDeltaP25Values | median) end),
+ pairedDeltaP75Value: (if ($pairedDeltaP75Values | length) == 0 then null else ($pairedDeltaP75Values | median) end),
+ pairedDeltaMadValue: (if ($pairedDeltaMadValues | length) == 0 then null else ($pairedDeltaMadValues | median) end),
generatedAt: ($items[-1].generatedAt // null)
};
@@ -1766,14 +1845,16 @@ jq -n \
def policy_enabled($policy):
if ($policy | has("enabled")) then $policy.enabled else true end;
- def classify($metric; $unit; $measurementKind; $policy; $current; $currentP25; $currentP75; $currentMad; $baseline; $baselineMin; $baselineMax; $baselineP25; $baselineP75; $baselineP95; $baselineMad; $currentSamples; $baselineSources; $pairedSamples):
+ def classify($metric; $unit; $measurementKind; $policy; $current; $currentP25; $currentP75; $currentMad; $baseline; $baselineMin; $baselineMax; $baselineP25; $baselineP75; $baselineP95; $baselineMad; $currentSamples; $baselineSources; $pairedSamples; $pairedDeltaMedian; $pairedDeltaP25; $pairedDeltaP75; $pairedDeltaMad):
$policy as $b
| ($policy.comparisonMode // (if $measurementKind == "deterministic" or $unit != "seconds" then "budget" elif $measurementKind == "diagnostic" then "diagnostic" else "historical" end)) as $comparisonMode
| ($policy.noiseFloor // noise_floor($metric; $unit)) as $noise
| ($current - $baseline) as $delta
+ | (if $comparisonMode == "paired" and $pairedDeltaMedian != null then $pairedDeltaMedian else $delta end) as $evidenceDelta
| (if $baseline > 0 then ($current / $baseline) else null end) as $ratio
| (($baselineP75 // $baseline) - ($baselineP25 // $baseline)) as $iqr
| (($currentP75 // $current) - ($currentP25 // $current)) as $currentIqr
+ | (($pairedDeltaP75 // $evidenceDelta) - ($pairedDeltaP25 // $evidenceDelta)) as $pairedDeltaIqr
| ([
$noise,
(($policy.statisticalToleranceAbs // 0) | tonumber),
@@ -1788,10 +1869,19 @@ jq -n \
(($currentMad // 0) * 3),
(($currentIqr // 0) * 1.5)
] | max) else 0 end) as $currentRobustTolerance
+ | ([
+ $noise,
+ (($policy.statisticalToleranceAbs // 0) | tonumber),
+ (if $baseline > 0 then ($baseline * (($policy.statisticalToleranceRatio // 0) | tonumber)) else 0 end),
+ (($pairedDeltaMad // 0) * 3),
+ (($pairedDeltaIqr // 0) * 1.5)
+ ] | max) as $pairedDeltaTolerance
| ($baseline + $robustTolerance) as $robustUpper
| ($baseline - $robustTolerance) as $robustLower
| ($current + $currentRobustTolerance) as $currentRobustUpper
| ($current - $currentRobustTolerance) as $currentRobustLower
+ | ($evidenceDelta + $pairedDeltaTolerance) as $evidenceDeltaUpper
+ | ($evidenceDelta - $pairedDeltaTolerance) as $evidenceDeltaLower
| ([($b.warnAbs // 0), (if $baseline > 0 then ($baseline * (($b.warnRatio // 1) - 1)) else 0 end), $noise, 0.000000001] | max) as $warnBudget
| ([($b.failAbs // 0), (if $baseline > 0 then ($baseline * (($b.failRatio // 1) - 1)) else 0 end), $noise, 0.000000001] | max) as $failBudget
| (
@@ -1806,6 +1896,9 @@ jq -n \
) as $withinBaselineRange
| (
if $baseline <= 0 then "unknown"
+ elif $comparisonMode == "paired" and $evidenceDeltaLower > $failBudget then "fail"
+ elif $comparisonMode == "paired" and $evidenceDeltaLower > $warnBudget then "warn"
+ elif $comparisonMode == "paired" then "pass"
elif ($delta > $b.failAbs and $current > ($baseline * $b.failRatio)) then "fail"
elif ($delta > $b.warnAbs and $current > ($baseline * $b.warnRatio)) then "warn"
else "pass"
@@ -1817,6 +1910,7 @@ jq -n \
and $baselineSources >= ($policy.minBaselineSources // 1)
and $currentSamples >= ($policy.minCurrentSamples // 1)
and (if $comparisonMode == "paired" then $pairedSamples >= ($policy.minPairedSamples // 1) else true end)
+ and (if $comparisonMode == "paired" then $pairedDeltaMedian != null else true end)
) as $gateable
| (
if (policy_enabled($policy) != true) then "disabled"
@@ -1824,6 +1918,7 @@ jq -n \
elif $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count"
elif $comparisonMode == "paired" and $pairedSamples < ($policy.minPairedSamples // 1) then "low_paired_sample_count"
+ elif $comparisonMode == "paired" and $pairedDeltaMedian == null then "missing_paired_delta"
else "eligible"
end
) as $gateReason
@@ -1834,6 +1929,8 @@ jq -n \
elif $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count"
elif $comparisonMode == "paired" and $pairedSamples < ($policy.minPairedSamples // 1) then "low_paired_sample_count"
+ elif $comparisonMode == "paired" and $pairedDeltaMedian == null then "missing_paired_delta"
+ elif $comparisonMode == "paired" and $thresholdStatus == "pass" and $evidenceDelta > $warnBudget then "paired_uncertain"
elif ($comparisonMode == "historical" and $thresholdStatus != "pass" and $withinRobustBand) then "within_robust_band"
elif $thresholdStatus == "pass" then "within_budget"
else "threshold_exceeded"
@@ -1847,6 +1944,10 @@ jq -n \
) as $status
| (
if $baseline <= 0 then "unknown"
+ elif $comparisonMode == "paired" and ($evidenceDelta | abs_value) <= $noise then "unchanged"
+ elif $comparisonMode == "paired" and $evidenceDeltaLower <= 0 and $evidenceDeltaUpper >= 0 then "unchanged"
+ elif $comparisonMode == "paired" and $evidenceDelta < 0 then "improved"
+ elif $comparisonMode == "paired" then "regressed"
elif ($delta | abs_value) <= $noise then "unchanged"
elif $withinRobustBand then "unchanged"
elif $delta < 0 then "improved"
@@ -1856,6 +1957,10 @@ jq -n \
| (
if $baseline <= 0 then null
elif (policy_enabled($policy) != true) then null
+ elif $comparisonMode == "paired" and ($evidenceDeltaLower <= 0 and $evidenceDeltaUpper >= 0) then 0
+ elif $comparisonMode == "paired" and ($evidenceDelta | abs_value) <= $noise then 0
+ elif $comparisonMode == "paired" and $evidenceDelta > 0 then ([0, $evidenceDeltaLower] | max) / $warnBudget
+ elif $comparisonMode == "paired" then -(([0, (-$evidenceDeltaUpper)] | max) / $warnBudget)
elif $withinRobustBand then 0
elif ($delta | abs_value) <= $noise then 0
elif ($confidence == "threshold_exceeded" and $delta > 0) then ([0, ($currentRobustLower - $robustUpper), $delta] | max) / $warnBudget
@@ -1874,7 +1979,7 @@ jq -n \
else "improvement"
end
) as $semanticImpactKind
- | {status:$status,current:$current,baseline:$baseline,delta:$delta,ratio:$ratio,budget:$b,gatePolicy:$policy,comparisonMode:$comparisonMode,gateable:$gateable,gateReason:$gateReason,confidence:$confidence,direction:$direction,semanticImpactScore:$semanticImpactScore,semanticImpactKind:$semanticImpactKind,semanticWarnBudget:$warnBudget,semanticFailBudget:$failBudget,baselineRobustLower:$robustLower,baselineRobustUpper:$robustUpper,baselineRobustTolerance:$robustTolerance,currentRobustLower:$currentRobustLower,currentRobustUpper:$currentRobustUpper,currentRobustTolerance:$currentRobustTolerance,withinBaselineRange:$withinBaselineRange,pairedSamples:$pairedSamples};
+ | {status:$status,current:$current,baseline:$baseline,delta:$delta,ratio:$ratio,budget:$b,gatePolicy:$policy,comparisonMode:$comparisonMode,gateable:$gateable,gateReason:$gateReason,confidence:$confidence,direction:$direction,semanticImpactScore:$semanticImpactScore,semanticImpactKind:$semanticImpactKind,semanticWarnBudget:$warnBudget,semanticFailBudget:$failBudget,baselineRobustLower:$robustLower,baselineRobustUpper:$robustUpper,baselineRobustTolerance:$robustTolerance,currentRobustLower:$currentRobustLower,currentRobustUpper:$currentRobustUpper,currentRobustTolerance:$currentRobustTolerance,withinBaselineRange:$withinBaselineRange,pairedSamples:$pairedSamples,evidenceDelta:$evidenceDelta,evidenceDeltaLower:$evidenceDeltaLower,evidenceDeltaUpper:$evidenceDeltaUpper,evidenceDeltaTolerance:$pairedDeltaTolerance};
(observations_by_key($current[0]) | with_entries(.value = observation_stats(.value))) as $currentObs
| (observations_by_key($baseline[0]) | with_entries(.value = observation_stats(.value))) as $baselineObs
@@ -1935,7 +2040,11 @@ jq -n \
$effectiveBaselineValue.mad;
$currentValue.sampleCount;
$effectiveBaselineValue.sourceCount;
- $currentValue.pairedSampleCount
+ $currentValue.pairedSampleCount;
+ $currentValue.pairedDeltaMedianValue;
+ $currentValue.pairedDeltaP25Value;
+ $currentValue.pairedDeltaP75Value;
+ $currentValue.pairedDeltaMadValue
) + {
target: $currentValue.target,
observation: $currentValue.observation,
@@ -1962,7 +2071,8 @@ jq -n \
and (.gateReason == "missing_baseline"
or .gateReason == "low_baseline_count"
or .gateReason == "low_current_sample_count"
- or .gateReason == "low_paired_sample_count")
+ or .gateReason == "low_paired_sample_count"
+ or .gateReason == "missing_paired_delta")
) then "partial"
else "pass"
end
@@ -1975,7 +2085,8 @@ jq -n \
missingBaselineCount: (map(select(.gateReason == "missing_baseline")) | length),
lowBaselineCount: (map(select(.gateReason == "low_baseline_count")) | length),
lowCurrentSampleCount: (map(select(.gateReason == "low_current_sample_count")) | length),
- lowPairedSampleCount: (map(select(.gateReason == "low_paired_sample_count")) | length)
+ lowPairedSampleCount: (map(select(.gateReason == "low_paired_sample_count")) | length),
+ missingPairedDeltaCount: (map(select(.gateReason == "missing_paired_delta")) | length)
}
| . + {
nonGateableCount: (.enabledCount - .gateableCount),
@@ -2276,6 +2387,18 @@ const interpretation = (row) => {
tone: 'neutral',
color: '#94a3b8',
}
+ if (row.confidence === 'missing_paired_delta') return {
+ label: 'Needs paired delta stats',
+ detail: 'Wall-clock gates require per-pair delta statistics, not only paired medians.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
+ if (row.confidence === 'paired_uncertain') return {
+ label: 'Uncertain wall-clock movement',
+ detail: 'The paired median moved, but the paired delta band still crosses the configured budget.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
if (row.confidence === 'diagnostic') return {
label: 'Diagnostic only',
detail: 'Shown for investigation, but intentionally excluded from gating.',
From 2d000f0da45142e4f74307250155425703b463ca Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Tue, 19 May 2026 08:13:18 +0200
Subject: [PATCH 42/81] Keep deterministic measurement gates point-budget based
---
.github/workflows/ci.yml | 14 ++++----
context/ci-measurements.md | 24 ++++++++-----
.../ci-measurement-comparison.test.sh | 36 +++++++++++++++++++
genie/ci-workflow/measurements.ts | 7 ++--
4 files changed, 64 insertions(+), 17 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 766b11662..1f45d3425 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -3430,6 +3430,7 @@ jobs:
($current >= $robustLower and $current <= $robustUpper)
or ($currentRobustTolerance > 0 and $currentRobustLower <= $robustUpper and $currentRobustUpper >= $robustLower)
) as $withinRobustBand
+ | ($comparisonMode == "historical" and $measurementKind != "deterministic") as $canUseRobustBandSuppression
| (
$baselineMin != null
and $baselineMax != null
@@ -3473,7 +3474,7 @@ jobs:
elif $comparisonMode == "paired" and $pairedSamples < ($policy.minPairedSamples // 1) then "low_paired_sample_count"
elif $comparisonMode == "paired" and $pairedDeltaMedian == null then "missing_paired_delta"
elif $comparisonMode == "paired" and $thresholdStatus == "pass" and $evidenceDelta > $warnBudget then "paired_uncertain"
- elif ($comparisonMode == "historical" and $thresholdStatus != "pass" and $withinRobustBand) then "within_robust_band"
+ elif ($canUseRobustBandSuppression and $thresholdStatus != "pass" and $withinRobustBand) then "within_robust_band"
elif $thresholdStatus == "pass" then "within_budget"
else "threshold_exceeded"
end
@@ -3491,7 +3492,7 @@ jobs:
elif $comparisonMode == "paired" and $evidenceDelta < 0 then "improved"
elif $comparisonMode == "paired" then "regressed"
elif ($delta | abs_value) <= $noise then "unchanged"
- elif $withinRobustBand then "unchanged"
+ elif $canUseRobustBandSuppression and $withinRobustBand then "unchanged"
elif $delta < 0 then "improved"
else "regressed"
end
@@ -3503,7 +3504,7 @@ jobs:
elif $comparisonMode == "paired" and ($evidenceDelta | abs_value) <= $noise then 0
elif $comparisonMode == "paired" and $evidenceDelta > 0 then ([0, $evidenceDeltaLower] | max) / $warnBudget
elif $comparisonMode == "paired" then -(([0, (-$evidenceDeltaUpper)] | max) / $warnBudget)
- elif $withinRobustBand then 0
+ elif $canUseRobustBandSuppression and $withinRobustBand then 0
elif ($delta | abs_value) <= $noise then 0
elif ($confidence == "threshold_exceeded" and $delta > 0) then ([0, ($currentRobustLower - $robustUpper), $delta] | max) / $warnBudget
elif ($confidence == "threshold_exceeded" and $delta < 0) then -(([0, ($robustLower - $currentRobustUpper), (-$delta)] | max) / $warnBudget)
@@ -5164,6 +5165,7 @@ jobs:
($current >= $robustLower and $current <= $robustUpper)
or ($currentRobustTolerance > 0 and $currentRobustLower <= $robustUpper and $currentRobustUpper >= $robustLower)
) as $withinRobustBand
+ | ($comparisonMode == "historical" and $measurementKind != "deterministic") as $canUseRobustBandSuppression
| (
$baselineMin != null
and $baselineMax != null
@@ -5207,7 +5209,7 @@ jobs:
elif $comparisonMode == "paired" and $pairedSamples < ($policy.minPairedSamples // 1) then "low_paired_sample_count"
elif $comparisonMode == "paired" and $pairedDeltaMedian == null then "missing_paired_delta"
elif $comparisonMode == "paired" and $thresholdStatus == "pass" and $evidenceDelta > $warnBudget then "paired_uncertain"
- elif ($comparisonMode == "historical" and $thresholdStatus != "pass" and $withinRobustBand) then "within_robust_band"
+ elif ($canUseRobustBandSuppression and $thresholdStatus != "pass" and $withinRobustBand) then "within_robust_band"
elif $thresholdStatus == "pass" then "within_budget"
else "threshold_exceeded"
end
@@ -5225,7 +5227,7 @@ jobs:
elif $comparisonMode == "paired" and $evidenceDelta < 0 then "improved"
elif $comparisonMode == "paired" then "regressed"
elif ($delta | abs_value) <= $noise then "unchanged"
- elif $withinRobustBand then "unchanged"
+ elif $canUseRobustBandSuppression and $withinRobustBand then "unchanged"
elif $delta < 0 then "improved"
else "regressed"
end
@@ -5237,7 +5239,7 @@ jobs:
elif $comparisonMode == "paired" and ($evidenceDelta | abs_value) <= $noise then 0
elif $comparisonMode == "paired" and $evidenceDelta > 0 then ([0, $evidenceDeltaLower] | max) / $warnBudget
elif $comparisonMode == "paired" then -(([0, (-$evidenceDeltaUpper)] | max) / $warnBudget)
- elif $withinRobustBand then 0
+ elif $canUseRobustBandSuppression and $withinRobustBand then 0
elif ($delta | abs_value) <= $noise then 0
elif ($confidence == "threshold_exceeded" and $delta > 0) then ([0, ($currentRobustLower - $robustUpper), $delta] | max) / $warnBudget
elif ($confidence == "threshold_exceeded" and $delta < 0) then -(([0, ($robustLower - $currentRobustUpper), (-$delta)] | max) / $warnBudget)
diff --git a/context/ci-measurements.md b/context/ci-measurements.md
index 34d6eaf93..ca99d5521 100644
--- a/context/ci-measurements.md
+++ b/context/ci-measurements.md
@@ -42,7 +42,10 @@ numeric `value`, `unit`, `measurementKind`, and a gate `policy`.
Deterministic observations use `comparisonMode: "budget"`.
They require a comparable baseline and then evaluate configured absolute and
-relative budgets. Historical variance is not treated as statistical evidence.
+relative budgets. Historical variance is context only; it does not neutralize
+a budget-exceeding deterministic movement. This keeps Nix closure sizes,
+source-shape counts, lines of code, complexity scores, and similar structural
+measurements separate from wall-clock noise handling.
Wall-clock observations use `comparisonMode: "paired"` for enforced gates.
They need same-run base/head evidence before they can block a merge. Historical
@@ -64,7 +67,9 @@ evidence for a regression.
Historical wall-clock comparison may be used as an advisory transition mode.
It can warn, visualize trends, and guide investigation, but it must not be the
-required merge gate for noisy runner-dependent timings.
+required merge gate for noisy runner-dependent timings. Robust baseline/current
+bands may suppress historical wall-clock noise; they are not applied as a
+semantic escape hatch for deterministic budget rows.
Diagnostic observations set `enabled: false` or `measurementKind: "diagnostic"`.
They appear in reports, but their impact is rendered as `diagnostic` and they
@@ -82,7 +87,9 @@ probe execution
The artifact is the source of truth. OTEL traces and host context are evidence
attachments, not the canonical numeric store. PR comments are projections of
-the artifact and can be regenerated.
+the artifact and can be regenerated. New measurement families should add
+producer adapters that emit this artifact contract; comparison, policy
+evaluation, charting, and comment rendering stay shared.
## Wall-Clock Soundness
@@ -106,11 +113,12 @@ the row is partial/advisory even if the historical raw delta is large.
Nix closure size, source shape, code complexity, lines of code, and file counts
are deterministic or near-deterministic structural measurements. They are not
-wall-clock performance probes and must not use paired timing statistics. They
-should use explicit budgets and semantic buckets. A closure-size regression is
-actionable because the same installable and lock graph should produce a stable
-closure. Source-shape or complexity growth is an architecture signal and should
-remain advisory unless a repo defines an explicit owner-approved budget.
+wall-clock performance probes and must not use paired timing statistics or
+historical timing-style robust-band suppression. They should use explicit
+budgets and semantic buckets. A closure-size regression is actionable because
+the same installable and lock graph should produce a stable closure.
+Source-shape or complexity growth is an architecture signal and should remain
+advisory unless a repo defines an explicit owner-approved budget.
## Visualization
diff --git a/genie/ci-scripts/ci-measurement-comparison.test.sh b/genie/ci-scripts/ci-measurement-comparison.test.sh
index 335cd8c1f..569d09a6d 100755
--- a/genie/ci-scripts/ci-measurement-comparison.test.sh
+++ b/genie/ci-scripts/ci-measurement-comparison.test.sh
@@ -210,6 +210,42 @@ if [ "$actual_status" != "pass" ] || [ "$actual_row" != "pass" ] || [ "$actual_c
exit 1
fi
+deterministic_policy='{"enabled":true,"comparisonMode":"budget","minBaselineSources":1,"minCurrentSamples":1,"warnRatio":1.01,"failRatio":1.02,"warnAbs":10,"failAbs":20,"noiseFloor":0,"statisticalToleranceAbs":1000,"statisticalToleranceRatio":0}'
+rm -rf "$tmp_dir/current" "$tmp_dir/baseline"
+write_measurement "$tmp_dir/current/measurements.json" 130 devenv-perf-warm-median-v2 "$deterministic_policy"
+jq '.observations[0].id = "nix.closure.nar_size"
+ | .observations[0].name = "nix.closure.nar_size"
+ | .observations[0].label = "Nix closure size"
+ | .observations[0].unit = "bytes"
+ | .observations[0].measurementKind = "deterministic"
+ | .observations[0].statistics.sampleCount = 1
+ | .observations[0].statistics.measuredSampleCount = 1
+ | .observations[0].statistics.successfulSampleCount = 1' \
+ "$tmp_dir/current/measurements.json" >"$tmp_dir/current/measurements.updated.json"
+mv "$tmp_dir/current/measurements.updated.json" "$tmp_dir/current/measurements.json"
+write_measurement "$tmp_dir/baseline/run-1/measurements.json" 100 devenv-perf-warm-median-v2 "$deterministic_policy"
+jq '.observations[0].id = "nix.closure.nar_size"
+ | .observations[0].name = "nix.closure.nar_size"
+ | .observations[0].label = "Nix closure size"
+ | .observations[0].unit = "bytes"
+ | .observations[0].measurementKind = "deterministic"
+ | .observations[0].statistics.sampleCount = 1
+ | .observations[0].statistics.measuredSampleCount = 1
+ | .observations[0].statistics.successfulSampleCount = 1' \
+ "$tmp_dir/baseline/run-1/measurements.json" >"$tmp_dir/baseline/run-1/measurements.updated.json"
+mv "$tmp_dir/baseline/run-1/measurements.updated.json" "$tmp_dir/baseline/run-1/measurements.json"
+run_compare
+actual_status="$(jq -r '.status' "$tmp_dir/comparison.json")"
+actual_row="$(jq -r '.comparisons[] | .status' "$tmp_dir/comparison.json")"
+actual_confidence="$(jq -r '.comparisons[] | .confidence' "$tmp_dir/comparison.json")"
+actual_impact="$(jq -r '.comparisons[] | .semanticImpactScore' "$tmp_dir/comparison.json")"
+actual_impact_kind="$(jq -r '.comparisons[] | .semanticImpactKind' "$tmp_dir/comparison.json")"
+actual_within_band="$(jq -r '.comparisons[] | .withinBaselineRange' "$tmp_dir/comparison.json")"
+if [ "$actual_status" != "fail" ] || [ "$actual_row" != "fail" ] || [ "$actual_confidence" != "threshold_exceeded" ] || [ "$actual_impact_kind" != "fail_boundary" ] || ! awk "BEGIN { exit !($actual_impact > 1) }"; then
+ echo "expected deterministic budget regression to fail even when robust tolerance is wide; got status=$actual_status row=$actual_row confidence=$actual_confidence impact=$actual_impact kind=$actual_impact_kind withinBaselineRange=$actual_within_band" >&2
+ exit 1
+fi
+
low_baseline_policy='{"enabled":true,"minBaselineSources":2,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":0.5,"noiseFloor":0.1}'
rm -rf "$tmp_dir/current" "$tmp_dir/baseline"
write_measurement "$tmp_dir/current/measurements.json" 10.5 devenv-perf-warm-median-v2 "$low_baseline_policy"
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index de7784f7b..97336dca3 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -1888,6 +1888,7 @@ jq -n \
($current >= $robustLower and $current <= $robustUpper)
or ($currentRobustTolerance > 0 and $currentRobustLower <= $robustUpper and $currentRobustUpper >= $robustLower)
) as $withinRobustBand
+ | ($comparisonMode == "historical" and $measurementKind != "deterministic") as $canUseRobustBandSuppression
| (
$baselineMin != null
and $baselineMax != null
@@ -1931,7 +1932,7 @@ jq -n \
elif $comparisonMode == "paired" and $pairedSamples < ($policy.minPairedSamples // 1) then "low_paired_sample_count"
elif $comparisonMode == "paired" and $pairedDeltaMedian == null then "missing_paired_delta"
elif $comparisonMode == "paired" and $thresholdStatus == "pass" and $evidenceDelta > $warnBudget then "paired_uncertain"
- elif ($comparisonMode == "historical" and $thresholdStatus != "pass" and $withinRobustBand) then "within_robust_band"
+ elif ($canUseRobustBandSuppression and $thresholdStatus != "pass" and $withinRobustBand) then "within_robust_band"
elif $thresholdStatus == "pass" then "within_budget"
else "threshold_exceeded"
end
@@ -1949,7 +1950,7 @@ jq -n \
elif $comparisonMode == "paired" and $evidenceDelta < 0 then "improved"
elif $comparisonMode == "paired" then "regressed"
elif ($delta | abs_value) <= $noise then "unchanged"
- elif $withinRobustBand then "unchanged"
+ elif $canUseRobustBandSuppression and $withinRobustBand then "unchanged"
elif $delta < 0 then "improved"
else "regressed"
end
@@ -1961,7 +1962,7 @@ jq -n \
elif $comparisonMode == "paired" and ($evidenceDelta | abs_value) <= $noise then 0
elif $comparisonMode == "paired" and $evidenceDelta > 0 then ([0, $evidenceDeltaLower] | max) / $warnBudget
elif $comparisonMode == "paired" then -(([0, (-$evidenceDeltaUpper)] | max) / $warnBudget)
- elif $withinRobustBand then 0
+ elif $canUseRobustBandSuppression and $withinRobustBand then 0
elif ($delta | abs_value) <= $noise then 0
elif ($confidence == "threshold_exceeded" and $delta > 0) then ([0, ($currentRobustLower - $robustUpper), $delta] | max) / $warnBudget
elif ($confidence == "threshold_exceeded" and $delta < 0) then -(([0, ($robustLower - $currentRobustUpper), (-$delta)] | max) / $warnBudget)
From 317dbd024896bf4daef902ed9b789d77cfcf16fa Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Tue, 19 May 2026 08:22:07 +0200
Subject: [PATCH 43/81] Update measurement gate workflow assertion
---
.../runtime/github-workflow/ci-workflow-helpers.unit.test.ts | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
index ce49cc955..237db0a18 100644
--- a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
+++ b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
@@ -431,7 +431,7 @@ describe('ci workflow devenv perf helpers', () => {
'if ($gateable and $confidence == "threshold_exceeded") then $thresholdStatus',
)
expect(generatedCiWorkflowYamlSource).toContain(
- 'elif ($comparisonMode == "historical" and $thresholdStatus != "pass" and $withinRobustBand) then "within_robust_band"',
+ 'elif ($canUseRobustBandSuppression and $thresholdStatus != "pass" and $withinRobustBand) then "within_robust_band"',
)
expect(ciWorkflowSource).toContain("label: 'Needs more baseline'")
expect(ciWorkflowSource).toContain("label: 'Needs repeat'")
From d62fb133a6887dda7ae5c56a2f2ebab3b53a3943 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Tue, 19 May 2026 08:27:55 +0200
Subject: [PATCH 44/81] Document typed CI measurement architecture
---
context/ci-measurements.md | 83 ++++++++++++++++++++++++++++++++++++++
1 file changed, 83 insertions(+)
diff --git a/context/ci-measurements.md b/context/ci-measurements.md
index ca99d5521..5f8017aba 100644
--- a/context/ci-measurements.md
+++ b/context/ci-measurements.md
@@ -17,6 +17,20 @@ Active.
The class is part of the observation contract through `measurementKind`.
The comparison policy is part of the gate contract through `comparisonMode`.
+`measurementKind` defines the physical meaning of the number. `comparisonMode`
+defines how the number is compared. A producer may only combine them when the
+semantics match:
+
+| `measurementKind` | Gateable `comparisonMode` | Baseline Meaning | Uncertainty Model |
+| ----------------- | ------------------------- | --------------------------------------------- | ----------------------------------------- |
+| `deterministic` | `budget` | Same target on a comparable ref | None by default; exact value plus budget. |
+| `wall-clock` | `paired` | Same PR run, same runner, base/head pairs | Per-pair delta band. |
+| `wall-clock` | `historical` | Previous comparable successful artifacts | Advisory robust bands only. |
+| `diagnostic` | none | Optional context artifact or trace attachment | Not gateable. |
+
+Historical comparison is not a substitute for paired wall-clock evidence.
+Budget comparison still requires owner-approved semantic budget values.
+
## Observation Contract
Every observation has a stable `id`, human `label`, semantic `group`/`path`,
@@ -38,6 +52,26 @@ numeric `value`, `unit`, `measurementKind`, and a gate `policy`.
}
```
+Observation IDs are public API. They should be stable, dotted names whose
+prefix names the domain and whose suffix names the measured quantity, for
+example `devenv.shell_eval_warm.duration`, `nix.closure.nar_size`, or
+`source.lines`. Labels are review UI, not identity. Paths and groups may change
+to improve hierarchy, but IDs should only change when the measurement protocol
+or semantic target changes.
+
+New measurement producers should emit the shared artifact format directly:
+
+```text
+producer adapter
+ -> typed observation(s)
+ -> shared comparison policy
+ -> shared report/comment/SVG projection
+```
+
+This keeps probe-specific collection code separate from the reusable regression
+system. A new probe should not fork comparison, markdown rendering, or asset
+publication logic.
+
## Gate Semantics
Deterministic observations use `comparisonMode: "budget"`.
@@ -120,6 +154,55 @@ the same installable and lock graph should produce a stable closure.
Source-shape or complexity growth is an architecture signal and should remain
advisory unless a repo defines an explicit owner-approved budget.
+Deterministic budgets should prefer absolute units when the user impact is
+absolute, such as bytes or path counts, and relative thresholds when scale is
+the meaningful signal. A deterministic row may show historical values for
+review context, but the pass/fail decision is the budget decision.
+
+## Policy Lifecycle
+
+Each observation should move through explicit policy stages:
+
+| Stage | Use Case | Merge Behavior |
+| ------------ | --------------------------------------------- | --------------------------------------------------- |
+| `diagnostic` | New metric, trace attachment, host context | Render only. |
+| `advisory` | Historical trend before calibration is mature | Comment and warn, but do not block merge. |
+| `gateable` | Calibrated wall-clock or deterministic budget | Block only when the measurement class proves it. |
+| `required` | Stable semantic invariant | Repo branch protection may depend on the gate name. |
+
+Wall-clock probes should start as advisory and stay there until paired evidence
+and a noise profile exist for that repo/runner. Deterministic probes can become
+gateable earlier when their target identity and budget are explicit.
+
+## Baseline Model
+
+Baselines are comparable evidence, not arbitrary previous numbers.
+
+| Measurement Class | Baseline Source | Backfill Rule |
+| ----------------- | ---------------------------------------------------- | ----------------------------------------------------- |
+| `deterministic` | Current main artifacts or manually seeded exact runs | Backfill past merged PRs when introducing the metric. |
+| `wall-clock` | Same-run paired base checkout for PR gates | Historical backfill is trend context only. |
+| `diagnostic` | Trace or host artifact for the same run | No baseline required. |
+
+Manual baseline seeds must record the source run, ref, SHA, and reason. Seeded
+data is acceptable when the same probe protocol produced it for the same target
+identity; it is not acceptable to copy a chart value into the baseline store.
+
+## State-of-the-Art Alignment
+
+The design follows current continuous benchmarking practice:
+
+- Wall-clock gates need repeated measurements, warmup, and uncertainty, not
+ single raw timing deltas.
+- Paired base/head runs reduce runner-load, cache, and time-drift bias.
+- Outliers and wide variance reduce confidence instead of being silently
+ averaged away.
+- Diagnostic traces explain regressions; they do not define the canonical
+ numeric result.
+- Human review should show raw values, nominal deltas, percent deltas, and an
+ actionable impact scale so large noisy movements are not mistaken for proven
+ PR regressions.
+
## Visualization
Reports must distinguish raw movement from actionable evidence.
From c1fbbed73b125c4e08cc9cefbb6bd7ac7ef6637e Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Tue, 19 May 2026 08:40:39 +0200
Subject: [PATCH 45/81] Gate paired wall-clock measurements from paired
evidence
---
.github/workflows/ci.yml | 14 +++++++-----
context/ci-measurements.md | 5 +++++
.../ci-measurement-comparison.test.sh | 22 +++++++++++++++++++
genie/ci-workflow/measurements.ts | 7 +++---
.../ci-workflow-helpers.unit.test.ts | 2 +-
5 files changed, 40 insertions(+), 10 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 1f45d3425..b2ac32a10 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -3426,6 +3426,7 @@ jobs:
| ($evidenceDelta - $pairedDeltaTolerance) as $evidenceDeltaLower
| ([($b.warnAbs // 0), (if $baseline > 0 then ($baseline * (($b.warnRatio // 1) - 1)) else 0 end), $noise, 0.000000001] | max) as $warnBudget
| ([($b.failAbs // 0), (if $baseline > 0 then ($baseline * (($b.failRatio // 1) - 1)) else 0 end), $noise, 0.000000001] | max) as $failBudget
+ | ($comparisonMode != "paired") as $needsHistoricalBaselineCount
| (
($current >= $robustLower and $current <= $robustUpper)
or ($currentRobustTolerance > 0 and $currentRobustLower <= $robustUpper and $currentRobustUpper >= $robustLower)
@@ -3450,7 +3451,7 @@ jobs:
| (
policy_enabled($policy) == true
and $baseline > 0
- and $baselineSources >= ($policy.minBaselineSources // 1)
+ and (if $needsHistoricalBaselineCount then $baselineSources >= ($policy.minBaselineSources // 1) else true end)
and $currentSamples >= ($policy.minCurrentSamples // 1)
and (if $comparisonMode == "paired" then $pairedSamples >= ($policy.minPairedSamples // 1) else true end)
and (if $comparisonMode == "paired" then $pairedDeltaMedian != null else true end)
@@ -3458,7 +3459,7 @@ jobs:
| (
if (policy_enabled($policy) != true) then "disabled"
elif $baseline <= 0 then "missing_baseline"
- elif $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
+ elif $needsHistoricalBaselineCount and $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count"
elif $comparisonMode == "paired" and $pairedSamples < ($policy.minPairedSamples // 1) then "low_paired_sample_count"
elif $comparisonMode == "paired" and $pairedDeltaMedian == null then "missing_paired_delta"
@@ -3469,7 +3470,7 @@ jobs:
if $baseline <= 0 then "unknown"
elif (policy_enabled($policy) != true) then "diagnostic"
elif ($delta | abs_value) <= $noise then "noise_floor"
- elif $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
+ elif $needsHistoricalBaselineCount and $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count"
elif $comparisonMode == "paired" and $pairedSamples < ($policy.minPairedSamples // 1) then "low_paired_sample_count"
elif $comparisonMode == "paired" and $pairedDeltaMedian == null then "missing_paired_delta"
@@ -5161,6 +5162,7 @@ jobs:
| ($evidenceDelta - $pairedDeltaTolerance) as $evidenceDeltaLower
| ([($b.warnAbs // 0), (if $baseline > 0 then ($baseline * (($b.warnRatio // 1) - 1)) else 0 end), $noise, 0.000000001] | max) as $warnBudget
| ([($b.failAbs // 0), (if $baseline > 0 then ($baseline * (($b.failRatio // 1) - 1)) else 0 end), $noise, 0.000000001] | max) as $failBudget
+ | ($comparisonMode != "paired") as $needsHistoricalBaselineCount
| (
($current >= $robustLower and $current <= $robustUpper)
or ($currentRobustTolerance > 0 and $currentRobustLower <= $robustUpper and $currentRobustUpper >= $robustLower)
@@ -5185,7 +5187,7 @@ jobs:
| (
policy_enabled($policy) == true
and $baseline > 0
- and $baselineSources >= ($policy.minBaselineSources // 1)
+ and (if $needsHistoricalBaselineCount then $baselineSources >= ($policy.minBaselineSources // 1) else true end)
and $currentSamples >= ($policy.minCurrentSamples // 1)
and (if $comparisonMode == "paired" then $pairedSamples >= ($policy.minPairedSamples // 1) else true end)
and (if $comparisonMode == "paired" then $pairedDeltaMedian != null else true end)
@@ -5193,7 +5195,7 @@ jobs:
| (
if (policy_enabled($policy) != true) then "disabled"
elif $baseline <= 0 then "missing_baseline"
- elif $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
+ elif $needsHistoricalBaselineCount and $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count"
elif $comparisonMode == "paired" and $pairedSamples < ($policy.minPairedSamples // 1) then "low_paired_sample_count"
elif $comparisonMode == "paired" and $pairedDeltaMedian == null then "missing_paired_delta"
@@ -5204,7 +5206,7 @@ jobs:
if $baseline <= 0 then "unknown"
elif (policy_enabled($policy) != true) then "diagnostic"
elif ($delta | abs_value) <= $noise then "noise_floor"
- elif $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
+ elif $needsHistoricalBaselineCount and $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count"
elif $comparisonMode == "paired" and $pairedSamples < ($policy.minPairedSamples // 1) then "low_paired_sample_count"
elif $comparisonMode == "paired" and $pairedDeltaMedian == null then "missing_paired_delta"
diff --git a/context/ci-measurements.md b/context/ci-measurements.md
index 5f8017aba..0e1db3380 100644
--- a/context/ci-measurements.md
+++ b/context/ci-measurements.md
@@ -99,6 +99,11 @@ but the paired delta band still crosses the budget, the row renders as
continuous benchmark tools: a point estimate without uncertainty is not enough
evidence for a regression.
+Paired wall-clock gates do not require a historical baseline source count. The
+same-run paired baseline is the comparable evidence. Historical runs may still
+appear in the report as trend context, but they do not decide whether paired PR
+evidence is gateable.
+
Historical wall-clock comparison may be used as an advisory transition mode.
It can warn, visualize trends, and guide investigation, but it must not be the
required merge gate for noisy runner-dependent timings. Robust baseline/current
diff --git a/genie/ci-scripts/ci-measurement-comparison.test.sh b/genie/ci-scripts/ci-measurement-comparison.test.sh
index 569d09a6d..cf5814ee1 100755
--- a/genie/ci-scripts/ci-measurement-comparison.test.sh
+++ b/genie/ci-scripts/ci-measurement-comparison.test.sh
@@ -67,6 +67,7 @@ run_compare() {
policy='{"enabled":true,"minBaselineSources":1,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":0.5,"noiseFloor":0.1}'
paired_policy='{"enabled":true,"comparisonMode":"paired","minBaselineSources":1,"minCurrentSamples":5,"minPairedSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":0.5,"noiseFloor":0.1}'
+strict_paired_policy='{"enabled":true,"comparisonMode":"paired","minBaselineSources":20,"minCurrentSamples":5,"minPairedSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":0.5,"noiseFloor":0.1}'
emit_compare_script
rm -rf "$tmp_dir/current" "$tmp_dir/baseline"
@@ -144,6 +145,27 @@ if [ "$actual_status" != "pass" ] || [ "$actual_row" != "pass" ] || [ "$actual_g
exit 1
fi
+rm -rf "$tmp_dir/current" "$tmp_dir/baseline"
+write_measurement "$tmp_dir/current/measurements.json" 13 devenv-perf-warm-median-v2 "$strict_paired_policy"
+jq '.observations[0].comparison = { mode: "paired", baseline: 12.95, pairedSampleCount: 5 }
+ | .observations[0].statistics.pairedSampleCount = 5
+ | .observations[0].statistics.pairedDeltaMedian = 0.05
+ | .observations[0].statistics.pairedDeltaP25 = 0.04
+ | .observations[0].statistics.pairedDeltaP75 = 0.06
+ | .observations[0].statistics.pairedDeltaMad = 0.01' \
+ "$tmp_dir/current/measurements.json" >"$tmp_dir/current/measurements.updated.json"
+mv "$tmp_dir/current/measurements.updated.json" "$tmp_dir/current/measurements.json"
+write_measurement "$tmp_dir/baseline/run-1/measurements.json" 10 devenv-perf-warm-median-v2 "$strict_paired_policy"
+run_compare
+actual_status="$(jq -r '.status' "$tmp_dir/comparison.json")"
+actual_gate="$(jq -r '.comparisons[] | .gateReason' "$tmp_dir/comparison.json")"
+actual_confidence="$(jq -r '.comparisons[] | .confidence' "$tmp_dir/comparison.json")"
+actual_enforceable="$(jq -r '.readiness.enforceable' "$tmp_dir/comparison.json")"
+if [ "$actual_status" != "pass" ] || [ "$actual_gate" != "eligible" ] || [ "$actual_confidence" != "noise_floor" ] || [ "$actual_enforceable" != "true" ]; then
+ echo "expected paired wall-clock gate to ignore historical minBaselineSources when paired evidence is present; got status=$actual_status gate=$actual_gate confidence=$actual_confidence enforceable=$actual_enforceable" >&2
+ exit 1
+fi
+
rm -rf "$tmp_dir/current" "$tmp_dir/baseline"
write_measurement "$tmp_dir/current/measurements.json" 13 devenv-perf-warm-median-v2 "$paired_policy"
jq '.observations[0].comparison = { mode: "paired", baseline: 10, pairedSampleCount: 5 }
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index 97336dca3..0c0379101 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -1884,6 +1884,7 @@ jq -n \
| ($evidenceDelta - $pairedDeltaTolerance) as $evidenceDeltaLower
| ([($b.warnAbs // 0), (if $baseline > 0 then ($baseline * (($b.warnRatio // 1) - 1)) else 0 end), $noise, 0.000000001] | max) as $warnBudget
| ([($b.failAbs // 0), (if $baseline > 0 then ($baseline * (($b.failRatio // 1) - 1)) else 0 end), $noise, 0.000000001] | max) as $failBudget
+ | ($comparisonMode != "paired") as $needsHistoricalBaselineCount
| (
($current >= $robustLower and $current <= $robustUpper)
or ($currentRobustTolerance > 0 and $currentRobustLower <= $robustUpper and $currentRobustUpper >= $robustLower)
@@ -1908,7 +1909,7 @@ jq -n \
| (
policy_enabled($policy) == true
and $baseline > 0
- and $baselineSources >= ($policy.minBaselineSources // 1)
+ and (if $needsHistoricalBaselineCount then $baselineSources >= ($policy.minBaselineSources // 1) else true end)
and $currentSamples >= ($policy.minCurrentSamples // 1)
and (if $comparisonMode == "paired" then $pairedSamples >= ($policy.minPairedSamples // 1) else true end)
and (if $comparisonMode == "paired" then $pairedDeltaMedian != null else true end)
@@ -1916,7 +1917,7 @@ jq -n \
| (
if (policy_enabled($policy) != true) then "disabled"
elif $baseline <= 0 then "missing_baseline"
- elif $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
+ elif $needsHistoricalBaselineCount and $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count"
elif $comparisonMode == "paired" and $pairedSamples < ($policy.minPairedSamples // 1) then "low_paired_sample_count"
elif $comparisonMode == "paired" and $pairedDeltaMedian == null then "missing_paired_delta"
@@ -1927,7 +1928,7 @@ jq -n \
if $baseline <= 0 then "unknown"
elif (policy_enabled($policy) != true) then "diagnostic"
elif ($delta | abs_value) <= $noise then "noise_floor"
- elif $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
+ elif $needsHistoricalBaselineCount and $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count"
elif $comparisonMode == "paired" and $pairedSamples < ($policy.minPairedSamples // 1) then "low_paired_sample_count"
elif $comparisonMode == "paired" and $pairedDeltaMedian == null then "missing_paired_delta"
diff --git a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
index 237db0a18..bb25e5b81 100644
--- a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
+++ b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
@@ -422,7 +422,7 @@ describe('ci workflow devenv perf helpers', () => {
)
expect(generatedCiWorkflowYamlSource).toContain('within_baseline_range')
expect(generatedCiWorkflowYamlSource).toContain(
- 'elif $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"',
+ 'elif $needsHistoricalBaselineCount and $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"',
)
expect(generatedCiWorkflowYamlSource).toContain(
'elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count"',
From a9039e99f69ae6e459b4cf75580a60c05cd87c43 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Tue, 19 May 2026 08:52:20 +0200
Subject: [PATCH 46/81] Use paired delta evidence for wall-clock gates
---
.github/workflows/ci.yml | 70 +++++++++++++++----
context/ci-measurements.md | 40 ++++++++---
.../ci-measurement-comparison.test.sh | 22 ++++++
genie/ci-workflow/measurements.ts | 40 ++++++++---
4 files changed, 141 insertions(+), 31 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index b2ac32a10..020756f98 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2788,7 +2788,8 @@ jobs:
if $pairedDeltaMedian == null then null
else ($pairedDeltaDurations | map(. - $pairedDeltaMedian | if . < 0 then -. else . end) | median)
end
- )
+ ),
+ pairedDeltaSampleDurationMs: $pairedDeltaDurations
},
samples:$sampleList
}' \
@@ -3152,7 +3153,8 @@ jobs:
then null
else (.statistics.pairedDeltaMadDurationMs / 1000)
end
- )
+ ),
+ pairedDeltaSamples: ((.statistics.pairedDeltaSampleDurationMs // []) | map(. / 1000))
},
dimensions: {
probe: .id,
@@ -3328,6 +3330,7 @@ jobs:
| ($items | map(.observation.statistics.pairedDeltaP25 // empty)) as $pairedDeltaP25Values
| ($items | map(.observation.statistics.pairedDeltaP75 // empty)) as $pairedDeltaP75Values
| ($items | map(.observation.statistics.pairedDeltaMad // empty)) as $pairedDeltaMadValues
+ | ($items | map(.observation.statistics.pairedDeltaSamples // []) | add // []) as $pairedDeltaSampleValues
| ($items | map(.observation.statistics.measuredSampleCount // .observation.statistics.sampleCount // 1) | add // ($items | length)) as $sampleCount
| ($values | median) as $median
| {
@@ -3349,6 +3352,7 @@ jobs:
pairedDeltaP25Value: (if ($pairedDeltaP25Values | length) == 0 then null else ($pairedDeltaP25Values | median) end),
pairedDeltaP75Value: (if ($pairedDeltaP75Values | length) == 0 then null else ($pairedDeltaP75Values | median) end),
pairedDeltaMadValue: (if ($pairedDeltaMadValues | length) == 0 then null else ($pairedDeltaMadValues | median) end),
+ pairedDeltaSampleValues: $pairedDeltaSampleValues,
generatedAt: ($items[-1].generatedAt // null)
};
@@ -3387,12 +3391,13 @@ jobs:
def policy_enabled($policy):
if ($policy | has("enabled")) then $policy.enabled else true end;
- def classify($metric; $unit; $measurementKind; $policy; $current; $currentP25; $currentP75; $currentMad; $baseline; $baselineMin; $baselineMax; $baselineP25; $baselineP75; $baselineP95; $baselineMad; $currentSamples; $baselineSources; $pairedSamples; $pairedDeltaMedian; $pairedDeltaP25; $pairedDeltaP75; $pairedDeltaMad):
+ def classify($metric; $unit; $measurementKind; $policy; $current; $currentP25; $currentP75; $currentMad; $baseline; $baselineMin; $baselineMax; $baselineP25; $baselineP75; $baselineP95; $baselineMad; $currentSamples; $baselineSources; $pairedSamples; $pairedDeltaMedian; $pairedDeltaP25; $pairedDeltaP75; $pairedDeltaMad; $pairedDeltaValues):
$policy as $b
| ($policy.comparisonMode // (if $measurementKind == "deterministic" or $unit != "seconds" then "budget" elif $measurementKind == "diagnostic" then "diagnostic" else "historical" end)) as $comparisonMode
| ($policy.noiseFloor // noise_floor($metric; $unit)) as $noise
| ($current - $baseline) as $delta
| (if $comparisonMode == "paired" and $pairedDeltaMedian != null then $pairedDeltaMedian else $delta end) as $evidenceDelta
+ | (($policy.pairedEvidenceQuantile // 0.25) | tonumber) as $pairedEvidenceQuantile
| (if $baseline > 0 then ($current / $baseline) else null end) as $ratio
| (($baselineP75 // $baseline) - ($baselineP25 // $baseline)) as $iqr
| (($currentP75 // $current) - ($currentP25 // $current)) as $currentIqr
@@ -3422,8 +3427,8 @@ jobs:
| ($baseline - $robustTolerance) as $robustLower
| ($current + $currentRobustTolerance) as $currentRobustUpper
| ($current - $currentRobustTolerance) as $currentRobustLower
- | ($evidenceDelta + $pairedDeltaTolerance) as $evidenceDeltaUpper
- | ($evidenceDelta - $pairedDeltaTolerance) as $evidenceDeltaLower
+ | (if $comparisonMode == "paired" and ($pairedDeltaValues | length) > 0 then ($pairedDeltaValues | percentile($pairedEvidenceQuantile)) else ($evidenceDelta - $pairedDeltaTolerance) end) as $evidenceDeltaLower
+ | (if $comparisonMode == "paired" and ($pairedDeltaValues | length) > 0 then ($pairedDeltaValues | percentile(1 - $pairedEvidenceQuantile)) else ($evidenceDelta + $pairedDeltaTolerance) end) as $evidenceDeltaUpper
| ([($b.warnAbs // 0), (if $baseline > 0 then ($baseline * (($b.warnRatio // 1) - 1)) else 0 end), $noise, 0.000000001] | max) as $warnBudget
| ([($b.failAbs // 0), (if $baseline > 0 then ($baseline * (($b.failRatio // 1) - 1)) else 0 end), $noise, 0.000000001] | max) as $failBudget
| ($comparisonMode != "paired") as $needsHistoricalBaselineCount
@@ -3523,7 +3528,7 @@ jobs:
else "improvement"
end
) as $semanticImpactKind
- | {status:$status,current:$current,baseline:$baseline,delta:$delta,ratio:$ratio,budget:$b,gatePolicy:$policy,comparisonMode:$comparisonMode,gateable:$gateable,gateReason:$gateReason,confidence:$confidence,direction:$direction,semanticImpactScore:$semanticImpactScore,semanticImpactKind:$semanticImpactKind,semanticWarnBudget:$warnBudget,semanticFailBudget:$failBudget,baselineRobustLower:$robustLower,baselineRobustUpper:$robustUpper,baselineRobustTolerance:$robustTolerance,currentRobustLower:$currentRobustLower,currentRobustUpper:$currentRobustUpper,currentRobustTolerance:$currentRobustTolerance,withinBaselineRange:$withinBaselineRange,pairedSamples:$pairedSamples,evidenceDelta:$evidenceDelta,evidenceDeltaLower:$evidenceDeltaLower,evidenceDeltaUpper:$evidenceDeltaUpper,evidenceDeltaTolerance:$pairedDeltaTolerance};
+ | {status:$status,current:$current,baseline:$baseline,delta:$delta,ratio:$ratio,budget:$b,gatePolicy:$policy,comparisonMode:$comparisonMode,gateable:$gateable,gateReason:$gateReason,confidence:$confidence,direction:$direction,semanticImpactScore:$semanticImpactScore,semanticImpactKind:$semanticImpactKind,semanticWarnBudget:$warnBudget,semanticFailBudget:$failBudget,baselineRobustLower:$robustLower,baselineRobustUpper:$robustUpper,baselineRobustTolerance:$robustTolerance,currentRobustLower:$currentRobustLower,currentRobustUpper:$currentRobustUpper,currentRobustTolerance:$currentRobustTolerance,withinBaselineRange:$withinBaselineRange,pairedSamples:$pairedSamples,evidenceDelta:$evidenceDelta,evidenceDeltaLower:$evidenceDeltaLower,evidenceDeltaUpper:$evidenceDeltaUpper,evidenceDeltaTolerance:$pairedDeltaTolerance,pairedEvidenceQuantile:$pairedEvidenceQuantile,pairedEvidenceProtocol:(if $comparisonMode == "paired" and ($pairedDeltaValues | length) > 0 then "paired-delta-quantile-v1" elif $comparisonMode == "paired" then "paired-summary-robust-band-v1" else null end)};
(observations_by_key($current[0]) | with_entries(.value = observation_stats(.value))) as $currentObs
| (observations_by_key($baseline[0]) | with_entries(.value = observation_stats(.value))) as $baselineObs
@@ -3588,7 +3593,8 @@ jobs:
$currentValue.pairedDeltaMedianValue;
$currentValue.pairedDeltaP25Value;
$currentValue.pairedDeltaP75Value;
- $currentValue.pairedDeltaMadValue
+ $currentValue.pairedDeltaMadValue;
+ ($currentValue.pairedDeltaSampleValues // [])
) + {
target: $currentValue.target,
observation: $currentValue.observation,
@@ -3912,6 +3918,22 @@ jobs:
return formatSemanticImpact(row.semanticImpactScore)
}
+ const formatEvidence = (row) => {
+ const unit = row.observation?.unit
+ if (row.comparisonMode === 'paired' && typeof row.evidenceDeltaLower === 'number' && typeof row.evidenceDeltaUpper === 'number') {
+ const quantile = typeof row.pairedEvidenceQuantile === 'number'
+ ? Math.round(row.pairedEvidenceQuantile * 100)
+ : 25
+ return (row.confidence || 'unknown')
+ + '
paired n=' + (row.pairedSamples ?? 0)
+ + ', ' + quantile + '-' + (100 - quantile) + '% delta '
+ + formatValue(row.evidenceDeltaLower, unit)
+ + ' - ' + formatValue(row.evidenceDeltaUpper, unit)
+ + ''
+ }
+ return (row.confidence || 'unknown') + '
baseline n=' + (row.baselineSources ?? 0) + ', current samples=' + (row.currentSamples ?? 1) + ''
+ }
+
const interpretation = (row) => {
if (row.confidence === 'low_baseline_count') return {
label: 'Needs more baseline',
@@ -4129,7 +4151,7 @@ jobs:
formatRowImpact(row),
meaning.label + '
' + meaning.detail + '',
formatGate(row),
- (row.confidence || 'unknown') + '
baseline n=' + (row.baselineSources ?? 0) + ', current samples=' + (row.currentSamples ?? 1) + '',
+ formatEvidence(row),
].map(escapeCell).join(' | ') + ' |'
}),
].join('\n')
@@ -5064,6 +5086,7 @@ jobs:
| ($items | map(.observation.statistics.pairedDeltaP25 // empty)) as $pairedDeltaP25Values
| ($items | map(.observation.statistics.pairedDeltaP75 // empty)) as $pairedDeltaP75Values
| ($items | map(.observation.statistics.pairedDeltaMad // empty)) as $pairedDeltaMadValues
+ | ($items | map(.observation.statistics.pairedDeltaSamples // []) | add // []) as $pairedDeltaSampleValues
| ($items | map(.observation.statistics.measuredSampleCount // .observation.statistics.sampleCount // 1) | add // ($items | length)) as $sampleCount
| ($values | median) as $median
| {
@@ -5085,6 +5108,7 @@ jobs:
pairedDeltaP25Value: (if ($pairedDeltaP25Values | length) == 0 then null else ($pairedDeltaP25Values | median) end),
pairedDeltaP75Value: (if ($pairedDeltaP75Values | length) == 0 then null else ($pairedDeltaP75Values | median) end),
pairedDeltaMadValue: (if ($pairedDeltaMadValues | length) == 0 then null else ($pairedDeltaMadValues | median) end),
+ pairedDeltaSampleValues: $pairedDeltaSampleValues,
generatedAt: ($items[-1].generatedAt // null)
};
@@ -5123,12 +5147,13 @@ jobs:
def policy_enabled($policy):
if ($policy | has("enabled")) then $policy.enabled else true end;
- def classify($metric; $unit; $measurementKind; $policy; $current; $currentP25; $currentP75; $currentMad; $baseline; $baselineMin; $baselineMax; $baselineP25; $baselineP75; $baselineP95; $baselineMad; $currentSamples; $baselineSources; $pairedSamples; $pairedDeltaMedian; $pairedDeltaP25; $pairedDeltaP75; $pairedDeltaMad):
+ def classify($metric; $unit; $measurementKind; $policy; $current; $currentP25; $currentP75; $currentMad; $baseline; $baselineMin; $baselineMax; $baselineP25; $baselineP75; $baselineP95; $baselineMad; $currentSamples; $baselineSources; $pairedSamples; $pairedDeltaMedian; $pairedDeltaP25; $pairedDeltaP75; $pairedDeltaMad; $pairedDeltaValues):
$policy as $b
| ($policy.comparisonMode // (if $measurementKind == "deterministic" or $unit != "seconds" then "budget" elif $measurementKind == "diagnostic" then "diagnostic" else "historical" end)) as $comparisonMode
| ($policy.noiseFloor // noise_floor($metric; $unit)) as $noise
| ($current - $baseline) as $delta
| (if $comparisonMode == "paired" and $pairedDeltaMedian != null then $pairedDeltaMedian else $delta end) as $evidenceDelta
+ | (($policy.pairedEvidenceQuantile // 0.25) | tonumber) as $pairedEvidenceQuantile
| (if $baseline > 0 then ($current / $baseline) else null end) as $ratio
| (($baselineP75 // $baseline) - ($baselineP25 // $baseline)) as $iqr
| (($currentP75 // $current) - ($currentP25 // $current)) as $currentIqr
@@ -5158,8 +5183,8 @@ jobs:
| ($baseline - $robustTolerance) as $robustLower
| ($current + $currentRobustTolerance) as $currentRobustUpper
| ($current - $currentRobustTolerance) as $currentRobustLower
- | ($evidenceDelta + $pairedDeltaTolerance) as $evidenceDeltaUpper
- | ($evidenceDelta - $pairedDeltaTolerance) as $evidenceDeltaLower
+ | (if $comparisonMode == "paired" and ($pairedDeltaValues | length) > 0 then ($pairedDeltaValues | percentile($pairedEvidenceQuantile)) else ($evidenceDelta - $pairedDeltaTolerance) end) as $evidenceDeltaLower
+ | (if $comparisonMode == "paired" and ($pairedDeltaValues | length) > 0 then ($pairedDeltaValues | percentile(1 - $pairedEvidenceQuantile)) else ($evidenceDelta + $pairedDeltaTolerance) end) as $evidenceDeltaUpper
| ([($b.warnAbs // 0), (if $baseline > 0 then ($baseline * (($b.warnRatio // 1) - 1)) else 0 end), $noise, 0.000000001] | max) as $warnBudget
| ([($b.failAbs // 0), (if $baseline > 0 then ($baseline * (($b.failRatio // 1) - 1)) else 0 end), $noise, 0.000000001] | max) as $failBudget
| ($comparisonMode != "paired") as $needsHistoricalBaselineCount
@@ -5259,7 +5284,7 @@ jobs:
else "improvement"
end
) as $semanticImpactKind
- | {status:$status,current:$current,baseline:$baseline,delta:$delta,ratio:$ratio,budget:$b,gatePolicy:$policy,comparisonMode:$comparisonMode,gateable:$gateable,gateReason:$gateReason,confidence:$confidence,direction:$direction,semanticImpactScore:$semanticImpactScore,semanticImpactKind:$semanticImpactKind,semanticWarnBudget:$warnBudget,semanticFailBudget:$failBudget,baselineRobustLower:$robustLower,baselineRobustUpper:$robustUpper,baselineRobustTolerance:$robustTolerance,currentRobustLower:$currentRobustLower,currentRobustUpper:$currentRobustUpper,currentRobustTolerance:$currentRobustTolerance,withinBaselineRange:$withinBaselineRange,pairedSamples:$pairedSamples,evidenceDelta:$evidenceDelta,evidenceDeltaLower:$evidenceDeltaLower,evidenceDeltaUpper:$evidenceDeltaUpper,evidenceDeltaTolerance:$pairedDeltaTolerance};
+ | {status:$status,current:$current,baseline:$baseline,delta:$delta,ratio:$ratio,budget:$b,gatePolicy:$policy,comparisonMode:$comparisonMode,gateable:$gateable,gateReason:$gateReason,confidence:$confidence,direction:$direction,semanticImpactScore:$semanticImpactScore,semanticImpactKind:$semanticImpactKind,semanticWarnBudget:$warnBudget,semanticFailBudget:$failBudget,baselineRobustLower:$robustLower,baselineRobustUpper:$robustUpper,baselineRobustTolerance:$robustTolerance,currentRobustLower:$currentRobustLower,currentRobustUpper:$currentRobustUpper,currentRobustTolerance:$currentRobustTolerance,withinBaselineRange:$withinBaselineRange,pairedSamples:$pairedSamples,evidenceDelta:$evidenceDelta,evidenceDeltaLower:$evidenceDeltaLower,evidenceDeltaUpper:$evidenceDeltaUpper,evidenceDeltaTolerance:$pairedDeltaTolerance,pairedEvidenceQuantile:$pairedEvidenceQuantile,pairedEvidenceProtocol:(if $comparisonMode == "paired" and ($pairedDeltaValues | length) > 0 then "paired-delta-quantile-v1" elif $comparisonMode == "paired" then "paired-summary-robust-band-v1" else null end)};
(observations_by_key($current[0]) | with_entries(.value = observation_stats(.value))) as $currentObs
| (observations_by_key($baseline[0]) | with_entries(.value = observation_stats(.value))) as $baselineObs
@@ -5324,7 +5349,8 @@ jobs:
$currentValue.pairedDeltaMedianValue;
$currentValue.pairedDeltaP25Value;
$currentValue.pairedDeltaP75Value;
- $currentValue.pairedDeltaMadValue
+ $currentValue.pairedDeltaMadValue;
+ ($currentValue.pairedDeltaSampleValues // [])
) + {
target: $currentValue.target,
observation: $currentValue.observation,
@@ -5648,6 +5674,22 @@ jobs:
return formatSemanticImpact(row.semanticImpactScore)
}
+ const formatEvidence = (row) => {
+ const unit = row.observation?.unit
+ if (row.comparisonMode === 'paired' && typeof row.evidenceDeltaLower === 'number' && typeof row.evidenceDeltaUpper === 'number') {
+ const quantile = typeof row.pairedEvidenceQuantile === 'number'
+ ? Math.round(row.pairedEvidenceQuantile * 100)
+ : 25
+ return (row.confidence || 'unknown')
+ + '
paired n=' + (row.pairedSamples ?? 0)
+ + ', ' + quantile + '-' + (100 - quantile) + '% delta '
+ + formatValue(row.evidenceDeltaLower, unit)
+ + ' - ' + formatValue(row.evidenceDeltaUpper, unit)
+ + ''
+ }
+ return (row.confidence || 'unknown') + '
baseline n=' + (row.baselineSources ?? 0) + ', current samples=' + (row.currentSamples ?? 1) + ''
+ }
+
const interpretation = (row) => {
if (row.confidence === 'low_baseline_count') return {
label: 'Needs more baseline',
@@ -5865,7 +5907,7 @@ jobs:
formatRowImpact(row),
meaning.label + '
' + meaning.detail + '',
formatGate(row),
- (row.confidence || 'unknown') + '
baseline n=' + (row.baselineSources ?? 0) + ', current samples=' + (row.currentSamples ?? 1) + '',
+ formatEvidence(row),
].map(escapeCell).join(' | ') + ' |'
}),
].join('\n')
diff --git a/context/ci-measurements.md b/context/ci-measurements.md
index 0e1db3380..6a1f63f25 100644
--- a/context/ci-measurements.md
+++ b/context/ci-measurements.md
@@ -24,7 +24,7 @@ semantics match:
| `measurementKind` | Gateable `comparisonMode` | Baseline Meaning | Uncertainty Model |
| ----------------- | ------------------------- | --------------------------------------------- | ----------------------------------------- |
| `deterministic` | `budget` | Same target on a comparable ref | None by default; exact value plus budget. |
-| `wall-clock` | `paired` | Same PR run, same runner, base/head pairs | Per-pair delta band. |
+| `wall-clock` | `paired` | Same PR run, same runner, base/head pairs | Per-pair delta evidence interval. |
| `wall-clock` | `historical` | Previous comparable successful artifacts | Advisory robust bands only. |
| `diagnostic` | none | Optional context artifact or trace attachment | Not gateable. |
@@ -47,7 +47,8 @@ numeric `value`, `unit`, `measurementKind`, and a gate `policy`.
"enabled": true,
"comparisonMode": "paired",
"minPairedSamples": 5,
- "minCurrentSamples": 5
+ "minCurrentSamples": 5,
+ "pairedEvidenceQuantile": 0.25
}
}
```
@@ -91,13 +92,18 @@ without making order a hidden variable. The current artifact stores the paired
baseline median and paired sample count, and the comparison engine uses that
embedded paired baseline for the gate.
-The gate evaluates per-pair deltas, not only the difference between medians.
-A paired wall-clock row is actionable only when the paired delta evidence band
-clears the configured warning or failure budget. If the point estimate moved
-but the paired delta band still crosses the budget, the row renders as
-`paired_uncertain` and does not block. This follows the same principle used by
-continuous benchmark tools: a point estimate without uncertainty is not enough
-evidence for a regression.
+The gate evaluates per-pair deltas, not only the difference between medians. New
+artifacts carry the raw paired delta samples in the observation statistics. The
+comparison engine derives a nonparametric evidence interval from those samples
+using `pairedEvidenceQuantile` (default `0.25`, so the displayed interval is the
+25th-75th percentile unless overridden). A paired wall-clock row blocks only when the
+lower evidence quantile exceeds the configured failure budget. If the point
+estimate moved but the paired delta evidence still crosses the budget, the row
+renders as `paired_uncertain` and does not block. Older artifacts that only have
+summary statistics use a conservative robust-band fallback and are labeled with
+that evidence protocol. This follows the same principle used by continuous
+benchmark tools: a point estimate without uncertainty is not enough evidence
+for a regression.
Paired wall-clock gates do not require a historical baseline source count. The
same-run paired baseline is the comparable evidence. Historical runs may still
@@ -148,6 +154,22 @@ The comparison operates on per-pair deltas. A wall-clock row becomes gateable
only when the configured minimum paired sample count is present. Until then,
the row is partial/advisory even if the historical raw delta is large.
+For PR gates, the preferred evidence protocol is `paired-delta-quantile-v1`:
+
+```text
+paired deltas = current_duration(pair_i) - baseline_duration(pair_i)
+evidence lower = quantile(paired deltas, pairedEvidenceQuantile)
+evidence upper = quantile(paired deltas, 1 - pairedEvidenceQuantile)
+gate fail = evidence lower > semantic fail budget
+gate warn = evidence lower > semantic warn budget
+```
+
+This is intentionally nonparametric because CI timings are often skewed,
+heavy-tailed, and not normally distributed. A future scheduled calibration lane
+can increase sample counts or move to bootstrap intervals for selected
+high-value probes, but the PR gate should remain understandable from the raw
+pair deltas in the artifact.
+
## Deterministic Measurements
Nix closure size, source shape, code complexity, lines of code, and file counts
diff --git a/genie/ci-scripts/ci-measurement-comparison.test.sh b/genie/ci-scripts/ci-measurement-comparison.test.sh
index cf5814ee1..80260faa5 100755
--- a/genie/ci-scripts/ci-measurement-comparison.test.sh
+++ b/genie/ci-scripts/ci-measurement-comparison.test.sh
@@ -188,6 +188,28 @@ if [ "$actual_status" != "pass" ] || [ "$actual_row" != "pass" ] || [ "$actual_c
exit 1
fi
+rm -rf "$tmp_dir/current" "$tmp_dir/baseline"
+write_measurement "$tmp_dir/current/measurements.json" 12.1 devenv-perf-warm-median-v2 "$paired_policy"
+jq '.observations[0].comparison = { mode: "paired", baseline: 10, pairedSampleCount: 5 }
+ | .observations[0].statistics.pairedSampleCount = 5
+ | .observations[0].statistics.pairedDeltaMedian = 2.1
+ | .observations[0].statistics.pairedDeltaP25 = 2.05
+ | .observations[0].statistics.pairedDeltaP75 = 2.2
+ | .observations[0].statistics.pairedDeltaMad = 0.1
+ | .observations[0].statistics.pairedDeltaSamples = [0.2, 2.05, 2.1, 2.2, 2.3]' \
+ "$tmp_dir/current/measurements.json" >"$tmp_dir/current/measurements.updated.json"
+mv "$tmp_dir/current/measurements.updated.json" "$tmp_dir/current/measurements.json"
+write_measurement "$tmp_dir/baseline/run-1/measurements.json" 10 devenv-perf-warm-median-v2 "$paired_policy"
+run_compare
+actual_status="$(jq -r '.status' "$tmp_dir/comparison.json")"
+actual_row="$(jq -r '.comparisons[] | .status' "$tmp_dir/comparison.json")"
+actual_protocol="$(jq -r '.comparisons[] | .pairedEvidenceProtocol' "$tmp_dir/comparison.json")"
+actual_lower="$(jq -r '.comparisons[] | .evidenceDeltaLower' "$tmp_dir/comparison.json")"
+if [ "$actual_status" != "fail" ] || [ "$actual_row" != "fail" ] || [ "$actual_protocol" != "paired-delta-quantile-v1" ] || ! awk "BEGIN { exit !($actual_lower > 2) }"; then
+ echo "expected raw paired delta quantile evidence to fail only when the lower evidence quantile exceeds budget; got status=$actual_status row=$actual_row protocol=$actual_protocol lower=$actual_lower" >&2
+ exit 1
+fi
+
rm -rf "$tmp_dir/current" "$tmp_dir/baseline"
write_measurement "$tmp_dir/current/measurements.json" 13 devenv-perf-warm-median-v2 "$paired_policy"
jq '.observations[0].comparison = { mode: "paired", baseline: 10, pairedSampleCount: 5 }
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index 0c0379101..474bd542b 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -30,6 +30,7 @@ export type CiMeasurementGatePolicy = {
readonly noiseFloor?: number
readonly statisticalToleranceRatio?: number
readonly statisticalToleranceAbs?: number
+ readonly pairedEvidenceQuantile?: number
readonly warnRatio?: number
readonly failRatio?: number
readonly warnAbs?: number
@@ -79,6 +80,7 @@ export type CiMeasurementObservation = {
readonly pairedDeltaP25?: number
readonly pairedDeltaP75?: number
readonly pairedDeltaMad?: number
+ readonly pairedDeltaSamples?: readonly number[]
}
}
@@ -618,7 +620,8 @@ json_append_timing() {
if $pairedDeltaMedian == null then null
else ($pairedDeltaDurations | map(. - $pairedDeltaMedian | if . < 0 then -. else . end) | median)
end
- )
+ ),
+ pairedDeltaSampleDurationMs: $pairedDeltaDurations
},
samples:$sampleList
}' \
@@ -975,7 +978,8 @@ jq -n \
then null
else (.statistics.pairedDeltaMadDurationMs / 1000)
end
- )
+ ),
+ pairedDeltaSamples: ((.statistics.pairedDeltaSampleDurationMs // []) | map(. / 1000))
},
dimensions: {
probe: .id,
@@ -1786,6 +1790,7 @@ jq -n \
| ($items | map(.observation.statistics.pairedDeltaP25 // empty)) as $pairedDeltaP25Values
| ($items | map(.observation.statistics.pairedDeltaP75 // empty)) as $pairedDeltaP75Values
| ($items | map(.observation.statistics.pairedDeltaMad // empty)) as $pairedDeltaMadValues
+ | ($items | map(.observation.statistics.pairedDeltaSamples // []) | add // []) as $pairedDeltaSampleValues
| ($items | map(.observation.statistics.measuredSampleCount // .observation.statistics.sampleCount // 1) | add // ($items | length)) as $sampleCount
| ($values | median) as $median
| {
@@ -1807,6 +1812,7 @@ jq -n \
pairedDeltaP25Value: (if ($pairedDeltaP25Values | length) == 0 then null else ($pairedDeltaP25Values | median) end),
pairedDeltaP75Value: (if ($pairedDeltaP75Values | length) == 0 then null else ($pairedDeltaP75Values | median) end),
pairedDeltaMadValue: (if ($pairedDeltaMadValues | length) == 0 then null else ($pairedDeltaMadValues | median) end),
+ pairedDeltaSampleValues: $pairedDeltaSampleValues,
generatedAt: ($items[-1].generatedAt // null)
};
@@ -1845,12 +1851,13 @@ jq -n \
def policy_enabled($policy):
if ($policy | has("enabled")) then $policy.enabled else true end;
- def classify($metric; $unit; $measurementKind; $policy; $current; $currentP25; $currentP75; $currentMad; $baseline; $baselineMin; $baselineMax; $baselineP25; $baselineP75; $baselineP95; $baselineMad; $currentSamples; $baselineSources; $pairedSamples; $pairedDeltaMedian; $pairedDeltaP25; $pairedDeltaP75; $pairedDeltaMad):
+ def classify($metric; $unit; $measurementKind; $policy; $current; $currentP25; $currentP75; $currentMad; $baseline; $baselineMin; $baselineMax; $baselineP25; $baselineP75; $baselineP95; $baselineMad; $currentSamples; $baselineSources; $pairedSamples; $pairedDeltaMedian; $pairedDeltaP25; $pairedDeltaP75; $pairedDeltaMad; $pairedDeltaValues):
$policy as $b
| ($policy.comparisonMode // (if $measurementKind == "deterministic" or $unit != "seconds" then "budget" elif $measurementKind == "diagnostic" then "diagnostic" else "historical" end)) as $comparisonMode
| ($policy.noiseFloor // noise_floor($metric; $unit)) as $noise
| ($current - $baseline) as $delta
| (if $comparisonMode == "paired" and $pairedDeltaMedian != null then $pairedDeltaMedian else $delta end) as $evidenceDelta
+ | (($policy.pairedEvidenceQuantile // 0.25) | tonumber) as $pairedEvidenceQuantile
| (if $baseline > 0 then ($current / $baseline) else null end) as $ratio
| (($baselineP75 // $baseline) - ($baselineP25 // $baseline)) as $iqr
| (($currentP75 // $current) - ($currentP25 // $current)) as $currentIqr
@@ -1880,8 +1887,8 @@ jq -n \
| ($baseline - $robustTolerance) as $robustLower
| ($current + $currentRobustTolerance) as $currentRobustUpper
| ($current - $currentRobustTolerance) as $currentRobustLower
- | ($evidenceDelta + $pairedDeltaTolerance) as $evidenceDeltaUpper
- | ($evidenceDelta - $pairedDeltaTolerance) as $evidenceDeltaLower
+ | (if $comparisonMode == "paired" and ($pairedDeltaValues | length) > 0 then ($pairedDeltaValues | percentile($pairedEvidenceQuantile)) else ($evidenceDelta - $pairedDeltaTolerance) end) as $evidenceDeltaLower
+ | (if $comparisonMode == "paired" and ($pairedDeltaValues | length) > 0 then ($pairedDeltaValues | percentile(1 - $pairedEvidenceQuantile)) else ($evidenceDelta + $pairedDeltaTolerance) end) as $evidenceDeltaUpper
| ([($b.warnAbs // 0), (if $baseline > 0 then ($baseline * (($b.warnRatio // 1) - 1)) else 0 end), $noise, 0.000000001] | max) as $warnBudget
| ([($b.failAbs // 0), (if $baseline > 0 then ($baseline * (($b.failRatio // 1) - 1)) else 0 end), $noise, 0.000000001] | max) as $failBudget
| ($comparisonMode != "paired") as $needsHistoricalBaselineCount
@@ -1981,7 +1988,7 @@ jq -n \
else "improvement"
end
) as $semanticImpactKind
- | {status:$status,current:$current,baseline:$baseline,delta:$delta,ratio:$ratio,budget:$b,gatePolicy:$policy,comparisonMode:$comparisonMode,gateable:$gateable,gateReason:$gateReason,confidence:$confidence,direction:$direction,semanticImpactScore:$semanticImpactScore,semanticImpactKind:$semanticImpactKind,semanticWarnBudget:$warnBudget,semanticFailBudget:$failBudget,baselineRobustLower:$robustLower,baselineRobustUpper:$robustUpper,baselineRobustTolerance:$robustTolerance,currentRobustLower:$currentRobustLower,currentRobustUpper:$currentRobustUpper,currentRobustTolerance:$currentRobustTolerance,withinBaselineRange:$withinBaselineRange,pairedSamples:$pairedSamples,evidenceDelta:$evidenceDelta,evidenceDeltaLower:$evidenceDeltaLower,evidenceDeltaUpper:$evidenceDeltaUpper,evidenceDeltaTolerance:$pairedDeltaTolerance};
+ | {status:$status,current:$current,baseline:$baseline,delta:$delta,ratio:$ratio,budget:$b,gatePolicy:$policy,comparisonMode:$comparisonMode,gateable:$gateable,gateReason:$gateReason,confidence:$confidence,direction:$direction,semanticImpactScore:$semanticImpactScore,semanticImpactKind:$semanticImpactKind,semanticWarnBudget:$warnBudget,semanticFailBudget:$failBudget,baselineRobustLower:$robustLower,baselineRobustUpper:$robustUpper,baselineRobustTolerance:$robustTolerance,currentRobustLower:$currentRobustLower,currentRobustUpper:$currentRobustUpper,currentRobustTolerance:$currentRobustTolerance,withinBaselineRange:$withinBaselineRange,pairedSamples:$pairedSamples,evidenceDelta:$evidenceDelta,evidenceDeltaLower:$evidenceDeltaLower,evidenceDeltaUpper:$evidenceDeltaUpper,evidenceDeltaTolerance:$pairedDeltaTolerance,pairedEvidenceQuantile:$pairedEvidenceQuantile,pairedEvidenceProtocol:(if $comparisonMode == "paired" and ($pairedDeltaValues | length) > 0 then "paired-delta-quantile-v1" elif $comparisonMode == "paired" then "paired-summary-robust-band-v1" else null end)};
(observations_by_key($current[0]) | with_entries(.value = observation_stats(.value))) as $currentObs
| (observations_by_key($baseline[0]) | with_entries(.value = observation_stats(.value))) as $baselineObs
@@ -2046,7 +2053,8 @@ jq -n \
$currentValue.pairedDeltaMedianValue;
$currentValue.pairedDeltaP25Value;
$currentValue.pairedDeltaP75Value;
- $currentValue.pairedDeltaMadValue
+ $currentValue.pairedDeltaMadValue;
+ ($currentValue.pairedDeltaSampleValues // [])
) + {
target: $currentValue.target,
observation: $currentValue.observation,
@@ -2370,6 +2378,22 @@ const formatRowImpact = (row) => {
return formatSemanticImpact(row.semanticImpactScore)
}
+const formatEvidence = (row) => {
+ const unit = row.observation?.unit
+ if (row.comparisonMode === 'paired' && typeof row.evidenceDeltaLower === 'number' && typeof row.evidenceDeltaUpper === 'number') {
+ const quantile = typeof row.pairedEvidenceQuantile === 'number'
+ ? Math.round(row.pairedEvidenceQuantile * 100)
+ : 25
+ return (row.confidence || 'unknown')
+ + '
paired n=' + (row.pairedSamples ?? 0)
+ + ', ' + quantile + '-' + (100 - quantile) + '% delta '
+ + formatValue(row.evidenceDeltaLower, unit)
+ + ' - ' + formatValue(row.evidenceDeltaUpper, unit)
+ + ''
+ }
+ return (row.confidence || 'unknown') + '
baseline n=' + (row.baselineSources ?? 0) + ', current samples=' + (row.currentSamples ?? 1) + ''
+}
+
const interpretation = (row) => {
if (row.confidence === 'low_baseline_count') return {
label: 'Needs more baseline',
@@ -2587,7 +2611,7 @@ const comparisonTable = (rows) => {
formatRowImpact(row),
meaning.label + '
' + meaning.detail + '',
formatGate(row),
- (row.confidence || 'unknown') + '
baseline n=' + (row.baselineSources ?? 0) + ', current samples=' + (row.currentSamples ?? 1) + '',
+ formatEvidence(row),
].map(escapeCell).join(' | ') + ' |'
}),
].join('\n')
From a9e651805512a3334e6f08ea826bb1ee110ead9e Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Tue, 19 May 2026 09:18:28 +0200
Subject: [PATCH 47/81] Document CI measurement engine direction
---
context/ci-measurement-engine.md | 167 ++++++++++++++++++++++++++
context/ci-measurement-experiments.md | 97 +++++++++++++++
context/ci-measurements.md | 22 ++++
3 files changed, 286 insertions(+)
create mode 100644 context/ci-measurement-engine.md
create mode 100644 context/ci-measurement-experiments.md
diff --git a/context/ci-measurement-engine.md b/context/ci-measurement-engine.md
new file mode 100644
index 000000000..f4901f197
--- /dev/null
+++ b/context/ci-measurement-engine.md
@@ -0,0 +1,167 @@
+# CI Measurement Engine
+
+This document specifies the reusable CI measurement engine. It builds on
+[ci-measurements.md](./ci-measurements.md).
+
+## Status
+
+Draft - architecture target for replacing generated shell/jq comparison code
+with a typed reusable implementation.
+
+## Scope
+
+This spec defines:
+
+- the stable measurement artifact contract;
+- comparison policy semantics;
+- the native engine boundary;
+- external-tool integration boundaries;
+- the rollout path from generated shell/jq to a packaged CLI.
+
+This spec does not define individual probes. Devenv, Nix closure, source-shape,
+LOC, and complexity probes remain producer adapters that emit the shared
+artifact format.
+
+## Architecture
+
+```text
+producer adapters
+ devenv wall-clock
+ nix closure size
+ source shape
+ future LOC / complexity
+ |
+ v
+measurements.json
+ |
+ v
+ci-measure native engine
+ schema validation
+ compatibility matching
+ comparison policy
+ gate decision
+ report projection
+ |
+ +--> measurement-comparison.json
+ +--> GitHub Markdown comment
+ +--> SVG/PNG chart payload
+ +--> optional trend export
+```
+
+The engine owns comparison and rendering. Workflows own checkout, dependency
+setup, artifact upload, and GitHub API calls.
+
+## Measurement Registry
+
+Every observation is interpreted through a registry entry:
+
+| Field | Purpose |
+| ------------------------- | ------------------------------------------------------ |
+| `id` | Stable public identity. |
+| `label` | Human review label. |
+| `semanticPath` | Hierarchical grouping for comments and charts. |
+| `measurementKind` | `deterministic`, `wall-clock`, or `diagnostic`. |
+| `unit` | Canonical unit for values and deltas. |
+| `direction` | Whether larger values are better, worse, or neutral. |
+| `defaultComparisonMode` | `budget`, `paired`, `historical`, or `diagnostic`. |
+| `gatePolicy` | Absolute/relative budgets and sample requirements. |
+| `compatibilityDimensions` | Which dimensions must match for historical comparison. |
+| `displayPolicy` | Visibility, sorting, and chart inclusion behavior. |
+| `rawSampleSchema` | Optional schema for per-sample evidence. |
+
+The registry is the public API for cross-repo reuse. Repos may add local entries,
+but they must not fork comparison semantics.
+
+## Comparison Semantics
+
+| Kind | Merge-gate mode | Evidence model |
+| --------------- | --------------- | -------------------------------------------------- |
+| `deterministic` | `budget` | Exact comparable value plus configured budget. |
+| `wall-clock` | `paired` | Same-run base/head pairs and paired delta samples. |
+| `wall-clock` | `historical` | Advisory trend context only. |
+| `diagnostic` | `diagnostic` | Non-gating explanatory data. |
+
+Wall-clock PR gates must not depend on historical timing alone. Historical
+timing is useful for drift detection, A/A calibration, and dashboards, but it
+does not prove PR causality.
+
+Paired wall-clock gates use nonparametric evidence by default:
+
+```text
+paired_delta_i = current_duration_i - baseline_duration_i
+evidence_lower = quantile(paired_delta, pairedEvidenceQuantile)
+evidence_upper = quantile(paired_delta, 1 - pairedEvidenceQuantile)
+fail = evidence_lower > fail_budget
+warn = evidence_lower > warn_budget
+```
+
+The engine may add bootstrap or permutation intervals for selected probes, but
+it must keep the raw paired delta samples in the artifact so decisions remain
+auditable.
+
+## Native CLI Boundary
+
+The long-term implementation should be a packaged `ci-measure` CLI.
+
+```text
+ci-measure validate --input measurements.json
+ci-measure compare --current DIR --baseline DIR --output comparison.json
+ci-measure render-comment --comparison comparison.json --output comment.md
+ci-measure render-chart --comparison comparison.json --theme light --output chart.svg
+ci-measure export-trends --comparison comparison.json --format bencher-json
+```
+
+Rust is the preferred implementation language for the engine because it gives:
+
+- typed schemas for artifact compatibility;
+- deterministic rendering without ad hoc heredocs;
+- fast startup in generated CI workflows;
+- property tests for policy classification;
+- snapshot tests for Markdown/SVG output;
+- a single packaged binary for all repos.
+
+Shell remains appropriate for probe execution because probes invoke arbitrary
+repo-local commands, Nix, devenv, and GitHub workflow primitives.
+
+## External Tool Boundary
+
+External tools may serve as export targets or diagnostic aids, but never as gate authorities.
+
+| Tool class | Allowed role | Not allowed role |
+| -------------------------- | ----------------------------------------- | -------------------------------------- |
+| Bencher / trend stores | Historical storage, dashboards, alerting. | Primary PR gate for paired wall-clock. |
+| CodSpeed-style instruments | Language-level benchmark suites. | Devenv/Nix shell gate replacement. |
+| OTEL backends | Trace explanation and runner diagnostics. | Canonical numeric regression decision. |
+| GitHub artifacts/comments | Current authoritative review projection. | Long-term statistical trend database. |
+
+This keeps the merge contract under our control while still allowing the best
+external system to own trend visualization or specialized microbenchmarking.
+
+The Bencher experiment in
+[ci-measurement-experiments.md](./ci-measurement-experiments.md) confirms this
+boundary: Bencher is useful for historical storage and scalar threshold alerts,
+but it does not natively gate on same-run paired base/head evidence.
+
+## Rollout
+
+1. Keep the current generated workflow behavior and comment shape stable.
+2. Add schema fixtures from existing production `measurements.json` artifacts.
+3. Implement `ci-measure compare` behind a workflow environment switch.
+4. Run generated jq and native CLI comparisons side by side in CI.
+5. Require byte-for-byte identical `measurement-comparison.json` output for existing
+   fixtures, except for intentional schema-version changes.
+6. Move Markdown and SVG rendering into the native CLI after comparison parity.
+7. Remove generated jq/Node snippets once all megarepo consumers use the CLI.
+
+The branch-protection surface must keep the same job names during rollout.
+
+## Open Questions
+
+- **DQ1 Bootstrap intervals:** Which probes are valuable enough to pay for
+ bootstrap or permutation intervals instead of quantile evidence?
+- **DQ2 Trend backend:** Should historical trend export target Bencher, an
+ object-store-backed JSON index, Prometheus/OTEL metrics, or more than one?
+- **DQ3 Registry location:** Should shared registry entries live in effect-utils
+ source, generated repo config, or both?
+- **DQ4 Calibration lane:** Which repos should run scheduled A/A and injected
+ regression calibration first?
diff --git a/context/ci-measurement-experiments.md b/context/ci-measurement-experiments.md
new file mode 100644
index 000000000..4f89b21db
--- /dev/null
+++ b/context/ci-measurement-experiments.md
@@ -0,0 +1,97 @@
+# CI Measurement Experiments
+
+This document records experiments that inform
+[ci-measurement-engine.md](./ci-measurement-engine.md).
+
+## Bencher Fit Experiment
+
+Date: 2026-05-19.
+
+Purpose: evaluate whether Bencher should replace or complement the
+GitHub-native CI measurement gate.
+
+### Setup
+
+The experiment used a local self-hosted Bencher instance and synthetic metrics
+that mimic our current measurement families:
+
+- wall-clock duration;
+- deterministic Nix closure size;
+- deterministic store path count;
+- diagnostic counters.
+
+Commands exercised:
+
+```bash
+docker run --rm ghcr.io/bencherdev/bencher --version
+
+bencher up --detach --pull missing \
+ --console-port 33080 \
+ --api-port 61018 \
+ --console-env BENCHER_API_URL=http://localhost:61018
+
+bencher run --host http://localhost:61018 \
+ --project effect-utils-ci-measurements \
+ --branch main \
+ --testbed github-ubuntu-latest \
+ --adapter json \
+ --file measurements-base.json \
+ --format json
+
+bencher run --host http://localhost:61018 \
+ --project effect-utils-ci-measurements \
+ --branch pr-658 \
+ --start-point main \
+ --start-point-clone-thresholds \
+ --start-point-reset \
+ --testbed github-ubuntu-latest \
+ --error-on-alert \
+ --adapter json \
+ --file measurements-head.json \
+ --format json
+```
+
+### Findings
+
+Bencher worked well for:
+
+- storing historical benchmark rows by project, branch, testbed, benchmark,
+ and measure;
+- cloning thresholds from a main start point into a PR branch;
+- failing CI through `--error-on-alert`;
+- percentage thresholds for coarse performance trend alerts;
+- static thresholds for simple absolute deterministic budgets;
+- multi-measure reports through Bencher Metric Format JSON;
+- local self-hosting through Docker.
+
+Bencher did not model our primary wall-clock gate:
+
+- same-run base/head paired samples are not first-class;
+- multiple files in one report become iterations, not paired comparisons;
+- alerting compares scalar metric values against thresholds;
+- stored lower/upper metric fields are not treated as paired evidence
+ intervals for gating;
+- comments and checks would be Bencher-shaped alerts, not our semantic PR
+ report with paired `n` and delta evidence intervals.
+
+### Decision
+
+Bencher is not the authority for PR merge gates.
+
+Allowed use:
+
+- optional trend backend;
+- historical dashboards;
+- coarse scheduled alerts;
+- export target for already-computed metrics, including paired summary metrics
+ and deterministic budget ratios.
+
+Disallowed use:
+
+- replacing the GitHub-native PR comment;
+- replacing paired wall-clock gate decisions;
+- replacing deterministic budget evaluation when budgets are metric-specific.
+
+The native `ci-measure` engine should own gate semantics. A future Bencher
+exporter can publish selected observations after `ci-measure compare` has
+produced the authoritative decision.
diff --git a/context/ci-measurements.md b/context/ci-measurements.md
index 6a1f63f25..00428ebfd 100644
--- a/context/ci-measurements.md
+++ b/context/ci-measurements.md
@@ -73,6 +73,12 @@ This keeps probe-specific collection code separate from the reusable regression
system. A new probe should not fork comparison, markdown rendering, or asset
publication logic.
+The reusable engine boundary is specified in
+[ci-measurement-engine.md](./ci-measurement-engine.md). The long-term direction
+is to keep this artifact and comment contract as the source of truth while
+moving comparison and rendering out of generated shell/jq snippets into a typed
+native CLI.
+
## Gate Semantics
Deterministic observations use `comparisonMode: "budget"`.
@@ -243,3 +249,19 @@ Reports must distinguish raw movement from actionable evidence.
This prevents a large historical wall-clock delta from looking like a proven
PR regression when the measurement lacks causal evidence.
+
+## External Tools
+
+External benchmarking tools may complement this system, but they do not replace
+the merge-gate contract.
+
+- Bencher-like systems may store historical trends, apply threshold models, and
+ provide dashboards.
+- CodSpeed-like instrumentation may be useful for language-level benchmark
+ suites whose execution model matches the tool.
+- OTEL backends remain diagnostic evidence for explaining where time went.
+- GitHub comments remain the human review surface for PR decisions.
+
+For wall-clock PR gates, the authoritative evidence is still same-run paired
+base/head samples emitted in `measurements.json`. For deterministic quantities,
+the authoritative evidence is the comparable value and its configured budget.
From aa2130a1329ead50bd5a88954582efe2e7421f36 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Tue, 19 May 2026 10:02:43 +0200
Subject: [PATCH 48/81] Clarify quick check measurement workloads
---
.github/workflows/ci.yml | 42 +++++++++++------
.github/workflows/ci.yml.genie.ts | 30 ++++++++++--
context/ci-measurement-engine.md | 8 ++++
context/ci-measurements.md | 15 ++++++
genie/ci-workflow/measurements.ts | 78 +++++++++++++++++++++++++------
5 files changed, 143 insertions(+), 30 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 020756f98..634ab1329 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2423,7 +2423,7 @@ jobs:
BASELINE_SEED_RUNS_JSON: '[{"runId":"25959801150","label":"PR #655","sha":"df0420cd0397ffc6928d3c6ccc9c23052d6bc255","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959802067","label":"PR #657","sha":"62833cba5d83b1c13462728edeafa684e61c006f","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959802958","label":"PR #656","sha":"21029998522a0e9435df151259611650fb948a20","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959803805","label":"PR #651","sha":"95515f971b27ef279e39c982f52e46cf9e8270e9","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959804678","label":"PR #654","sha":"58e96b9a2b87b3703de6920b6d9571f3805d0171","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959805512","label":"PR #653","sha":"d1cca16339f19d7e1a27b001edc4c2c7ecd13dc4","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959806473","label":"PR #652","sha":"acd6c63f5e235e7e5f2710fc62b2231e0ba904a6","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959807303","label":"PR #648","sha":"a5a07703ff951fb7396a40844e9491d88ed40edf","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959808097","label":"PR 
#649","sha":"360ff47c59a206064711dfcb6c610afd0e6b0d53","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959808775","label":"PR #647","sha":"8d1810b2c359ae95f245e56329018aab5020f8c0","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959809449","label":"PR #646","sha":"89e1396766ccd2a813680acd440cb78f540ca6c1","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959810069","label":"PR #643","sha":"239715520370436901a3f2218d162dc7b12f4b4c","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959810666","label":"PR #641","sha":"6b3751b4684ba45f496f1a1bff8b86ef6ba8275b","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959811321","label":"PR #640","sha":"fed50ae2502ac0a65395bbef5af43fcf384d5d04","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959811864","label":"PR #639","sha":"0e03df2c6f20e4d154f286fd69a4e2980d21a12d","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959812634","label":"PR #636","sha":"7efdbee4b571f2c80f5b6173bc9a84b51fbef5eb","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959813189","label":"PR 
#638","sha":"350d1b98baa943dcae63412eeffded7b5160bc8a","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959813761","label":"PR #637","sha":"f25336193b9f6b042eb027eca27acc4cc75a69d6","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959814335","label":"PR #634","sha":"4ba441d4ad8b6c49e9ee03d9cdfd2f04a129b714","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959814835","label":"PR #632","sha":"1ad5fd735c7f45ad5e07c8033e5b68a642ada69c","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."}]'
BASELINE_MAX_RUNS: '20'
BASELINE_MAX_CANDIDATE_RUNS: '60'
- BASELINE_REQUIRED_OBSERVATIONS_JSON: '[{"id":"devenv.shell_eval_warm.duration","minSources":10},{"id":"devenv.tasks_list.duration","minSources":10},{"id":"devenv.processes_help.duration","minSources":10},{"id":"devenv.task_pnpm_install.duration","minSources":10},{"id":"devenv.task_genie_run.duration","minSources":10},{"id":"devenv.task_check_quick.duration","minSources":10},{"id":"devenv.genie_check_direct.duration","minSources":10}]'
+ BASELINE_REQUIRED_OBSERVATIONS_JSON: '[{"id":"devenv.shell_eval_warm.duration","minSources":10},{"id":"devenv.tasks_list.duration","minSources":10},{"id":"devenv.processes_help.duration","minSources":10},{"id":"devenv.task_pnpm_install.duration","minSources":10},{"id":"devenv.task_genie_run.duration","minSources":10},{"id":"devenv.task_check_quick_warm.duration","minSources":10},{"id":"devenv.task_check_quick_forced.duration","minSources":10},{"id":"devenv.genie_check_direct.duration","minSources":10}]'
run: |
set -euo pipefail
@@ -2702,6 +2702,7 @@ jobs:
local stderr="$8"
local trace="$9"
local gate_policy="${10}"
+ local metadata_json="${11}"
local samples_file="$ARTIFACT_DIR/$id.samples.json"
if [ "$first" -eq 0 ]; then
@@ -2721,6 +2722,7 @@ jobs:
--arg stderr "$stderr" \
--arg trace "$trace" \
--argjson gatePolicy "$gate_policy" \
+ --argjson metadata "$metadata_json" \
'def median:
sort as $sorted
| ($sorted | length) as $count
@@ -2762,7 +2764,8 @@ jobs:
durationMs:$durationMs,
stdout:$stdout,
stderr:$stderr,
- trace:(if $trace == "" then null else $trace end),
+ trace:(if $trace == "" then null else $trace end),
+ metadata:$metadata,
gatePolicy:$gatePolicy,
statistics: {
sampleCount: ($sampleList | length),
@@ -2805,7 +2808,8 @@ jobs:
local warmup_repetitions="$6"
local repetitions="$7"
local gate_policy="$8"
- shift 8
+ local metadata_json="$9"
+ shift 9
case "$trace_file" in
'$ARTIFACT_DIR'*) trace_file="${ARTIFACT_DIR}${trace_file#'$ARTIFACT_DIR'}" ;;
esac
@@ -2973,7 +2977,7 @@ jobs:
cp "$stdout" "$ARTIFACT_DIR/$id.stdout" 2>/dev/null || true
cp "$stderr" "$ARTIFACT_DIR/$id.stderr" 2>/dev/null || true
- json_append_timing "$id" "$label" "$group" "$description" "$status" "$duration_ms" "$ARTIFACT_DIR/$id.stdout" "$ARTIFACT_DIR/$id.stderr" "$trace_file" "$gate_policy"
+ json_append_timing "$id" "$label" "$group" "$description" "$status" "$duration_ms" "$ARTIFACT_DIR/$id.stdout" "$ARTIFACT_DIR/$id.stderr" "$trace_file" "$gate_policy" "$metadata_json"
if [ "$status" -ne 0 ]; then
if [ "${CI_MEASUREMENT_ALLOW_PROBE_FAILURES:-}" = "1" ]; then
@@ -2988,14 +2992,15 @@ jobs:
fi
}
- measure 'shell_eval_traced' 'Shell eval with OTEL trace' 'devenv shell' 'Evaluates the dev shell with native devenv JSON tracing enabled.' '$ARTIFACT_DIR/traces/shell_eval_traced.json' '0' '1' '{"enabled":false,"minBaselineSources":10,"minCurrentSamples":3,"warnRatio":1.25,"failRatio":1.5,"warnAbs":1.5,"failAbs":3,"noiseFloor":0.5,"statisticalToleranceRatio":0.2,"statisticalToleranceAbs":1}' '$DEVENV_SHELL_TRACE_COMMAND'
- measure 'shell_eval_warm' 'Warm shell eval' 'devenv shell' 'Evaluates a warm dev shell without reloading direnv state.' '' '1' '5' '{"enabled":true,"comparisonMode":"paired","minPairedSamples":5,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":1,"noiseFloor":0.1,"statisticalToleranceRatio":0.1,"statisticalToleranceAbs":0.25}' '$DEVENV_BIN' 'shell' '--no-reload' '--' 'true'
- measure 'tasks_list' 'devenv tasks list' 'devenv cli' 'Lists devenv tasks to measure task graph loading overhead.' '' '1' '9' '{"enabled":true,"comparisonMode":"paired","minPairedSamples":7,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.25,"failRatio":1.5,"warnAbs":0.05,"failAbs":0.15,"noiseFloor":0.03,"statisticalToleranceRatio":0.1,"statisticalToleranceAbs":0.03}' '$DEVENV_BIN' 'tasks' 'list'
- measure 'processes_help' 'devenv processes --help' 'devenv cli' 'Loads the devenv processes command help path.' '' '1' '9' '{"enabled":true,"comparisonMode":"paired","minPairedSamples":7,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.25,"failRatio":1.5,"warnAbs":0.05,"failAbs":0.15,"noiseFloor":0.03,"statisticalToleranceRatio":0.1,"statisticalToleranceAbs":0.03}' '$DEVENV_BIN' 'processes' '--help'
- measure 'task_pnpm_install' 'pnpm install task' 'workspace setup' 'Runs the cached pnpm install devenv task.' '' '1' '5' '{"enabled":true,"comparisonMode":"paired","minPairedSamples":5,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":1,"noiseFloor":0.1,"statisticalToleranceRatio":0.1,"statisticalToleranceAbs":0.25}' '$DEVENV_BIN' 'tasks' 'run' 'pnpm:install' '--mode' 'before' '--no-tui' '--show-output'
- measure 'task_genie_run' 'Genie run task' 'genie' 'Runs the normal devenv genie:run task including its declared dependencies.' '' '1' '5' '{"enabled":true,"comparisonMode":"paired","minPairedSamples":5,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":1,"noiseFloor":0.1,"statisticalToleranceRatio":0.1,"statisticalToleranceAbs":0.25}' '$DEVENV_BIN' 'tasks' 'run' 'genie:run' '--mode' 'before' '--no-tui' '--show-output'
- measure 'task_check_quick' 'Quick check task' 'quality gates' 'Runs the fast local quality gate through devenv.' '' '1' '5' '{"enabled":true,"comparisonMode":"paired","minPairedSamples":5,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":1,"noiseFloor":0.1,"statisticalToleranceRatio":0.1,"statisticalToleranceAbs":0.25}' '$DEVENV_BIN' 'tasks' 'run' 'check:quick' '--mode' 'before' '--no-tui' '--show-output'
- measure 'genie_check_direct' 'Genie check direct' 'genie' 'Runs Genie directly in check mode to isolate generator runtime from devenv task dependency overhead.' '' '1' '5' '{"enabled":true,"comparisonMode":"paired","minPairedSamples":5,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":1,"noiseFloor":0.1,"statisticalToleranceRatio":0.1,"statisticalToleranceAbs":0.25}' '$DEVENV_BIN' 'shell' '--no-reload' '--' 'bun' 'packages/@overeng/genie/bin/genie.tsx' '--output' 'ci-plain' '--check'
+ measure 'shell_eval_traced' 'Shell eval with OTEL trace' 'devenv shell' 'Evaluates the dev shell with native devenv JSON tracing enabled.' '$ARTIFACT_DIR/traces/shell_eval_traced.json' '0' '1' '{"enabled":false,"minBaselineSources":10,"minCurrentSamples":3,"warnRatio":1.25,"failRatio":1.5,"warnAbs":1.5,"failAbs":3,"noiseFloor":0.5,"statisticalToleranceRatio":0.2,"statisticalToleranceAbs":1}' '{"path":[],"dimensions":{}}' '$DEVENV_SHELL_TRACE_COMMAND'
+ measure 'shell_eval_warm' 'Warm shell eval' 'devenv shell' 'Evaluates a warm dev shell without reloading direnv state.' '' '1' '5' '{"enabled":true,"comparisonMode":"paired","minPairedSamples":5,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":1,"noiseFloor":0.1,"statisticalToleranceRatio":0.1,"statisticalToleranceAbs":0.25}' '{"path":[],"dimensions":{}}' '$DEVENV_BIN' 'shell' '--no-reload' '--' 'true'
+ measure 'tasks_list' 'devenv tasks list' 'devenv cli' 'Lists devenv tasks to measure task graph loading overhead.' '' '1' '9' '{"enabled":true,"comparisonMode":"paired","minPairedSamples":7,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.25,"failRatio":1.5,"warnAbs":0.05,"failAbs":0.15,"noiseFloor":0.03,"statisticalToleranceRatio":0.1,"statisticalToleranceAbs":0.03}' '{"path":[],"dimensions":{}}' '$DEVENV_BIN' 'tasks' 'list'
+ measure 'processes_help' 'devenv processes --help' 'devenv cli' 'Loads the devenv processes command help path.' '' '1' '9' '{"enabled":true,"comparisonMode":"paired","minPairedSamples":7,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.25,"failRatio":1.5,"warnAbs":0.05,"failAbs":0.15,"noiseFloor":0.03,"statisticalToleranceRatio":0.1,"statisticalToleranceAbs":0.03}' '{"path":[],"dimensions":{}}' '$DEVENV_BIN' 'processes' '--help'
+ measure 'task_pnpm_install' 'pnpm install task' 'workspace setup' 'Runs the cached pnpm install devenv task.' '' '1' '5' '{"enabled":true,"comparisonMode":"paired","minPairedSamples":5,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":1,"noiseFloor":0.1,"statisticalToleranceRatio":0.1,"statisticalToleranceAbs":0.25}' '{"path":[],"dimensions":{}}' '$DEVENV_BIN' 'tasks' 'run' 'pnpm:install' '--mode' 'before' '--no-tui' '--show-output'
+ measure 'task_genie_run' 'Genie run task' 'genie' 'Runs the normal devenv genie:run task including its declared dependencies.' '' '1' '5' '{"enabled":true,"comparisonMode":"paired","minPairedSamples":5,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":1,"noiseFloor":0.1,"statisticalToleranceRatio":0.1,"statisticalToleranceAbs":0.25}' '{"path":[],"dimensions":{}}' '$DEVENV_BIN' 'tasks' 'run' 'genie:run' '--mode' 'before' '--no-tui' '--show-output'
+ measure 'task_check_quick_warm' 'Warm cached check:quick' 'quality gates' 'Runs the fast local quality gate through devenv after a warmup. This measures the cached no-op path and task/status orchestration overhead.' '' '1' '5' '{"enabled":true,"comparisonMode":"paired","minPairedSamples":5,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":1,"noiseFloor":0.1,"statisticalToleranceRatio":0.1,"statisticalToleranceAbs":0.25}' '{"path":["quality gates","check:quick"],"dimensions":{"workload":"cached-no-op","taskCacheMode":"warm"}}' '$DEVENV_BIN' 'tasks' 'run' 'check:quick' '--mode' 'before' '--no-tui' '--show-output'
+ measure 'task_check_quick_forced' 'Forced check:quick' 'quality gates' 'Runs the fast local quality gate through devenv with task-cache refresh. This measures the developer-facing quick-check workload rather than the cached no-op path.' '' '0' '3' '{"enabled":true,"comparisonMode":"paired","minPairedSamples":3,"minBaselineSources":10,"minCurrentSamples":3,"warnRatio":1.15,"failRatio":1.3,"warnAbs":1.5,"failAbs":4,"noiseFloor":0.75,"statisticalToleranceRatio":0.15,"statisticalToleranceAbs":1}' '{"path":["quality gates","check:quick"],"dimensions":{"workload":"forced-task-cache","taskCacheMode":"refresh"}}' '$DEVENV_BIN' 'tasks' 'run' 'check:quick' '--mode' 'before' '--no-tui' '--show-output' '--refresh-task-cache'
+ measure 'genie_check_direct' 'Genie check direct' 'genie' 'Runs Genie directly in check mode to isolate generator runtime from devenv task dependency overhead.' '' '1' '5' '{"enabled":true,"comparisonMode":"paired","minPairedSamples":5,"minBaselineSources":10,"minCurrentSamples":5,"warnRatio":1.1,"failRatio":1.2,"warnAbs":0.25,"failAbs":1,"noiseFloor":0.1,"statisticalToleranceRatio":0.1,"statisticalToleranceAbs":0.25}' '{"path":[],"dimensions":{}}' '$DEVENV_BIN' 'shell' '--no-reload' '--' 'bun' 'packages/@overeng/genie/bin/genie.tsx' '--output' 'ci-plain' '--check'
printf ']\n' >>"$ARTIFACT_DIR/timings.json"
@@ -3081,6 +3086,7 @@ jobs:
id: ("devenv." + .id + ".duration"),
label: .label,
group: .group,
+ path: (.metadata.path // []),
description: .description,
measurementKind: (if (.gatePolicy.enabled == false) then "diagnostic" else "wall-clock" end),
name: ("devenv." + .id + ".duration"),
@@ -3156,7 +3162,7 @@ jobs:
),
pairedDeltaSamples: ((.statistics.pairedDeltaSampleDurationMs // []) | map(. / 1000))
},
- dimensions: {
+ dimensions: ((.metadata.dimensions // {}) + {
probe: .id,
probeLabel: .label,
status: .status,
@@ -3181,7 +3187,7 @@ jobs:
phase: "warm",
devenvRev: $devenvRev,
otelServiceName: $otelServiceName
- }
+ })
})
),
artifacts: [
@@ -4052,6 +4058,8 @@ jobs:
task_pnpm_install: 'pnpm:install',
task_genie_run: 'genie:run',
task_check_quick: 'check:quick',
+ task_check_quick_warm: 'Warm cached check:quick',
+ task_check_quick_forced: 'Forced check:quick',
}
if (probe && labels[probe]) return labels[probe]
if (name.startsWith('devenv.') && name.endsWith('.duration')) {
@@ -4087,6 +4095,8 @@ jobs:
task_pnpm_install: 'pnpm:install',
task_genie_run: 'genie:run',
task_check_quick: 'check:quick',
+ task_check_quick_warm: 'Warm cached check:quick',
+ task_check_quick_forced: 'Forced check:quick',
}
if (probe && labels[probe]) return labels[probe]
return humanProbe(row)
@@ -5808,6 +5818,8 @@ jobs:
task_pnpm_install: 'pnpm:install',
task_genie_run: 'genie:run',
task_check_quick: 'check:quick',
+ task_check_quick_warm: 'Warm cached check:quick',
+ task_check_quick_forced: 'Forced check:quick',
}
if (probe && labels[probe]) return labels[probe]
if (name.startsWith('devenv.') && name.endsWith('.duration')) {
@@ -5843,6 +5855,8 @@ jobs:
task_pnpm_install: 'pnpm:install',
task_genie_run: 'genie:run',
task_check_quick: 'check:quick',
+ task_check_quick_warm: 'Warm cached check:quick',
+ task_check_quick_forced: 'Forced check:quick',
}
if (probe && labels[probe]) return labels[probe]
return humanProbe(row)
diff --git a/.github/workflows/ci.yml.genie.ts b/.github/workflows/ci.yml.genie.ts
index 1fd33e14f..a148459d2 100644
--- a/.github/workflows/ci.yml.genie.ts
+++ b/.github/workflows/ci.yml.genie.ts
@@ -315,7 +315,8 @@ const extraJobs: Record = {
sha,
source: 'manual-backfill',
artifacts: ['devenv-perf'],
- notes: 'Backfilled with the current measurement workflow for the effect-utils #658 rollout.',
+ notes:
+ 'Backfilled with the current measurement workflow for the effect-utils #658 rollout.',
})),
],
baselineMaxRuns: 20,
@@ -344,12 +345,35 @@ const extraJobs: Record = {
},
{
task: 'check:quick',
- label: 'Quick check task',
+ id: 'task_check_quick_warm',
+ label: 'Warm cached check:quick',
group: 'quality gates',
- description: 'Runs the fast local quality gate through devenv.',
+ path: ['quality gates', 'check:quick'],
+ description:
+ 'Runs the fast local quality gate through devenv after a warmup. This measures the cached no-op path and task/status orchestration overhead.',
+ dimensions: {
+ workload: 'cached-no-op',
+ taskCacheMode: 'warm',
+ },
warmupRepetitions: 1,
repetitions: 5,
},
+ {
+ task: 'check:quick',
+ id: 'task_check_quick_forced',
+ label: 'Forced check:quick',
+ group: 'quality gates',
+ path: ['quality gates', 'check:quick'],
+ description:
+ 'Runs the fast local quality gate through devenv with task-cache refresh. This measures the developer-facing quick-check workload rather than the cached no-op path.',
+ dimensions: {
+ workload: 'forced-task-cache',
+ taskCacheMode: 'refresh',
+ },
+ extraArgs: ['--refresh-task-cache'],
+ warmupRepetitions: 0,
+ repetitions: 3,
+ },
],
probes: [
{
diff --git a/context/ci-measurement-engine.md b/context/ci-measurement-engine.md
index f4901f197..ff54aced4 100644
--- a/context/ci-measurement-engine.md
+++ b/context/ci-measurement-engine.md
@@ -72,6 +72,14 @@ Every observation is interpreted through a registry entry:
The registry is the public API for cross-repo reuse. Repos may add local entries,
but they must not fork comparison semantics.
+Wall-clock registry entries should include a workload dimension when the same
+logical command can be measured under different cache conditions. For example,
+`task_check_quick_warm` and `task_check_quick_forced` intentionally share the
+semantic path `devenv / quality gates / check:quick`, but they are separate IDs
+because one measures the warm cached no-op path while the other refreshes the
+devenv task cache. This avoids false product claims such as treating a cached
+orchestration improvement as a full developer quick-check improvement.
+
## Comparison Semantics
| Kind | Merge-gate mode | Evidence model |
diff --git a/context/ci-measurements.md b/context/ci-measurements.md
index 00428ebfd..b86c98d7b 100644
--- a/context/ci-measurements.md
+++ b/context/ci-measurements.md
@@ -160,6 +160,21 @@ The comparison operates on per-pair deltas. A wall-clock row becomes gateable
only when the configured minimum paired sample count is present. Until then,
the row is partial/advisory even if the historical raw delta is large.
+Wall-clock probe IDs must name the workload they actually measure. Repeated
+warm probes are useful for shell and task-orchestration overhead, but they are
+not a proxy for an uncached developer workflow. For example:
+
+| Probe | Workload | Interpreted As |
+| ------------------------- | ------------------------ | -------------------------------------------------- |
+| `task_check_quick_warm` | Warm cached no-op path | Devenv task/status orchestration overhead. |
+| `task_check_quick_forced` | Forced task-cache refresh (`--refresh-task-cache`) | Developer-facing quick-check work on a cache miss. |
+| `shell_eval_warm` | Warm shell entry | Shell evaluation and setup overhead. |
+| `shell_eval_traced` | Trace capture diagnostic | Explanation input, not a gate. |
+
+The label and `dimensions.workload` must make this distinction visible in the
+PR comment so reviewers do not read a cached-path movement as an end-to-end
+developer speedup.
+
For PR gates, the preferred evidence protocol is `paired-delta-quantile-v1`:
```text
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index 474bd542b..181fc8a1c 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -19,6 +19,7 @@ export type CiMeasurementDescriptor = {
readonly group?: string
readonly path?: readonly string[]
readonly description?: string
+ readonly dimensions?: Record
}
export type CiMeasurementGatePolicy = {
@@ -244,7 +245,8 @@ export const ciMeasurementBaselineWorkflowDispatchInputs = {
type: 'string',
},
measurement_baseline_label: {
- description: 'Optional human label for a measurement baseline backfill run, for example PR number.',
+ description:
+ 'Optional human label for a measurement baseline backfill run, for example PR number.',
required: false,
default: '',
type: 'string',
@@ -286,6 +288,9 @@ export type DevenvPerfTaskProbe =
readonly label?: string
readonly group?: string
readonly description?: string
+ readonly path?: readonly string[]
+ readonly dimensions?: Record
+ readonly extraArgs?: readonly string[]
readonly warmupRepetitions?: number
readonly repetitions?: number
readonly gate?: CiMeasurementGatePolicy
@@ -341,6 +346,22 @@ const defaultDevenvPerfGatePolicy = (probeId: string): CiMeasurementGatePolicy =
statisticalToleranceAbs: 0.03,
}
}
+ if (probeId === 'task_check_quick_forced') {
+ return {
+ enabled: true,
+ comparisonMode: 'paired',
+ minPairedSamples: 3,
+ minBaselineSources: 10,
+ minCurrentSamples: 3,
+ warnRatio: 1.15,
+ failRatio: 1.3,
+ warnAbs: 1.5,
+ failAbs: 4,
+ noiseFloor: 0.75,
+ statisticalToleranceRatio: 0.15,
+ statisticalToleranceAbs: 1,
+ }
+ }
return {
enabled: true,
comparisonMode: 'paired',
@@ -366,11 +387,18 @@ const devenvPerfProbeLine = (probe: DevenvPerfProbe) => {
const args = probe.command.map(shellSingleQuote).join(' ')
const trace = probe.traceOutput ?? ''
const gatePolicy = devenvPerfGatePolicy(probe)
+ const metadata = JSON.stringify({
+ path: probe.path ?? [],
+ dimensions: probe.dimensions ?? {},
+ })
const defaultRepetitions = gatePolicy.enabled ? gatePolicy.minCurrentSamples : 1
const repetitions = Math.max(1, Math.floor(probe.repetitions ?? defaultRepetitions))
const defaultWarmupRepetitions = gatePolicy.enabled && repetitions > 1 ? 1 : 0
- const warmupRepetitions = Math.max(0, Math.floor(probe.warmupRepetitions ?? defaultWarmupRepetitions))
- return `measure ${shellSingleQuote(probe.id)} ${shellSingleQuote(probe.label)} ${shellSingleQuote(probe.group ?? '')} ${shellSingleQuote(probe.description ?? '')} ${shellSingleQuote(trace)} ${shellSingleQuote(String(warmupRepetitions))} ${shellSingleQuote(String(repetitions))} ${shellSingleQuote(JSON.stringify(gatePolicy))} ${args}`
+ const warmupRepetitions = Math.max(
+ 0,
+ Math.floor(probe.warmupRepetitions ?? defaultWarmupRepetitions),
+ )
+ return `measure ${shellSingleQuote(probe.id)} ${shellSingleQuote(probe.label)} ${shellSingleQuote(probe.group ?? '')} ${shellSingleQuote(probe.description ?? '')} ${shellSingleQuote(trace)} ${shellSingleQuote(String(warmupRepetitions))} ${shellSingleQuote(String(repetitions))} ${shellSingleQuote(JSON.stringify(gatePolicy))} ${shellSingleQuote(metadata)} ${args}`
}
const defaultDevenvPerfTaskProbe = (probe: DevenvPerfTaskProbe): DevenvPerfProbe => {
@@ -379,6 +407,9 @@ const defaultDevenvPerfTaskProbe = (probe: DevenvPerfTaskProbe): DevenvPerfProbe
const label = typeof probe === 'string' ? undefined : probe.label
const group = typeof probe === 'string' ? undefined : probe.group
const description = typeof probe === 'string' ? undefined : probe.description
+ const path = typeof probe === 'string' ? undefined : probe.path
+ const dimensions = typeof probe === 'string' ? undefined : probe.dimensions
+ const extraArgs = typeof probe === 'string' ? [] : (probe.extraArgs ?? [])
const warmupRepetitions = typeof probe === 'string' ? undefined : probe.warmupRepetitions
const repetitions = typeof probe === 'string' ? undefined : probe.repetitions
const gate = typeof probe === 'string' ? undefined : probe.gate
@@ -386,11 +417,23 @@ const defaultDevenvPerfTaskProbe = (probe: DevenvPerfTaskProbe): DevenvPerfProbe
id: id ?? `task_${task.replaceAll(':', '_')}`,
label: label ?? task,
group: group ?? 'devenv tasks',
+ path,
description: description ?? `Runs the devenv task '${task}' in before mode without the TUI.`,
+ dimensions,
warmupRepetitions,
repetitions,
gate,
- command: ['$DEVENV_BIN', 'tasks', 'run', task, '--mode', 'before', '--no-tui', '--show-output'],
+ command: [
+ '$DEVENV_BIN',
+ 'tasks',
+ 'run',
+ task,
+ '--mode',
+ 'before',
+ '--no-tui',
+ '--show-output',
+ ...extraArgs,
+ ],
}
}
@@ -402,9 +445,7 @@ const devenvPerfProbes = (
label: 'Shell eval with OTEL trace',
group: 'devenv shell',
description: 'Evaluates the dev shell with native devenv JSON tracing enabled.',
- command: [
- '$DEVENV_SHELL_TRACE_COMMAND',
- ],
+ command: ['$DEVENV_SHELL_TRACE_COMMAND'],
traceOutput: '$ARTIFACT_DIR/traces/shell_eval_traced.json',
},
{
@@ -534,6 +575,7 @@ json_append_timing() {
local stderr="$8"
local trace="$9"
local gate_policy="${dollar}{10}"
+ local metadata_json="${dollar}{11}"
local samples_file="$ARTIFACT_DIR/$id.samples.json"
if [ "$first" -eq 0 ]; then
@@ -553,6 +595,7 @@ json_append_timing() {
--arg stderr "$stderr" \
--arg trace "$trace" \
--argjson gatePolicy "$gate_policy" \
+ --argjson metadata "$metadata_json" \
'def median:
sort as $sorted
| ($sorted | length) as $count
@@ -594,7 +637,8 @@ json_append_timing() {
durationMs:$durationMs,
stdout:$stdout,
stderr:$stderr,
- trace:(if $trace == "" then null else $trace end),
+ trace:(if $trace == "" then null else $trace end),
+ metadata:$metadata,
gatePolicy:$gatePolicy,
statistics: {
sampleCount: ($sampleList | length),
@@ -637,7 +681,8 @@ measure() {
local warmup_repetitions="$6"
local repetitions="$7"
local gate_policy="$8"
- shift 8
+ local metadata_json="$9"
+ shift 9
case "$trace_file" in
'$ARTIFACT_DIR'*) trace_file="${dollar}{ARTIFACT_DIR}${dollar}{trace_file#'$ARTIFACT_DIR'}" ;;
esac
@@ -805,7 +850,7 @@ measure() {
cp "$stdout" "$ARTIFACT_DIR/$id.stdout" 2>/dev/null || true
cp "$stderr" "$ARTIFACT_DIR/$id.stderr" 2>/dev/null || true
- json_append_timing "$id" "$label" "$group" "$description" "$status" "$duration_ms" "$ARTIFACT_DIR/$id.stdout" "$ARTIFACT_DIR/$id.stderr" "$trace_file" "$gate_policy"
+ json_append_timing "$id" "$label" "$group" "$description" "$status" "$duration_ms" "$ARTIFACT_DIR/$id.stdout" "$ARTIFACT_DIR/$id.stderr" "$trace_file" "$gate_policy" "$metadata_json"
if [ "$status" -ne 0 ]; then
if [ "${dollar}{CI_MEASUREMENT_ALLOW_PROBE_FAILURES:-}" = "1" ]; then
@@ -906,6 +951,7 @@ jq -n \
id: ("devenv." + .id + ".duration"),
label: .label,
group: .group,
+ path: (.metadata.path // []),
description: .description,
measurementKind: (if (.gatePolicy.enabled == false) then "diagnostic" else "wall-clock" end),
name: ("devenv." + .id + ".duration"),
@@ -981,7 +1027,7 @@ jq -n \
),
pairedDeltaSamples: ((.statistics.pairedDeltaSampleDurationMs // []) | map(. / 1000))
},
- dimensions: {
+ dimensions: ((.metadata.dimensions // {}) + {
probe: .id,
probeLabel: .label,
status: .status,
@@ -1006,7 +1052,7 @@ jq -n \
phase: "warm",
devenvRev: $devenvRev,
otelServiceName: $otelServiceName
- }
+ })
})
),
artifacts: [
@@ -1078,7 +1124,9 @@ export const downloadPreviousGitHubArtifactStep = (opts: GitHubPreviousArtifactS
BASELINE_BRANCH: opts.branch ?? '${{ github.base_ref || github.ref_name }}',
BASELINE_SEED_RUNS_JSON: ciMeasurementBaselineSeedRunsJson(opts),
BASELINE_MAX_RUNS: String(opts.maxRuns ?? 5),
- BASELINE_MAX_CANDIDATE_RUNS: String(opts.maxCandidateRuns ?? Math.max((opts.maxRuns ?? 5) * 3, 20)),
+ BASELINE_MAX_CANDIDATE_RUNS: String(
+ opts.maxCandidateRuns ?? Math.max((opts.maxRuns ?? 5) * 3, 20),
+ ),
BASELINE_REQUIRED_OBSERVATIONS_JSON: ciMeasurementRequiredObservationsJson(opts),
},
run: String.raw`set -euo pipefail
@@ -2512,6 +2560,8 @@ const humanProbe = (row) => {
task_pnpm_install: 'pnpm:install',
task_genie_run: 'genie:run',
task_check_quick: 'check:quick',
+ task_check_quick_warm: 'Warm cached check:quick',
+ task_check_quick_forced: 'Forced check:quick',
}
if (probe && labels[probe]) return labels[probe]
if (name.startsWith('devenv.') && name.endsWith('.duration')) {
@@ -2547,6 +2597,8 @@ const chartProbe = (row) => {
task_pnpm_install: 'pnpm:install',
task_genie_run: 'genie:run',
task_check_quick: 'check:quick',
+ task_check_quick_warm: 'Warm cached check:quick',
+ task_check_quick_forced: 'Forced check:quick',
}
if (probe && labels[probe]) return labels[probe]
return humanProbe(row)
From ac2ce6ea8030d6c9acb881be7f6707d2ee64d7a4 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Tue, 19 May 2026 10:14:53 +0200
Subject: [PATCH 49/81] Update measurement workflow expectations
---
.../github-workflow/ci-workflow-helpers.unit.test.ts | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
index bb25e5b81..6f91d2574 100644
--- a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
+++ b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
@@ -464,7 +464,11 @@ describe('ci workflow devenv perf helpers', () => {
expect(generatedCiWorkflowYamlSource).toContain('BASELINE_SEED_RUNS_JSON:')
expect(generatedCiWorkflowYamlSource).toContain('BASELINE_REQUIRED_OBSERVATIONS_JSON:')
expect(generatedCiWorkflowYamlSource).toContain('BASELINE_MAX_CANDIDATE_RUNS:')
- expect(generatedCiWorkflowYamlSource).toContain('"id":"devenv.task_check_quick.duration"')
+ expect(generatedCiWorkflowYamlSource).toContain('"id":"devenv.task_check_quick_warm.duration"')
+ expect(generatedCiWorkflowYamlSource).toContain(
+ '"id":"devenv.task_check_quick_forced.duration"',
+ )
+ expect(generatedCiWorkflowYamlSource).not.toContain('"id":"devenv.task_check_quick.duration"')
expect(ciWorkflowSource).toContain(
'requiredObservations?: readonly CiMeasurementRequiredBaselineObservation[]',
)
From b78ad06596bbe0433f4084a8d07278ba5415e023 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Tue, 19 May 2026 10:18:36 +0200
Subject: [PATCH 50/81] Backfill source shape measurement artifacts
---
.github/workflows/ci.yml | 8 +++++-
.github/workflows/ci.yml.genie.ts | 42 ++++++++++++++++++-------------
2 files changed, 31 insertions(+), 19 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 634ab1329..f01c5bea8 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -4572,7 +4572,6 @@ jobs:
retention-days: 30
timeout-minutes: 30
source-shape:
- if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
runs-on:
[namespace-profile-linux-x86-64, 'namespace-features:github.run-id=${{ github.run_id }}']
timeout-minutes: 30
@@ -4591,6 +4590,11 @@ jobs:
CI_MEASUREMENT_ALLOW_PROBE_FAILURES: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && '1' || '' }}
steps:
- uses: actions/checkout@v6
+ - name: Checkout CI measurement baseline ref
+ if: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' }}
+ uses: actions/checkout@v6
+ with:
+ ref: ${{ inputs.measurement_baseline_ref }}
- name: 'Download previous artifact: source-shape'
shell: bash
env:
@@ -4799,6 +4803,7 @@ jobs:
echo "Downloaded $(wc -l <"$downloaded_runs_file" | tr -d ' ') baseline artifact(s), latest $artifact_name from run $run_id into $BASELINE_OUTPUT_DIR"
+ if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
- name: 'Measure source shape: effect-utils'
shell: bash
env:
@@ -6320,6 +6325,7 @@ jobs:
exit "$exit_code"
fi
+ if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
- name: 'Upload CI measurements: source-shape'
if: always()
uses: actions/upload-artifact@v4
diff --git a/.github/workflows/ci.yml.genie.ts b/.github/workflows/ci.yml.genie.ts
index a148459d2..813a821af 100644
--- a/.github/workflows/ci.yml.genie.ts
+++ b/.github/workflows/ci.yml.genie.ts
@@ -408,7 +408,6 @@ const extraJobs: Record = {
'timeout-minutes': jobTimeoutMinutes,
},
'source-shape': {
- if: normalCiIf,
'runs-on': namespaceRunner({
profile: 'namespace-profile-linux-x86-64',
runId: '${{ github.run_id }}',
@@ -419,11 +418,15 @@ const extraJobs: Record = {
env: ciMeasurementSubjectEnv,
steps: [
checkoutStep(),
- downloadPreviousGitHubArtifactStep({
- artifactName: 'source-shape',
- outputDir: `${sourceShapeMeasurementsDir}/baseline`,
- maxRuns: 20,
- }),
+ ciMeasurementBaselineCheckoutStep,
+ {
+ ...downloadPreviousGitHubArtifactStep({
+ artifactName: 'source-shape',
+ outputDir: `${sourceShapeMeasurementsDir}/baseline`,
+ maxRuns: 20,
+ }),
+ if: normalCiIf,
+ },
sourceShapeMeasurementStep({
artifactDir: `${sourceShapeMeasurementsDir}/current/effect-utils`,
targetId: 'effect_utils',
@@ -458,18 +461,21 @@ const extraJobs: Record = {
},
],
}),
- compareCiMeasurementsStep({
- currentDir: `${sourceShapeMeasurementsDir}/current`,
- baselineDir: `${sourceShapeMeasurementsDir}/baseline`,
- outputFile: `${sourceShapeMeasurementsDir}/measurement-comparison.json`,
- regressionMode: 'warn',
- prComment: {
- enabled: true,
- title: 'Source Shape Measurements',
- maxRows: 12,
- maxHistory: 20,
- },
- }),
+ {
+ ...compareCiMeasurementsStep({
+ currentDir: `${sourceShapeMeasurementsDir}/current`,
+ baselineDir: `${sourceShapeMeasurementsDir}/baseline`,
+ outputFile: `${sourceShapeMeasurementsDir}/measurement-comparison.json`,
+ regressionMode: 'warn',
+ prComment: {
+ enabled: true,
+ title: 'Source Shape Measurements',
+ maxRows: 12,
+ maxHistory: 20,
+ },
+ }),
+ if: normalCiIf,
+ },
ciMeasurementsArtifactStep({
artifactName: 'source-shape',
path: sourceShapeMeasurementsDir,
From 4a4888f771574ff0011bad537a34958f94ac6847 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Tue, 19 May 2026 10:20:24 +0200
Subject: [PATCH 51/81] Seed source shape baseline artifact
---
.github/workflows/ci.yml | 2 +-
.github/workflows/ci.yml.genie.ts | 11 +++++++++++
2 files changed, 12 insertions(+), 1 deletion(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index f01c5bea8..5d41b3822 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -4603,7 +4603,7 @@ jobs:
BASELINE_OUTPUT_DIR: tmp/source-shape-ci/baseline
BASELINE_WORKFLOW_NAME: ${{ github.workflow }}
BASELINE_BRANCH: ${{ github.base_ref || github.ref_name }}
- BASELINE_SEED_RUNS_JSON: '[]'
+ BASELINE_SEED_RUNS_JSON: '[{"runId":"26085158592","label":"main baseline","sha":"ce7cf8f8ebfaa1da6c7e9122cd195a5f95ce2fca","source":"manual-backfill","artifacts":["source-shape"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."}]'
BASELINE_MAX_RUNS: '20'
BASELINE_MAX_CANDIDATE_RUNS: '60'
BASELINE_REQUIRED_OBSERVATIONS_JSON: '[]'
diff --git a/.github/workflows/ci.yml.genie.ts b/.github/workflows/ci.yml.genie.ts
index 813a821af..f66b9215d 100644
--- a/.github/workflows/ci.yml.genie.ts
+++ b/.github/workflows/ci.yml.genie.ts
@@ -423,6 +423,17 @@ const extraJobs: Record = {
...downloadPreviousGitHubArtifactStep({
artifactName: 'source-shape',
outputDir: `${sourceShapeMeasurementsDir}/baseline`,
+ seedRuns: [
+ {
+ runId: '26085158592',
+ label: 'main baseline',
+ sha: 'ce7cf8f8ebfaa1da6c7e9122cd195a5f95ce2fca',
+ source: 'manual-backfill',
+ artifacts: ['source-shape'],
+ notes:
+ 'Backfilled with the current measurement workflow for the effect-utils #658 rollout.',
+ },
+ ],
maxRuns: 20,
}),
if: normalCiIf,
From 8a9bed2b655851f3b747a77f7e2b07d9ec30f8a4 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Tue, 19 May 2026 10:32:42 +0200
Subject: [PATCH 52/81] Rank measurement comments by semantic impact
---
.github/workflows/ci.yml | 24 ++++++++++++++++++------
genie/ci-workflow/measurements.ts | 12 +++++++++---
2 files changed, 27 insertions(+), 9 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 5d41b3822..6b6c0d24e 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -4114,14 +4114,20 @@ jobs:
const rank = (row) => {
if (row.status === 'fail') return 0
if (row.status === 'warn') return 1
- if (row.status === 'missing_baseline') return 2
- return 3
+ if (row.status === 'missing_baseline') return 3
+ return 2
}
const allRows = Object.values(comparison.comparisons || {}).sort((left, right) => {
const byRank = rank(left) - rank(right)
if (byRank !== 0) return byRank
- return (right.delta || 0) - (left.delta || 0)
+ const leftImpact = typeof left.semanticImpactScore === 'number' ? Math.abs(left.semanticImpactScore) : 0
+ const rightImpact = typeof right.semanticImpactScore === 'number' ? Math.abs(right.semanticImpactScore) : 0
+ if (rightImpact !== leftImpact) return rightImpact - leftImpact
+ const leftDelta = typeof left.delta === 'number' ? Math.abs(left.delta) : 0
+ const rightDelta = typeof right.delta === 'number' ? Math.abs(right.delta) : 0
+ if (rightDelta !== leftDelta) return rightDelta - leftDelta
+ return humanProbe(left).localeCompare(humanProbe(right))
})
const protocolLabel = (() => {
const protocols = new Set(
@@ -5879,14 +5885,20 @@ jobs:
const rank = (row) => {
if (row.status === 'fail') return 0
if (row.status === 'warn') return 1
- if (row.status === 'missing_baseline') return 2
- return 3
+ if (row.status === 'missing_baseline') return 3
+ return 2
}
const allRows = Object.values(comparison.comparisons || {}).sort((left, right) => {
const byRank = rank(left) - rank(right)
if (byRank !== 0) return byRank
- return (right.delta || 0) - (left.delta || 0)
+ const leftImpact = typeof left.semanticImpactScore === 'number' ? Math.abs(left.semanticImpactScore) : 0
+ const rightImpact = typeof right.semanticImpactScore === 'number' ? Math.abs(right.semanticImpactScore) : 0
+ if (rightImpact !== leftImpact) return rightImpact - leftImpact
+ const leftDelta = typeof left.delta === 'number' ? Math.abs(left.delta) : 0
+ const rightDelta = typeof right.delta === 'number' ? Math.abs(right.delta) : 0
+ if (rightDelta !== leftDelta) return rightDelta - leftDelta
+ return humanProbe(left).localeCompare(humanProbe(right))
})
const protocolLabel = (() => {
const protocols = new Set(
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index 181fc8a1c..5fafc4062 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -2616,14 +2616,20 @@ const dimensions = (row) => {
const rank = (row) => {
if (row.status === 'fail') return 0
if (row.status === 'warn') return 1
- if (row.status === 'missing_baseline') return 2
- return 3
+ if (row.status === 'missing_baseline') return 3
+ return 2
}
const allRows = Object.values(comparison.comparisons || {}).sort((left, right) => {
const byRank = rank(left) - rank(right)
if (byRank !== 0) return byRank
- return (right.delta || 0) - (left.delta || 0)
+ const leftImpact = typeof left.semanticImpactScore === 'number' ? Math.abs(left.semanticImpactScore) : 0
+ const rightImpact = typeof right.semanticImpactScore === 'number' ? Math.abs(right.semanticImpactScore) : 0
+ if (rightImpact !== leftImpact) return rightImpact - leftImpact
+ const leftDelta = typeof left.delta === 'number' ? Math.abs(left.delta) : 0
+ const rightDelta = typeof right.delta === 'number' ? Math.abs(right.delta) : 0
+ if (rightDelta !== leftDelta) return rightDelta - leftDelta
+ return humanProbe(left).localeCompare(humanProbe(right))
})
const protocolLabel = (() => {
const protocols = new Set(
From b5ff591c8c7d3ee08720b668261c83341d273db3 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Tue, 19 May 2026 10:44:04 +0200
Subject: [PATCH 53/81] Clarify below-threshold measurement improvements
---
.github/workflows/ci.yml | 20 ++++++++++++++++----
genie/ci-workflow/measurements.ts | 10 ++++++++--
2 files changed, 24 insertions(+), 6 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 6b6c0d24e..32da7f7b8 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -4013,12 +4013,18 @@ jobs:
tone: 'neutral',
color: '#94a3b8',
}
- if (row.direction === 'improved') return {
+ if (row.direction === 'improved' && typeof row.semanticImpactScore === 'number' && row.semanticImpactScore <= -1) return {
label: 'Meaningfully lower',
- detail: 'Lower than baseline by more than the noise floor and outside normal range.',
+ detail: 'Lower than baseline by enough to cross the configured review threshold.',
tone: 'good',
color: '#10b981',
}
+ if (row.direction === 'improved') return {
+ label: 'Slightly lower, ok',
+ detail: 'Lower than baseline, but still inside the configured review budget.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
if (row.direction === 'regressed') return {
label: 'Slightly higher, ok',
detail: 'Higher than baseline but still inside the configured budget.',
@@ -5784,12 +5790,18 @@ jobs:
tone: 'neutral',
color: '#94a3b8',
}
- if (row.direction === 'improved') return {
+ if (row.direction === 'improved' && typeof row.semanticImpactScore === 'number' && row.semanticImpactScore <= -1) return {
label: 'Meaningfully lower',
- detail: 'Lower than baseline by more than the noise floor and outside normal range.',
+ detail: 'Lower than baseline by enough to cross the configured review threshold.',
tone: 'good',
color: '#10b981',
}
+ if (row.direction === 'improved') return {
+ label: 'Slightly lower, ok',
+ detail: 'Lower than baseline, but still inside the configured review budget.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
if (row.direction === 'regressed') return {
label: 'Slightly higher, ok',
detail: 'Higher than baseline but still inside the configured budget.',
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index 5fafc4062..325f46b68 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -2515,12 +2515,18 @@ const interpretation = (row) => {
tone: 'neutral',
color: '#94a3b8',
}
- if (row.direction === 'improved') return {
+ if (row.direction === 'improved' && typeof row.semanticImpactScore === 'number' && row.semanticImpactScore <= -1) return {
label: 'Meaningfully lower',
- detail: 'Lower than baseline by more than the noise floor and outside normal range.',
+ detail: 'Lower than baseline by enough to cross the configured review threshold.',
tone: 'good',
color: '#10b981',
}
+ if (row.direction === 'improved') return {
+ label: 'Slightly lower, ok',
+ detail: 'Lower than baseline, but still inside the configured review budget.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
if (row.direction === 'regressed') return {
label: 'Slightly higher, ok',
detail: 'Higher than baseline but still inside the configured budget.',
From 0468728b2ac62b1cd486defdc0e8526daa0f6d9a Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Tue, 19 May 2026 12:02:28 +0200
Subject: [PATCH 54/81] Add reusable Nix closure measurement helpers
---
genie/ci-workflow.ts | 6 +
genie/ci-workflow/measurements.ts | 127 +++++++++++++++++-
genie/external.ts | 7 +
.../ci-workflow-helpers.unit.test.ts | 6 +-
4 files changed, 142 insertions(+), 4 deletions(-)
diff --git a/genie/ci-workflow.ts b/genie/ci-workflow.ts
index d23f975ee..a3edf6068 100644
--- a/genie/ci-workflow.ts
+++ b/genie/ci-workflow.ts
@@ -57,10 +57,13 @@ export {
ciMeasurementsArtifactStep,
ciMeasurementsCommentPermissions,
compareCiMeasurementsStep,
+ defaultNixClosureMeasurementBuckets,
downloadPreviousGitHubArtifactStep,
devenvPerfArtifactStep,
devenvPerfBenchmarkStep,
devenvPerfJob,
+ nixClosureMeasurementSteps,
+ nixClosureMeasurementsJob,
nixClosureMeasurementStep,
sourceShapeMeasurementStep,
type CiMeasurementDescriptor,
@@ -73,6 +76,9 @@ export {
type GitHubPreviousArtifactStepOptions,
type NixClosureMeasurementBucket,
type NixClosureMeasurementStepOptions,
+ type NixClosureMeasurementTarget,
+ type NixClosureMeasurementsJobOptions,
+ type NixClosureMeasurementsStepsOptions,
type SourceShapeMeasurementScope,
type SourceShapeMeasurementStepOptions,
} from './ci-workflow/measurements.ts'
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index 325f46b68..8a8943da9 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -151,10 +151,50 @@ export type NixClosureMeasurementStepOptions = {
readonly targetName?: string
readonly targetLabel?: string
readonly targetGroup?: string
+ readonly targetPath?: readonly string[]
+ readonly targetDescription?: string
readonly targetSystem?: string
readonly artifactDir?: string
readonly artifactFile?: string
readonly buckets?: readonly NixClosureMeasurementBucket[]
+ readonly gate?: CiMeasurementGatePolicy
+}
+
+export type NixClosureMeasurementTarget = {
+ readonly installable: string
+ readonly id: string
+ readonly name?: string
+ readonly label: string
+ readonly group: string
+ readonly path?: readonly string[]
+ readonly description: string
+ readonly system?: string
+ readonly buckets?: readonly NixClosureMeasurementBucket[]
+ readonly gate?: CiMeasurementGatePolicy
+}
+
+export type NixClosureMeasurementsStepsOptions = {
+ readonly artifactDir?: string
+ readonly artifactName: string
+ readonly baselineArtifactName?: string
+ readonly baselineSeedRuns?: readonly CiMeasurementBaselineSeedRun[]
+ readonly baselineSeedRunIds?: readonly string[]
+ readonly baselineMaxRuns?: number
+ readonly baselineMaxCandidateRuns?: number
+ readonly targets: readonly [NixClosureMeasurementTarget, ...NixClosureMeasurementTarget[]]
+ readonly buckets?: readonly NixClosureMeasurementBucket[]
+ readonly retentionDays?: number
+ readonly regressionMode?: 'off' | 'warn' | 'fail'
+ readonly prComment?: CiMeasurementsComparisonStepOptions['prComment']
+}
+
+export type NixClosureMeasurementsJobOptions = NixClosureMeasurementsStepsOptions & {
+ readonly runsOn?: readonly string[]
+ readonly setupSteps?: readonly GitHubWorkflowArgs['jobs'][string]['steps'][number][]
+ readonly ifExpr?: string
+ readonly timeoutMinutes?: number
+ readonly env?: Record
+ readonly permissions?: GitHubWorkflowArgs['jobs'][string]['permissions']
}
export type SourceShapeMeasurementScope = CiMeasurementDescriptor & {
@@ -258,6 +298,12 @@ export const ciMeasurementBaselineBackfillPredicate =
export const ciMeasurementNotBaselineBackfillPredicate =
`!(${ciMeasurementBaselineBackfillPredicate})` as const
+export const defaultNixClosureMeasurementBuckets = [
+ { name: 'node', label: 'Node / pnpm', pathRegex: 'node_modules|npm-deps|pnpm' },
+ { name: 'nix-sources', label: 'Nix sources', pathRegex: '-source$' },
+ { name: 'rust', label: 'Rust', pathRegex: 'cargo|rust|rustc' },
+] as const satisfies readonly NixClosureMeasurementBucket[]
+
/** Conditional checkout step that replaces the default checkout with the baseline subject. */
export const ciMeasurementBaselineCheckoutStep = {
name: 'Checkout CI measurement baseline ref',
@@ -1369,6 +1415,10 @@ export const nixClosureMeasurementStep = (opts: NixClosureMeasurementStepOptions
const targetLabel = opts.targetLabel ?? targetName
const targetGroup = opts.targetGroup ?? 'nix closure'
const buckets = JSON.stringify(opts.buckets ?? [])
+ const targetPath = JSON.stringify(opts.targetPath ?? [])
+ const gatePolicy = JSON.stringify(opts.gate ?? {})
+ const targetDescription =
+ opts.targetDescription ?? 'Resolved Nix closure for the configured flake installable.'
const targetSystemAssignment =
opts.targetSystem === undefined
? `target_system="${dollar}{DEVENV_SYSTEM:-${dollar}{RUNNER_OS:-unknown}}"`
@@ -1389,6 +1439,7 @@ target_id=${shellSingleQuote(targetId)}
target_name=${shellSingleQuote(targetName)}
target_label=${shellSingleQuote(targetLabel)}
target_group=${shellSingleQuote(targetGroup)}
+target_description=${shellSingleQuote(targetDescription)}
artifact_file=${artifactFileAssignment}
${targetSystemAssignment}
@@ -1422,9 +1473,12 @@ jq -n \
--arg targetId "$target_id" \
--arg targetLabel "$target_label" \
--arg targetGroup "$target_group" \
+ --arg targetDescription "$target_description" \
--arg targetSystem "$target_system" \
--arg outPath "$out_path" \
--argjson buckets ${shellSingleQuote(buckets)} \
+ --argjson targetPath ${shellSingleQuote(targetPath)} \
+ --argjson gatePolicy ${shellSingleQuote(gatePolicy)} \
'
($paths[0] // []) as $closurePaths
| ($closurePaths | map(.narSize) | add // 0) as $totalNarSize
@@ -1436,6 +1490,7 @@ jq -n \
id: "nix.closure.bucket.nar_size",
label: (($bucket.label // $bucket.name) + " closure size"),
group: "nix closure buckets",
+ path: ($targetPath + ["buckets", $bucket.name]),
description: ("NAR size contributed by closure paths matching " + $bucket.pathRegex),
measurementKind: "deterministic",
unit: "bytes",
@@ -1444,6 +1499,7 @@ jq -n \
| map(select(.path | test($bucket.pathRegex)) | .narSize)
| add // 0
),
+ policy: $gatePolicy,
dimensions: { bucket: $bucket.name }
}
)) as $bucketObservations
@@ -1469,28 +1525,32 @@ jq -n \
traceId: $traceId,
runner: { name: $runnerName, os: $runnerOs, arch: $runnerArch, class: $runnerClass }
},
- target: { kind: "nix-closure", id: $targetId, name: $targetName, label: $targetLabel, group: $targetGroup, system: $targetSystem },
+ target: { kind: "nix-closure", id: $targetId, name: $targetName, label: $targetLabel, group: $targetGroup, path: $targetPath, system: $targetSystem },
observations: ([
{
id: "nix.closure.nar_size",
label: "Total closure size",
group: "nix closure",
- description: "Total NAR size for all paths in the resolved Nix closure.",
+ path: ($targetPath + ["total", "nar-size"]),
+ description: ("Total NAR size for all paths in " + $targetDescription),
name: "nix.closure.nar_size",
measurementKind: "deterministic",
unit: "bytes",
value: $totalNarSize,
+ policy: $gatePolicy,
dimensions: { bucket: "total" }
},
{
id: "nix.closure.path_count",
label: "Total closure path count",
group: "nix closure",
- description: "Number of store paths in the resolved Nix closure.",
+ path: ($targetPath + ["total", "path-count"]),
+ description: ("Number of store paths in " + $targetDescription),
name: "nix.closure.path_count",
measurementKind: "deterministic",
unit: "count",
value: $pathCount,
+ policy: $gatePolicy,
dimensions: { bucket: "total" }
}
] + $bucketObservations),
@@ -1510,6 +1570,67 @@ cat "$artifact_file"
} as const
}
+export const nixClosureMeasurementSteps = (opts: NixClosureMeasurementsStepsOptions) => {
+ const artifactDir = opts.artifactDir ?? 'tmp/nix-closure-measurements'
+ const baselineArtifactName = opts.baselineArtifactName ?? opts.artifactName
+ const buckets = opts.buckets ?? defaultNixClosureMeasurementBuckets
+
+ return [
+ downloadPreviousGitHubArtifactStep({
+ artifactName: baselineArtifactName,
+ outputDir: `${artifactDir}/baseline`,
+ seedRuns: opts.baselineSeedRuns,
+ seedRunIds: opts.baselineSeedRunIds,
+ maxRuns: opts.baselineMaxRuns,
+ maxCandidateRuns: opts.baselineMaxCandidateRuns,
+ }),
+ ...opts.targets.map((target) =>
+ nixClosureMeasurementStep({
+ installable: target.installable,
+ targetId: target.id,
+ targetName: target.name ?? target.id,
+ targetLabel: target.label,
+ targetGroup: target.group,
+ targetPath: target.path,
+ targetDescription: target.description,
+ targetSystem: target.system,
+ artifactDir: `${artifactDir}/current/${target.id}`,
+ buckets: target.buckets ?? buckets,
+ gate: target.gate,
+ }),
+ ),
+ compareCiMeasurementsStep({
+ currentDir: `${artifactDir}/current`,
+ baselineDir: `${artifactDir}/baseline`,
+ outputFile: `${artifactDir}/measurement-comparison.json`,
+ regressionMode: opts.regressionMode ?? 'warn',
+ prComment: opts.prComment,
+ }),
+ ciMeasurementsArtifactStep({
+ artifactName: opts.artifactName,
+ path: artifactDir,
+ retentionDays: opts.retentionDays,
+ }),
+ ] as const
+}
+
+export const nixClosureMeasurementsJob = (opts: NixClosureMeasurementsJobOptions) =>
+ ({
+ ...(opts.ifExpr === undefined ? {} : { if: opts.ifExpr }),
+ 'runs-on': opts.runsOn ?? linuxX64Runner,
+ ...(opts.timeoutMinutes === undefined ? {} : { 'timeout-minutes': opts.timeoutMinutes }),
+ ...(opts.permissions === undefined ? {} : { permissions: opts.permissions }),
+ defaults: bashShellDefaults,
+ env: {
+ ...standardCIEnv,
+ ...opts.env,
+ },
+ steps: [
+ ...(opts.setupSteps ?? [checkoutStep(), installNixStep(), validateNixStoreStep]),
+ ...nixClosureMeasurementSteps(opts),
+ ],
+ }) as const
+
export const sourceShapeMeasurementStep = (opts: SourceShapeMeasurementStepOptions) => {
const artifactDir = opts.artifactDir ?? 'tmp/ci-measurements'
const artifactFileAssignment =
diff --git a/genie/external.ts b/genie/external.ts
index 165b62d07..a87a56358 100644
--- a/genie/external.ts
+++ b/genie/external.ts
@@ -640,6 +640,9 @@ export {
devenvPerfArtifactStep,
devenvPerfBenchmarkStep,
devenvPerfJob,
+ defaultNixClosureMeasurementBuckets,
+ nixClosureMeasurementSteps,
+ nixClosureMeasurementsJob,
pnpmStateSetupStep,
restorePnpmStateStep,
savePnpmStateStep,
@@ -659,6 +662,10 @@ export {
type DevenvPerfJobOptions,
type DevenvPerfProbe,
type DevenvPerfTaskProbe,
+ type NixClosureMeasurementBucket,
+ type NixClosureMeasurementTarget,
+ type NixClosureMeasurementsJobOptions,
+ type NixClosureMeasurementsStepsOptions,
type NixBinaryCache,
type RunnerProfile,
} from './ci-workflow.ts'
diff --git a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
index 6f91d2574..297bc7018 100644
--- a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
+++ b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
@@ -381,7 +381,11 @@ describe('ci workflow devenv perf helpers', () => {
expect(ciWorkflowSource).toContain('export type DevenvPerfProbe')
expect(ciWorkflowSource).toContain('export type DevenvPerfTaskProbe')
expect(ciWorkflowSource).toContain('export const nixClosureMeasurementStep')
+ expect(ciWorkflowSource).toContain('export const nixClosureMeasurementSteps')
+ expect(ciWorkflowSource).toContain('export const nixClosureMeasurementsJob')
+ expect(ciWorkflowSource).toContain('export const defaultNixClosureMeasurementBuckets')
expect(ciWorkflowSource).toContain('export type NixClosureMeasurementBucket')
+ expect(ciWorkflowSource).toContain('export type NixClosureMeasurementTarget')
})
it('emits the standard warm shell and task-list probes with native trace artifacts', () => {
@@ -447,7 +451,7 @@ describe('ci workflow devenv perf helpers', () => {
expect(ciWorkflowSource).toContain('artifact_file=${artifactFileAssignment}')
expect(ciWorkflowSource).not.toContain('artifact_file=${shellSingleQuote(artifactFile)}')
expect(ciWorkflowSource).toContain(
- 'target: { kind: "nix-closure", id: $targetId, name: $targetName, label: $targetLabel, group: $targetGroup, system: $targetSystem }',
+ 'target: { kind: "nix-closure", id: $targetId, name: $targetName, label: $targetLabel, group: $targetGroup, path: $targetPath, system: $targetSystem }',
)
expect(ciWorkflowSource).toContain('nix path-info --recursive --json "$out_path"')
expect(ciWorkflowSource).toContain(
From 5978381df8a7f59675cf1c2dd29976375a37ae36 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Tue, 19 May 2026 12:05:23 +0200
Subject: [PATCH 55/81] Measure effect-utils Nix closure sizes
---
.github/workflows/ci.yml | 2240 +++++++++++++++++++++++++++++
.github/workflows/ci.yml.genie.ts | 67 +
2 files changed, 2307 insertions(+)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 32da7f7b8..42754ce29 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -4583,6 +4583,2246 @@ jobs:
if-no-files-found: error
retention-days: 30
timeout-minutes: 30
+ nix-closure-sizes:
+ if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
+ runs-on:
+ [namespace-profile-linux-x86-64, 'namespace-features:github.run-id=${{ github.run_id }}']
+ timeout-minutes: 30
+ defaults:
+ run:
+ shell: bash
+ permissions:
+ actions: read
+ contents: write
+ issues: write
+ pull-requests: write
+ env:
+ CI_MEASUREMENT_SUBJECT_REF: ${{ inputs.measurement_baseline_ref || github.event.pull_request.head.ref || github.ref }}
+ CI_MEASUREMENT_SUBJECT_SHA: ${{ inputs.measurement_baseline_ref || github.event.pull_request.head.sha || github.sha }}
+ CI_MEASUREMENT_SUBJECT_LABEL: ${{ inputs.measurement_baseline_label }}
+ CI_MEASUREMENT_ALLOW_PROBE_FAILURES: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && '1' || '' }}
+ steps:
+ - uses: actions/checkout@v6
+ - name: Checkout CI measurement baseline ref
+ if: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' }}
+ uses: actions/checkout@v6
+ with:
+ ref: ${{ inputs.measurement_baseline_ref }}
+ - name: Install Nix
+ uses: DeterminateSystems/determinate-nix-action@v3
+ with:
+ extra-conf: |
+ experimental-features = nix-command flakes
+ accept-flake-config = true
+ extra-substituters = https://devenv.cachix.org
+ extra-trusted-public-keys = devenv.cachix.org-1:w1cLUi8dv3hnoSPGAuibQv+f9TZLr6cv/Hm9XgU50cw=
+ access-tokens = github.com=${{ github.token }}
+ summarize: true
+ - name: Provide cachix CLI from nixpkgs
+ shell: bash
+ run: |
+ set -euo pipefail
+ out=$(nix build --no-link --print-out-paths nixpkgs#cachix)
+ echo "$out/bin" >> "$GITHUB_PATH"
+ - name: Enable Cachix cache
+ uses: cachix/cachix-action@v17
+ with:
+ name: overeng-effect-utils
+ authToken: ${{ secrets.CACHIX_AUTH_TOKEN }}
+ - name: Use pinned devenv from lock
+ run: |
+ DEVENV_REV=$(jq -r .nodes.devenv.locked.rev devenv.lock)
+ if [ -z "$DEVENV_REV" ] || [ "$DEVENV_REV" = "null" ]; then
+ echo '::error::devenv.lock missing .nodes.devenv.locked.rev'
+ exit 1
+ fi
+ echo "DEVENV_REV=$DEVENV_REV" >> "$GITHUB_ENV"
+ echo "Pinned devenv rev: $DEVENV_REV"
+ shell: bash
+ - name: Isolate pnpm state
+ shell: bash
+ run: |
+ echo "PNPM_STORE_DIR=${{ runner.temp }}/pnpm-store/${{ github.job }}" >> "$GITHUB_ENV"
+ echo "PNPM_HOME=${{ github.workspace }}/.pnpm-home" >> "$GITHUB_ENV"
+ - id: restore-pnpm-state
+ name: Restore pnpm state
+ uses: actions/cache/restore@v4
+ with:
+ path: |
+ ${{ github.workspace }}/.pnpm-home
+ ${{ runner.temp }}/pnpm-store/${{ github.job }}
+ key: "pnpm-state-v1-${{ runner.os }}-${{ runner.arch }}-${{ hashFiles('**/pnpm-lock.yaml') }}"
+ - name: Resolve devenv
+ run: |
+ DEVENV_REV=$(jq -r .nodes.devenv.locked.rev devenv.lock)
+ if [ -z "$DEVENV_REV" ] || [ "$DEVENV_REV" = "null" ]; then
+ echo '::error::devenv.lock missing .nodes.devenv.locked.rev'
+ exit 1
+ fi
+
+ resolve_devenv() {
+ nix build \
+ --accept-flake-config \
+ --option extra-substituters https://devenv.cachix.org \
+ --option extra-trusted-public-keys devenv.cachix.org-1:w1cLUi8dv3hnoSPGAuibQv+f9TZLr6cv/Hm9XgU50cw= \
+ --no-link \
+ --print-out-paths \
+ "github:cachix/devenv/$DEVENV_REV#devenv"
+ }
+
+ # Temporary: capture diagnostics dir for #272 root-cause analysis.
+ DIAG_ROOT="${RUNNER_TEMP:-/tmp}/nix-store-diagnostics-${GITHUB_JOB:-job}-${RUNNER_OS:-unknown}-${GITHUB_RUN_ATTEMPT:-0}"
+ mkdir -p "$DIAG_ROOT"
+ echo "NIX_STORE_DIAGNOSTICS_DIR=$DIAG_ROOT" >> "$GITHUB_ENV"
+
+ {
+ echo "timestamp_utc=$(date -u +%Y-%m-%dT%H:%M:%SZ)"
+ echo "runner_name=${RUNNER_NAME:-unknown}"
+ echo "runner_os=${RUNNER_OS:-unknown}"
+ echo "runner_arch=${RUNNER_ARCH:-unknown}"
+ echo "github_job=${GITHUB_JOB:-unknown}"
+ echo "github_run_id=${GITHUB_RUN_ID:-unknown}"
+ echo "nix_user_conf_files=${NIX_USER_CONF_FILES:-}"
+ nix --version || true
+ } > "$DIAG_ROOT/environment.txt" 2>&1
+
+ if ! DEVENV_OUT=$(resolve_devenv 2> >(tee "$DIAG_ROOT/resolve-devenv.log" >&2)); then
+ echo "::error::resolve_devenv failed. Last 30 lines of log:"
+ tail -30 "$DIAG_ROOT/resolve-devenv.log" || true
+ exit 1
+ fi
+ DEVENV_BIN="$DEVENV_OUT/bin/devenv"
+
+ # Fast validity check on the devenv store path (~1-2s vs ~25s for devenv info).
+ if ! nix-store --check-validity "$DEVENV_OUT" 2>/dev/null; then
+ echo "::warning::devenv store path invalid, repairing targeted path..."
+ nix-store --repair-path "$DEVENV_OUT" > "$DIAG_ROOT/nix-store-verify-repair.log" 2>&1 || true
+ rm -rf "${XDG_CACHE_HOME:-$HOME/.cache}"/nix/eval-cache-* ~/.cache/nix/eval-cache-*
+ if ! DEVENV_OUT=$(resolve_devenv 2> >(tee "$DIAG_ROOT/resolve-devenv-post-repair.log" >&2)); then
+ echo "::error::resolve_devenv failed after repair. Last 30 lines of log:"
+ tail -30 "$DIAG_ROOT/resolve-devenv-post-repair.log" || true
+ exit 1
+ fi
+ DEVENV_BIN="$DEVENV_OUT/bin/devenv"
+ fi
+
+ echo "DEVENV_BIN=$DEVENV_BIN" >> "$GITHUB_ENV"
+ "$DEVENV_BIN" version | tee "$DIAG_ROOT/devenv-version.txt"
+ shell: bash
+ - name: Evict cached pnpm deps for oxlint-npm
+ shell: bash
+ run: |
+ targetRef='.#oxlint-npm'
+ entriesJson=$(mktemp)
+ if nix eval --json "$targetRef.passthru.depsBuildEntries" >"$entriesJson" 2>/dev/null; then
+ while IFS=$'\t' read -r attrName drv; do
+ [ -n "$drv" ] || continue
+ while IFS= read -r outPath; do
+ [ -n "$outPath" ] || continue
+ if nix path-info "$outPath" >/dev/null 2>&1; then
+ echo "evicting cached: $(basename "$outPath")"
+ if ! nix store delete --ignore-liveness "$outPath" >/dev/null 2>&1; then
+ echo "::error::failed to evict cached pnpm-deps output: $outPath"
+ exit 1
+ fi
+ if nix path-info "$outPath" >/dev/null 2>&1; then
+ echo "::error::cached pnpm-deps output still present after eviction: $outPath"
+ exit 1
+ fi
+ fi
+ done < <(nix-store -q --outputs "$drv" 2>/dev/null || true)
+ done < <(jq -r '.[] | [.attrName, (.drvPath // "")] | @tsv' "$entriesJson")
+ else
+ topDrv=$(nix path-info --derivation "$targetRef" 2>/dev/null || true)
+ if [ -n "$topDrv" ]; then
+ while IFS= read -r drv; do
+ [ -n "$drv" ] || continue
+ attrName=""
+ while IFS= read -r outPath; do
+ [ -n "$outPath" ] || continue
+ if nix path-info "$outPath" >/dev/null 2>&1; then
+ echo "evicting cached: $(basename "$outPath")"
+ if ! nix store delete --ignore-liveness "$outPath" >/dev/null 2>&1; then
+ echo "::error::failed to evict cached pnpm-deps output: $outPath"
+ exit 1
+ fi
+ if nix path-info "$outPath" >/dev/null 2>&1; then
+ echo "::error::cached pnpm-deps output still present after eviction: $outPath"
+ exit 1
+ fi
+ fi
+ done < <(nix-store -q --outputs "$drv" 2>/dev/null || true)
+ done < <(nix-store -qR "$topDrv" 2>/dev/null | grep "pnpm-deps-[a-z0-9-]*-v[0-9].*\.drv$" || true)
+ fi
+ fi
+ rm -f "$entriesJson"
+ - name: Force diagnostics failure (debug)
+ if: ${{ github.event_name == 'workflow_dispatch' && (inputs.debug_force_nix_diagnostics_failure == true || inputs.debug_force_nix_diagnostics_failure == 'true') }}
+ shell: bash
+ run: |
+ diag_dir="${NIX_STORE_DIAGNOSTICS_DIR:-${RUNNER_TEMP:-/tmp}/nix-store-diagnostics-missing}"
+ mkdir -p "$diag_dir"
+ cat > "$diag_dir/synthetic-signature.log" <<'EOF'
+ Failed to convert config.cachix to JSON
+ ... while evaluating the option `cachix.package`
+ error: path '/nix/store/synthetic-invalid-path' is not valid
+ EOF
+ echo "::warning::Intentional failure for diagnostics validation (#272)"
+ exit 1
+ - name: 'Download previous artifact: nix-closure-measurements'
+ shell: bash
+ env:
+ GH_TOKEN: ${{ github.token }}
+ BASELINE_ARTIFACT_NAME: nix-closure-measurements
+ BASELINE_OUTPUT_DIR: tmp/nix-closure-ci/baseline
+ BASELINE_WORKFLOW_NAME: ${{ github.workflow }}
+ BASELINE_BRANCH: ${{ github.base_ref || github.ref_name }}
+ BASELINE_SEED_RUNS_JSON: '[]'
+ BASELINE_MAX_RUNS: '20'
+ BASELINE_MAX_CANDIDATE_RUNS: '60'
+ BASELINE_REQUIRED_OBSERVATIONS_JSON: '[]'
+ run: |
+ set -euo pipefail
+
+ mkdir -p "$BASELINE_OUTPUT_DIR"
+
+ if command -v gh >/dev/null 2>&1; then
+ GH_BIN="$(command -v gh)"
+ else
+ echo "::notice::gh is not on PATH; resolving GitHub CLI through Nix"
+ if ! GH_BIN="$(nix build --no-link --print-out-paths nixpkgs#gh 2>/dev/null)/bin/gh"; then
+ echo "::notice::unable to resolve GitHub CLI through Nix; skipping previous artifact download"
+ exit 0
+ fi
+ fi
+ echo "Using GitHub CLI: $GH_BIN"
+
+ repo="${GITHUB_REPOSITORY:?GITHUB_REPOSITORY not set}"
+ workflow="${BASELINE_WORKFLOW_NAME:-CI}"
+ branch="${BASELINE_BRANCH:-${GITHUB_BASE_REF:-${GITHUB_REF_NAME:-main}}}"
+ seed_runs_file="$BASELINE_OUTPUT_DIR/baseline-seed-runs.json"
+ required_observations_file="$BASELINE_OUTPUT_DIR/baseline-required-observations.json"
+ printf '%s' "${BASELINE_SEED_RUNS_JSON:-[]}" >"$seed_runs_file"
+ printf '%s' "${BASELINE_REQUIRED_OBSERVATIONS_JSON:-[]}" >"$required_observations_file"
+ if ! jq -e 'if type == "array" then all(.[]; type == "object" and (.runId | type == "string")) else false end' \
+ "$seed_runs_file" >/dev/null; then
+ echo "::error::BASELINE_SEED_RUNS_JSON must be an array of objects with string runId fields"
+ exit 1
+ fi
+ if ! jq -e 'if type == "array" then all(.[]; type == "object" and (.id | type == "string") and (.minSources | type == "number")) else false end' \
+ "$required_observations_file" >/dev/null; then
+ echo "::error::BASELINE_REQUIRED_OBSERVATIONS_JSON must be an array of objects with string id and numeric minSources fields"
+ exit 1
+ fi
+ seed_run_ids="$(jq -r '.[].runId' "$seed_runs_file")"
+ required_observation_count="$(jq 'length' "$required_observations_file")"
+ max_candidate_runs="${BASELINE_MAX_CANDIDATE_RUNS:-${BASELINE_MAX_RUNS:-5}}"
+ if ! [[ "$max_candidate_runs" =~ ^[0-9]+$ ]] || [ "$max_candidate_runs" -lt 1 ]; then
+ max_candidate_runs=1
+ fi
+
+ candidate_runs="$(
+ "$GH_BIN" run list \
+ --repo "$repo" \
+ --workflow "$workflow" \
+ --branch "$branch" \
+ --event push \
+ --status success \
+ --json databaseId,headSha \
+ --limit "$max_candidate_runs" \
+ --jq '[.[] | select(.headSha != env.GITHUB_SHA) | .databaseId] | .[]'
+ )"
+
+ candidate_runs="$seed_run_ids
+ $candidate_runs"
+
+ max_runs="${BASELINE_MAX_RUNS:-5}"
+ if ! [[ "$max_runs" =~ ^[0-9]+$ ]] || [ "$max_runs" -lt 1 ]; then
+ max_runs=1
+ fi
+
+ # Recount the baseline observations and evaluate the required-observation rows.
+ # Reads BASELINE_OUTPUT_DIR and required_observations_file from the enclosing script and
+ # rewrites baseline-measurement-files.txt and baseline-observation-counts.json inside BASELINE_OUTPUT_DIR.
+ write_baseline_observation_counts() {
+ local measurement_index="$BASELINE_OUTPUT_DIR/baseline-measurement-files.txt"
+ local counts_file="$BASELINE_OUTPUT_DIR/baseline-observation-counts.json"
+ # Index the measurements.json files that sit exactly two levels below the output directory.
+ # "|| true" keeps a failing find/sort pipeline from aborting the caller.
+ find "$BASELINE_OUTPUT_DIR" \
+ -mindepth 2 \
+ -maxdepth 2 \
+ -name measurements.json \
+ -type f \
+ -print \
+ | sort >"$measurement_index" || true
+
+ # -s: aggregate only when the index lists at least one file.
+ if [ -s "$measurement_index" ]; then
+ xargs -r jq -s \
+ --slurpfile required "$required_observations_file" \
+ '
+ # $counts: per observation id, the number of numeric-valued observations with that id across all
+ # slurped files (an id that occurs several times inside one file is counted each time).
+ ([.[] | (.observations // [])[]? | select(.value | type == "number") | .id] | sort | group_by(.) | map({id: .[0], sources: length})) as $counts
+ | ($required[0] // []) as $requiredRows
+ | {
+ counts: $counts,
+ # Each required row is extended with its actual count (0 when the id was never observed)
+ # and with satisfied = actual count >= minSources.
+ required: (
+ $requiredRows
+ | map(. as $requiredRow | ($counts | map(select(.id == $requiredRow.id)) | .[0].sources // 0) as $actual | $requiredRow + {sources:$actual, satisfied:($actual >= $requiredRow.minSources)})
+ )
+ }
+ ' <"$measurement_index" >"$counts_file"
+ else
+ # No measurement files: empty counts, and every required row gets sources 0 and satisfied false.
+ jq -n --slurpfile required "$required_observations_file" \
+ '{counts: [], required: (($required[0] // []) | map(. + {sources:0, satisfied:false}))}' >"$counts_file"
+ fi
+ }
+
+ # Exit status 0 only when required observations are configured and every required row in the
+ # freshly rewritten baseline-observation-counts.json reports satisfied == true.
+ # NOTE(review): an empty requirement list yields 1 ("not satisfied") instead of vacuous success,
+ # so a caller that only stops on status 0 keeps going in that case; confirm this is intended.
+ baseline_requirements_satisfied() {
+ [ "$required_observation_count" -eq 0 ] && return 1
+ write_baseline_observation_counts
+ local counts_json="$BASELINE_OUTPUT_DIR/baseline-observation-counts.json"
+ jq -e '.required | all(.satisfied == true)' "$counts_json" >/dev/null
+ }
+
+ run_id=""
+ artifact_name=""
+ artifact_id=""
+ downloaded_runs_file="$BASELINE_OUTPUT_DIR/baseline-runs.jsonl"
+ seen_runs_file="$BASELINE_OUTPUT_DIR/baseline-seen-runs.txt"
+ : >"$downloaded_runs_file"
+ : >"$seen_runs_file"
+ for candidate_run in $candidate_runs; do
+ if [ -z "$candidate_run" ]; then
+ continue
+ fi
+ if grep -qxF "$candidate_run" "$seen_runs_file"; then
+ continue
+ fi
+ downloaded_count="$(wc -l <"$downloaded_runs_file" | tr -d ' ')"
+ if [ "$downloaded_count" -ge "$max_runs" ]; then
+ if baseline_requirements_satisfied; then
+ break
+ fi
+ echo "::notice::downloaded $downloaded_count baseline artifact(s), but required observation counts are not satisfied yet; continuing through bounded candidate history"
+ fi
+ if [ "$(wc -l <"$seen_runs_file" | tr -d ' ')" -ge "$max_candidate_runs" ]; then
+ break
+ fi
+ printf '%s\n' "$candidate_run" >>"$seen_runs_file"
+
+ artifact_json="$(
+ "$GH_BIN" api "repos/$repo/actions/runs/$candidate_run/artifacts" \
+ | jq --arg artifactName "$BASELINE_ARTIFACT_NAME" '.artifacts
+ | map(select(.expired == false))
+ | map(select(.name == $artifactName or (.name | startswith($artifactName + "-"))))
+ | sort_by(.created_at // "")
+ | reverse
+ | .[0] // empty'
+ )"
+
+ if [ -n "$artifact_json" ]; then
+ current_artifact_name="$(printf '%s' "$artifact_json" | jq -r '.name')"
+ current_artifact_id="$(printf '%s' "$artifact_json" | jq -r '.id')"
+ current_output_dir="$BASELINE_OUTPUT_DIR/run-$candidate_run"
+ mkdir -p "$current_output_dir"
+ if "$GH_BIN" run download "$candidate_run" \
+ --repo "$repo" \
+ --name "$current_artifact_name" \
+ --dir "$current_output_dir"; then
+ if [ -z "$run_id" ]; then
+ run_id="$candidate_run"
+ artifact_name="$current_artifact_name"
+ artifact_id="$current_artifact_id"
+ fi
+ jq -cn \
+ --arg runId "$candidate_run" \
+ --arg artifactName "$current_artifact_name" \
+ --arg artifactId "$current_artifact_id" \
+ --arg path "run-$candidate_run" \
+ '{runId:$runId, artifactName:$artifactName, artifactId:$artifactId, path:$path}' \
+ >>"$downloaded_runs_file"
+ else
+ echo "::notice::failed to download baseline artifact $current_artifact_name from run $candidate_run"
+ fi
+ fi
+ done
+
+ write_baseline_observation_counts
+
+ if [ -z "$run_id" ] || [ -z "$artifact_name" ]; then
+ echo "::notice::no successful baseline run found for $repo workflow=$workflow branch=$branch"
+ exit 0
+ fi
+
+ jq -n \
+ --slurpfile runs "$downloaded_runs_file" \
+ --slurpfile seedRuns "$seed_runs_file" \
+ --slurpfile observationCounts "$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" \
+ --argjson schemaVersion 1 \
+ --arg repository "$repo" \
+ --arg workflow "$workflow" \
+ --arg branch "$branch" \
+ --arg runId "$run_id" \
+ --arg artifactName "$artifact_name" \
+ --arg artifactId "$artifact_id" \
+ '{
+ schemaVersion: $schemaVersion,
+ source: "github-actions-artifact",
+ repository: $repository,
+ workflow: $workflow,
+ branch: $branch,
+ runId: $runId,
+ artifactName: $artifactName,
+ artifactId: $artifactId,
+ seedRuns: ($seedRuns[0] // []),
+ runs: $runs,
+ observationCounts: ($observationCounts[0] // null)
+ }' >"$BASELINE_OUTPUT_DIR/baseline-provenance.json"
+
+ echo "Downloaded $(wc -l <"$downloaded_runs_file" | tr -d ' ') baseline artifact(s), latest $artifact_name from run $run_id into $BASELINE_OUTPUT_DIR"
+
+ # Builds the .#genie installable and records its Nix closure metrics in $ARTIFACT_DIR/measurements.json.
+ - name: 'Measure Nix closure: genie'
+ shell: bash
+ env:
+ ARTIFACT_DIR: tmp/nix-closure-ci/current/genie_package
+ RUNNER_CLASS: '${{ runner.os }}-${{ runner.arch }}'
+ run: |
+ set -euo pipefail
+
+ mkdir -p "$ARTIFACT_DIR"
+ # Target metadata; each value is handed to the jq program below through --arg.
+ installable='.#genie'
+ target_id='genie_package'
+ target_name='genie'
+ target_label='Genie package'
+ target_group='packages'
+ target_description='the packaged Genie CLI closure'
+ artifact_file="$ARTIFACT_DIR/measurements.json"
+ target_system='x86_64-linux'
+
+ # Build without updating the lock file or creating a result link, and capture the printed output path.
+ out_path="$(nix build --no-update-lock-file --no-link --print-out-paths "$installable")"
+ path_info="$ARTIFACT_DIR/nix-closure-path-info.json"
+ paths_file="$ARTIFACT_DIR/nix-closure-paths.json"
+
+ # Dump path info for the whole closure, then reduce it to {path, narSize} rows.
+ # The jq filter reads the dump as an object keyed by store path; a missing narSize counts as 0.
+ nix path-info --recursive --json "$out_path" >"$path_info"
+ jq 'to_entries | map({ path: .key, narSize: (.value.narSize // 0) })' "$path_info" >"$paths_file"
+
+ # Assemble the measurement document. Environment-derived fields fall back to "unknown"
+ # or to an empty string when the variable is unset or empty.
+ jq -n \
+ --slurpfile paths "$paths_file" \
+ --argjson schemaVersion 1 \
+ --arg generatedAt "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
+ --arg repository "${GITHUB_REPOSITORY:-unknown}" \
+ --arg branchKind "${GITHUB_EVENT_NAME:-unknown}" \
+ --arg ref "${CI_MEASUREMENT_SUBJECT_REF:-${GITHUB_REF:-unknown}}" \
+ --arg headSha "${CI_MEASUREMENT_SUBJECT_SHA:-${GITHUB_SHA:-unknown}}" \
+ --arg baseSha "${GITHUB_BASE_SHA:-}" \
+ --arg runnerName "${RUNNER_NAME:-unknown}" \
+ --arg runnerOs "${RUNNER_OS:-unknown}" \
+ --arg runnerArch "${RUNNER_ARCH:-unknown}" \
+ --arg runnerClass "${RUNNER_CLASS:-unknown}" \
+ --arg githubRunId "${GITHUB_RUN_ID:-unknown}" \
+ --arg githubRunAttempt "${GITHUB_RUN_ATTEMPT:-unknown}" \
+ --arg githubJob "${GITHUB_JOB:-unknown}" \
+ --arg taskId "${CROSSTASK_TASK_ID:-}" \
+ --arg taskAttemptId "${CROSSTASK_ATTEMPT_ID:-}" \
+ --arg traceId "${TRACE_ID:-}" \
+ --arg targetName "$target_name" \
+ --arg targetId "$target_id" \
+ --arg targetLabel "$target_label" \
+ --arg targetGroup "$target_group" \
+ --arg targetDescription "$target_description" \
+ --arg targetSystem "$target_system" \
+ --arg outPath "$out_path" \
+ --argjson buckets '[{"name":"node","label":"Node / pnpm","pathRegex":"node_modules|npm-deps|pnpm"},{"name":"nix-sources","label":"Nix sources","pathRegex":"-source$"},{"name":"rust","label":"Rust","pathRegex":"cargo|rust|rustc"}]' \
+ --argjson targetPath '["nix","closures","packages","genie"]' \
+ --argjson gatePolicy '{}' \
+ '
+ # Closure totals: summed narSize and number of store paths.
+ ($paths[0] // []) as $closurePaths
+ | ($closurePaths | map(.narSize) | add // 0) as $totalNarSize
+ | ($closurePaths | length) as $pathCount
+ # One observation per bucket: the summed narSize of the paths whose store path matches the bucket regex.
+ # Buckets are matched independently, so a path matching several regexes is counted in each of them.
+ | ($buckets | map(
+ . as $bucket
+ | {
+ name: "nix.closure.bucket.nar_size",
+ id: "nix.closure.bucket.nar_size",
+ label: (($bucket.label // $bucket.name) + " closure size"),
+ group: "nix closure buckets",
+ path: ($targetPath + ["buckets", $bucket.name]),
+ description: ("NAR size contributed by closure paths matching " + $bucket.pathRegex),
+ measurementKind: "deterministic",
+ unit: "bytes",
+ value: (
+ $closurePaths
+ | map(select(.path | test($bucket.pathRegex)) | .narSize)
+ | add // 0
+ ),
+ policy: $gatePolicy,
+ dimensions: { bucket: $bucket.name }
+ }
+ )) as $bucketObservations
+ | {
+ schemaVersion: $schemaVersion,
+ generatedAt: $generatedAt,
+ producer: { name: "effect-utils-ci-measurement", version: 1 },
+ subject: {
+ repo: $repository,
+ branchKind: (if $branchKind == "" then "unknown" else $branchKind end),
+ ref: $ref,
+ headSha: $headSha,
+ baseSha: $baseSha
+ },
+ execution: {
+ # provider is "github-actions" when a run id other than "unknown" is present, otherwise "local".
+ provider: (if ($githubRunId != "" and $githubRunId != "unknown") then "github-actions" else "local" end),
+ workflow: "CI",
+ job: $githubJob,
+ runId: $githubRunId,
+ runAttempt: $githubRunAttempt,
+ taskId: $taskId,
+ attemptId: $taskAttemptId,
+ traceId: $traceId,
+ runner: { name: $runnerName, os: $runnerOs, arch: $runnerArch, class: $runnerClass }
+ },
+ target: { kind: "nix-closure", id: $targetId, name: $targetName, label: $targetLabel, group: $targetGroup, path: $targetPath, system: $targetSystem },
+ # Two total observations followed by the bucket observations; each carries $gatePolicy,
+ # which this step passes as an empty object.
+ observations: ([
+ {
+ id: "nix.closure.nar_size",
+ label: "Total closure size",
+ group: "nix closure",
+ path: ($targetPath + ["total", "nar-size"]),
+ description: ("Total NAR size for all paths in " + $targetDescription),
+ name: "nix.closure.nar_size",
+ measurementKind: "deterministic",
+ unit: "bytes",
+ value: $totalNarSize,
+ policy: $gatePolicy,
+ dimensions: { bucket: "total" }
+ },
+ {
+ id: "nix.closure.path_count",
+ label: "Total closure path count",
+ group: "nix closure",
+ path: ($targetPath + ["total", "path-count"]),
+ description: ("Number of store paths in " + $targetDescription),
+ name: "nix.closure.path_count",
+ measurementKind: "deterministic",
+ unit: "count",
+ value: $pathCount,
+ policy: $gatePolicy,
+ dimensions: { bucket: "total" }
+ }
+ ] + $bucketObservations),
+ artifacts: [
+ { name: "nix-closure-path-info", path: "nix-closure-path-info.json", contentType: "application/json" },
+ { name: "nix-closure-paths", path: "nix-closure-paths.json", contentType: "application/json" }
+ ],
+ details: {
+ outPath: $outPath,
+ # topPaths keeps the 30 largest closure paths by narSize.
+ topPaths: ($closurePaths | sort_by(.narSize) | reverse | .[:30])
+ }
+ }
+ ' >"$artifact_file"
+
+ # Print the document to the job log.
+ cat "$artifact_file"
+
+ # Builds the .#megarepo installable and records its Nix closure metrics in $ARTIFACT_DIR/measurements.json.
+ - name: 'Measure Nix closure: megarepo'
+ shell: bash
+ env:
+ ARTIFACT_DIR: tmp/nix-closure-ci/current/megarepo_package
+ RUNNER_CLASS: '${{ runner.os }}-${{ runner.arch }}'
+ run: |
+ set -euo pipefail
+
+ mkdir -p "$ARTIFACT_DIR"
+ # Target metadata; each value is handed to the jq program below through --arg.
+ installable='.#megarepo'
+ target_id='megarepo_package'
+ target_name='megarepo'
+ target_label='Megarepo package'
+ target_group='packages'
+ target_description='the packaged megarepo CLI closure'
+ artifact_file="$ARTIFACT_DIR/measurements.json"
+ target_system='x86_64-linux'
+
+ # Build without updating the lock file or creating a result link, and capture the printed output path.
+ out_path="$(nix build --no-update-lock-file --no-link --print-out-paths "$installable")"
+ path_info="$ARTIFACT_DIR/nix-closure-path-info.json"
+ paths_file="$ARTIFACT_DIR/nix-closure-paths.json"
+
+ # Dump path info for the whole closure, then reduce it to {path, narSize} rows.
+ # The jq filter reads the dump as an object keyed by store path; a missing narSize counts as 0.
+ nix path-info --recursive --json "$out_path" >"$path_info"
+ jq 'to_entries | map({ path: .key, narSize: (.value.narSize // 0) })' "$path_info" >"$paths_file"
+
+ # Assemble the measurement document. Environment-derived fields fall back to "unknown"
+ # or to an empty string when the variable is unset or empty.
+ jq -n \
+ --slurpfile paths "$paths_file" \
+ --argjson schemaVersion 1 \
+ --arg generatedAt "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
+ --arg repository "${GITHUB_REPOSITORY:-unknown}" \
+ --arg branchKind "${GITHUB_EVENT_NAME:-unknown}" \
+ --arg ref "${CI_MEASUREMENT_SUBJECT_REF:-${GITHUB_REF:-unknown}}" \
+ --arg headSha "${CI_MEASUREMENT_SUBJECT_SHA:-${GITHUB_SHA:-unknown}}" \
+ --arg baseSha "${GITHUB_BASE_SHA:-}" \
+ --arg runnerName "${RUNNER_NAME:-unknown}" \
+ --arg runnerOs "${RUNNER_OS:-unknown}" \
+ --arg runnerArch "${RUNNER_ARCH:-unknown}" \
+ --arg runnerClass "${RUNNER_CLASS:-unknown}" \
+ --arg githubRunId "${GITHUB_RUN_ID:-unknown}" \
+ --arg githubRunAttempt "${GITHUB_RUN_ATTEMPT:-unknown}" \
+ --arg githubJob "${GITHUB_JOB:-unknown}" \
+ --arg taskId "${CROSSTASK_TASK_ID:-}" \
+ --arg taskAttemptId "${CROSSTASK_ATTEMPT_ID:-}" \
+ --arg traceId "${TRACE_ID:-}" \
+ --arg targetName "$target_name" \
+ --arg targetId "$target_id" \
+ --arg targetLabel "$target_label" \
+ --arg targetGroup "$target_group" \
+ --arg targetDescription "$target_description" \
+ --arg targetSystem "$target_system" \
+ --arg outPath "$out_path" \
+ --argjson buckets '[{"name":"node","label":"Node / pnpm","pathRegex":"node_modules|npm-deps|pnpm"},{"name":"nix-sources","label":"Nix sources","pathRegex":"-source$"},{"name":"rust","label":"Rust","pathRegex":"cargo|rust|rustc"}]' \
+ --argjson targetPath '["nix","closures","packages","megarepo"]' \
+ --argjson gatePolicy '{}' \
+ '
+ # Closure totals: summed narSize and number of store paths.
+ ($paths[0] // []) as $closurePaths
+ | ($closurePaths | map(.narSize) | add // 0) as $totalNarSize
+ | ($closurePaths | length) as $pathCount
+ # One observation per bucket: the summed narSize of the paths whose store path matches the bucket regex.
+ # Buckets are matched independently, so a path matching several regexes is counted in each of them.
+ | ($buckets | map(
+ . as $bucket
+ | {
+ name: "nix.closure.bucket.nar_size",
+ id: "nix.closure.bucket.nar_size",
+ label: (($bucket.label // $bucket.name) + " closure size"),
+ group: "nix closure buckets",
+ path: ($targetPath + ["buckets", $bucket.name]),
+ description: ("NAR size contributed by closure paths matching " + $bucket.pathRegex),
+ measurementKind: "deterministic",
+ unit: "bytes",
+ value: (
+ $closurePaths
+ | map(select(.path | test($bucket.pathRegex)) | .narSize)
+ | add // 0
+ ),
+ policy: $gatePolicy,
+ dimensions: { bucket: $bucket.name }
+ }
+ )) as $bucketObservations
+ | {
+ schemaVersion: $schemaVersion,
+ generatedAt: $generatedAt,
+ producer: { name: "effect-utils-ci-measurement", version: 1 },
+ subject: {
+ repo: $repository,
+ branchKind: (if $branchKind == "" then "unknown" else $branchKind end),
+ ref: $ref,
+ headSha: $headSha,
+ baseSha: $baseSha
+ },
+ execution: {
+ # provider is "github-actions" when a run id other than "unknown" is present, otherwise "local".
+ provider: (if ($githubRunId != "" and $githubRunId != "unknown") then "github-actions" else "local" end),
+ workflow: "CI",
+ job: $githubJob,
+ runId: $githubRunId,
+ runAttempt: $githubRunAttempt,
+ taskId: $taskId,
+ attemptId: $taskAttemptId,
+ traceId: $traceId,
+ runner: { name: $runnerName, os: $runnerOs, arch: $runnerArch, class: $runnerClass }
+ },
+ target: { kind: "nix-closure", id: $targetId, name: $targetName, label: $targetLabel, group: $targetGroup, path: $targetPath, system: $targetSystem },
+ # Two total observations followed by the bucket observations; each carries $gatePolicy,
+ # which this step passes as an empty object.
+ observations: ([
+ {
+ id: "nix.closure.nar_size",
+ label: "Total closure size",
+ group: "nix closure",
+ path: ($targetPath + ["total", "nar-size"]),
+ description: ("Total NAR size for all paths in " + $targetDescription),
+ name: "nix.closure.nar_size",
+ measurementKind: "deterministic",
+ unit: "bytes",
+ value: $totalNarSize,
+ policy: $gatePolicy,
+ dimensions: { bucket: "total" }
+ },
+ {
+ id: "nix.closure.path_count",
+ label: "Total closure path count",
+ group: "nix closure",
+ path: ($targetPath + ["total", "path-count"]),
+ description: ("Number of store paths in " + $targetDescription),
+ name: "nix.closure.path_count",
+ measurementKind: "deterministic",
+ unit: "count",
+ value: $pathCount,
+ policy: $gatePolicy,
+ dimensions: { bucket: "total" }
+ }
+ ] + $bucketObservations),
+ artifacts: [
+ { name: "nix-closure-path-info", path: "nix-closure-path-info.json", contentType: "application/json" },
+ { name: "nix-closure-paths", path: "nix-closure-paths.json", contentType: "application/json" }
+ ],
+ details: {
+ outPath: $outPath,
+ # topPaths keeps the 30 largest closure paths by narSize.
+ topPaths: ($closurePaths | sort_by(.narSize) | reverse | .[:30])
+ }
+ }
+ ' >"$artifact_file"
+
+ # Print the document to the job log.
+ cat "$artifact_file"
+
+ # Builds the .#oxlint-npm installable and records its Nix closure metrics in $ARTIFACT_DIR/measurements.json.
+ - name: 'Measure Nix closure: oxlint-npm'
+ shell: bash
+ env:
+ ARTIFACT_DIR: tmp/nix-closure-ci/current/oxlint_npm_package
+ RUNNER_CLASS: '${{ runner.os }}-${{ runner.arch }}'
+ run: |
+ set -euo pipefail
+
+ mkdir -p "$ARTIFACT_DIR"
+ # Target metadata; each value is handed to the jq program below through --arg.
+ installable='.#oxlint-npm'
+ target_id='oxlint_npm_package'
+ target_name='oxlint-npm'
+ target_label='oxlint npm package'
+ target_group='packages'
+ target_description='the packaged oxlint npm compatibility wrapper closure'
+ artifact_file="$ARTIFACT_DIR/measurements.json"
+ target_system='x86_64-linux'
+
+ # Build without updating the lock file or creating a result link, and capture the printed output path.
+ out_path="$(nix build --no-update-lock-file --no-link --print-out-paths "$installable")"
+ path_info="$ARTIFACT_DIR/nix-closure-path-info.json"
+ paths_file="$ARTIFACT_DIR/nix-closure-paths.json"
+
+ # Dump path info for the whole closure, then reduce it to {path, narSize} rows.
+ # The jq filter reads the dump as an object keyed by store path; a missing narSize counts as 0.
+ nix path-info --recursive --json "$out_path" >"$path_info"
+ jq 'to_entries | map({ path: .key, narSize: (.value.narSize // 0) })' "$path_info" >"$paths_file"
+
+ # Assemble the measurement document. Environment-derived fields fall back to "unknown"
+ # or to an empty string when the variable is unset or empty.
+ jq -n \
+ --slurpfile paths "$paths_file" \
+ --argjson schemaVersion 1 \
+ --arg generatedAt "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
+ --arg repository "${GITHUB_REPOSITORY:-unknown}" \
+ --arg branchKind "${GITHUB_EVENT_NAME:-unknown}" \
+ --arg ref "${CI_MEASUREMENT_SUBJECT_REF:-${GITHUB_REF:-unknown}}" \
+ --arg headSha "${CI_MEASUREMENT_SUBJECT_SHA:-${GITHUB_SHA:-unknown}}" \
+ --arg baseSha "${GITHUB_BASE_SHA:-}" \
+ --arg runnerName "${RUNNER_NAME:-unknown}" \
+ --arg runnerOs "${RUNNER_OS:-unknown}" \
+ --arg runnerArch "${RUNNER_ARCH:-unknown}" \
+ --arg runnerClass "${RUNNER_CLASS:-unknown}" \
+ --arg githubRunId "${GITHUB_RUN_ID:-unknown}" \
+ --arg githubRunAttempt "${GITHUB_RUN_ATTEMPT:-unknown}" \
+ --arg githubJob "${GITHUB_JOB:-unknown}" \
+ --arg taskId "${CROSSTASK_TASK_ID:-}" \
+ --arg taskAttemptId "${CROSSTASK_ATTEMPT_ID:-}" \
+ --arg traceId "${TRACE_ID:-}" \
+ --arg targetName "$target_name" \
+ --arg targetId "$target_id" \
+ --arg targetLabel "$target_label" \
+ --arg targetGroup "$target_group" \
+ --arg targetDescription "$target_description" \
+ --arg targetSystem "$target_system" \
+ --arg outPath "$out_path" \
+ --argjson buckets '[{"name":"node","label":"Node / pnpm","pathRegex":"node_modules|npm-deps|pnpm"},{"name":"nix-sources","label":"Nix sources","pathRegex":"-source$"},{"name":"rust","label":"Rust","pathRegex":"cargo|rust|rustc"}]' \
+ --argjson targetPath '["nix","closures","packages","oxlint-npm"]' \
+ --argjson gatePolicy '{}' \
+ '
+ # Closure totals: summed narSize and number of store paths.
+ ($paths[0] // []) as $closurePaths
+ | ($closurePaths | map(.narSize) | add // 0) as $totalNarSize
+ | ($closurePaths | length) as $pathCount
+ # One observation per bucket: the summed narSize of the paths whose store path matches the bucket regex.
+ # Buckets are matched independently, so a path matching several regexes is counted in each of them.
+ | ($buckets | map(
+ . as $bucket
+ | {
+ name: "nix.closure.bucket.nar_size",
+ id: "nix.closure.bucket.nar_size",
+ label: (($bucket.label // $bucket.name) + " closure size"),
+ group: "nix closure buckets",
+ path: ($targetPath + ["buckets", $bucket.name]),
+ description: ("NAR size contributed by closure paths matching " + $bucket.pathRegex),
+ measurementKind: "deterministic",
+ unit: "bytes",
+ value: (
+ $closurePaths
+ | map(select(.path | test($bucket.pathRegex)) | .narSize)
+ | add // 0
+ ),
+ policy: $gatePolicy,
+ dimensions: { bucket: $bucket.name }
+ }
+ )) as $bucketObservations
+ | {
+ schemaVersion: $schemaVersion,
+ generatedAt: $generatedAt,
+ producer: { name: "effect-utils-ci-measurement", version: 1 },
+ subject: {
+ repo: $repository,
+ branchKind: (if $branchKind == "" then "unknown" else $branchKind end),
+ ref: $ref,
+ headSha: $headSha,
+ baseSha: $baseSha
+ },
+ execution: {
+ # provider is "github-actions" when a run id other than "unknown" is present, otherwise "local".
+ provider: (if ($githubRunId != "" and $githubRunId != "unknown") then "github-actions" else "local" end),
+ workflow: "CI",
+ job: $githubJob,
+ runId: $githubRunId,
+ runAttempt: $githubRunAttempt,
+ taskId: $taskId,
+ attemptId: $taskAttemptId,
+ traceId: $traceId,
+ runner: { name: $runnerName, os: $runnerOs, arch: $runnerArch, class: $runnerClass }
+ },
+ target: { kind: "nix-closure", id: $targetId, name: $targetName, label: $targetLabel, group: $targetGroup, path: $targetPath, system: $targetSystem },
+ # Two total observations followed by the bucket observations; each carries $gatePolicy,
+ # which this step passes as an empty object.
+ observations: ([
+ {
+ id: "nix.closure.nar_size",
+ label: "Total closure size",
+ group: "nix closure",
+ path: ($targetPath + ["total", "nar-size"]),
+ description: ("Total NAR size for all paths in " + $targetDescription),
+ name: "nix.closure.nar_size",
+ measurementKind: "deterministic",
+ unit: "bytes",
+ value: $totalNarSize,
+ policy: $gatePolicy,
+ dimensions: { bucket: "total" }
+ },
+ {
+ id: "nix.closure.path_count",
+ label: "Total closure path count",
+ group: "nix closure",
+ path: ($targetPath + ["total", "path-count"]),
+ description: ("Number of store paths in " + $targetDescription),
+ name: "nix.closure.path_count",
+ measurementKind: "deterministic",
+ unit: "count",
+ value: $pathCount,
+ policy: $gatePolicy,
+ dimensions: { bucket: "total" }
+ }
+ ] + $bucketObservations),
+ artifacts: [
+ { name: "nix-closure-path-info", path: "nix-closure-path-info.json", contentType: "application/json" },
+ { name: "nix-closure-paths", path: "nix-closure-paths.json", contentType: "application/json" }
+ ],
+ details: {
+ outPath: $outPath,
+ # topPaths keeps the 30 largest closure paths by narSize.
+ topPaths: ($closurePaths | sort_by(.narSize) | reverse | .[:30])
+ }
+ }
+ ' >"$artifact_file"
+
+ # Print the document to the job log.
+ cat "$artifact_file"
+
+ # Collects the current and baseline measurements.json documents into two JSON arrays
+ # for the comparison program that follows in this step.
+ - name: Compare CI measurements with baseline
+ shell: bash
+ env:
+ CI_MEASUREMENT_CURRENT_DIR: tmp/nix-closure-ci/current
+ CI_MEASUREMENT_BASELINE_DIR: tmp/nix-closure-ci/baseline
+ CI_MEASUREMENT_COMPARISON_FILE: tmp/nix-closure-ci/measurement-comparison.json
+ CI_MEASUREMENT_REGRESSION_MODE: warn
+ CI_MEASUREMENT_PR_COMMENT_ENABLED: 'true'
+ CI_MEASUREMENT_PR_COMMENT_TITLE: Nix Closure Measurements
+ CI_MEASUREMENT_PR_COMMENT_MAX_ROWS: '8'
+ CI_MEASUREMENT_PR_COMMENT_MAX_HISTORY: '20'
+ CI_MEASUREMENT_PR_COMMENT_ASSET_BRANCH: ci-measurement-assets
+ GH_TOKEN: ${{ github.token }}
+ run: |
+ set -euo pipefail
+
+ # Put the listed system directories ahead of the inherited PATH.
+ export PATH="/run/current-system/sw/bin:/usr/bin:/bin:$PATH"
+
+ # The three path settings are mandatory (the script aborts when one is unset or empty); mode defaults to "warn".
+ current_dir="${CI_MEASUREMENT_CURRENT_DIR:?CI_MEASUREMENT_CURRENT_DIR not set}"
+ baseline_dir="${CI_MEASUREMENT_BASELINE_DIR:?CI_MEASUREMENT_BASELINE_DIR not set}"
+ comparison_file="${CI_MEASUREMENT_COMPARISON_FILE:?CI_MEASUREMENT_COMPARISON_FILE not set}"
+ mode="${CI_MEASUREMENT_REGRESSION_MODE:-warn}"
+ mkdir -p "$(dirname "$comparison_file")"
+
+ # Mode "off": write a comparison stub with status "skipped" and finish successfully.
+ if [ "$mode" = "off" ]; then
+ jq -n --argjson schemaVersion 1 --arg status skipped --arg mode "$mode" \
+ '{schemaVersion:$schemaVersion,status:$status,mode:$mode,comparisons:{}}' \
+ >"$comparison_file"
+ exit 0
+ fi
+
+ # List measurements.json files while pruning directories named "baseline"; in the baseline tree
+ # the root directory itself is exempt from pruning. "|| true" tolerates a failing find/sort
+ # pipeline (for example when a directory does not exist).
+ current_index="$(mktemp)"
+ baseline_index="$(mktemp)"
+ find "$current_dir" -name baseline -type d -prune -o -name measurements.json -type f -print | sort >"$current_index" || true
+ {
+ find "$baseline_dir" -name baseline -type d ! -path "$baseline_dir" -prune -o -name measurements.json -type f -print
+ } | sort -u >"$baseline_index" || true
+
+ # Without any current measurement there is nothing to compare: fail the step.
+ if [ ! -s "$current_index" ]; then
+ echo "::error::no current measurements.json files found under $current_dir"
+ exit 1
+ fi
+
+ # Slurp every listed file into one JSON array per side; an empty baseline index becomes [].
+ current_json="$comparison_file.current.json"
+ baseline_json="$comparison_file.baseline.json"
+ xargs -r jq -s '.' <"$current_index" >"$current_json"
+ if [ -s "$baseline_index" ]; then
+ xargs -r jq -s '.' <"$baseline_index" >"$baseline_json"
+ else
+ printf '[]\n' >"$baseline_json"
+ fi
+
+ jq -n \
+ --slurpfile current "$current_json" \
+ --slurpfile baseline "$baseline_json" \
+ --argjson schemaVersion 1 \
+ --arg mode "$mode" \
+ --arg currentDir "$current_dir" \
+ --arg baselineDir "$baseline_dir" \
+ '
+ # Renders the dimensions of the input observation as a key-sorted "key=value,key=value" string.
+ # Dimension keys listed in $ignored are left out of the rendered string.
+ def identity_dimensions:
+ ["devenvRev", "otelServiceName", "status", "probeLabel", "sampleCount", "measuredSampleCount"] as $ignored
+ | [
+ (.dimensions // {})
+ | to_entries
+ | sort_by(.key)[]
+ | select(.key as $name | $ignored | index($name) | not)
+ | "\(.key)=\(.value | tostring)"
+ ]
+ | join(",");
+
+ # Builds the "|"-joined identity key of the input observation inside document $doc:
+ # target kind, target id (or name), target system, observation id (or name), unit, identity dimensions.
+ # Each missing part except the dimensions falls back to "unknown".
+ def observation_key($doc):
+ identity_dimensions as $dims
+ | (.id // .name // "unknown") as $metric
+ | (.unit // "unknown") as $unit
+ | $doc.target as $target
+ | [
+ ($target.kind // "unknown"),
+ ($target.id // $target.name // "unknown"),
+ ($target.system // "unknown"),
+ $metric,
+ $unit,
+ $dims
+ ]
+ | join("|");
+
+ # Median of the input array: null when empty, the middle element for an odd length,
+ # the mean of the two middle elements for an even length.
+ def median:
+ sort
+ | length as $n
+ | (($n - ($n % 2)) / 2) as $mid
+ | if $n == 0 then null
+ elif ($n % 2) == 1 then .[$mid]
+ else ((.[$mid - 1] + .[$mid]) / 2)
+ end;
+
+ # Percentile without interpolation: sorts the input array and returns the element at
+ # index floor($p * (length - 1)), or null for an empty array.
+ def percentile($p):
+ sort
+ | if length == 0 then null
+ else .[(($p * (length - 1)) | floor)]
+ end;
+
+ # Absolute value of the input number.
+ def abs_value: if . >= 0 then . else -. end;
+
+ # Groups the observations of all documents in $docs by observation_key.
+ # Result: an object mapping each key to a list of {target, observation, generatedAt} entries in document order.
+ # Observations whose value is not a number are skipped.
+ def observations_by_key($docs):
+ reduce $docs[]? as $doc
+ ({};
+ reduce (($doc.observations // [])[]? | select(.value | type == "number")) as $obs
+ (.;
+ ($obs | observation_key($doc)) as $key
+ # Append to the list stored under this key, creating the list on first use.
+ | .[$key] = ((.[$key] // []) + [{
+ target: $doc.target,
+ observation: $obs,
+ generatedAt: $doc.generatedAt
+ }])
+ )
+ );
+
+ # Summarises $items, a list of {target, observation, generatedAt} entries (as built by observations_by_key).
+ # value is the median of the observation values; min, max, p25, p75, p95 and mad describe their spread.
+ # The paired* fields are medians, across items, of the per-observation paired statistics and are null
+ # when no item provides them. target comes from the first item; observation, measurementKind and
+ # generatedAt come from the last item.
+ def observation_stats($items):
+ ($items | map(.observation.value)) as $values
+ | ($items | map(.observation.comparison.baseline // empty)) as $pairedBaselineValues
+ | ($items | map(.observation.statistics.pairedDeltaMedian // empty)) as $pairedDeltaMedianValues
+ | ($items | map(.observation.statistics.pairedDeltaP25 // empty)) as $pairedDeltaP25Values
+ | ($items | map(.observation.statistics.pairedDeltaP75 // empty)) as $pairedDeltaP75Values
+ | ($items | map(.observation.statistics.pairedDeltaMad // empty)) as $pairedDeltaMadValues
+ # Concatenation of every per-item pairedDeltaSamples array.
+ | ($items | map(.observation.statistics.pairedDeltaSamples // []) | add // []) as $pairedDeltaSampleValues
+ # Per item: measuredSampleCount, else sampleCount, else 1; summed over all items.
+ | ($items | map(.observation.statistics.measuredSampleCount // .observation.statistics.sampleCount // 1) | add // ($items | length)) as $sampleCount
+ | ($values | median) as $median
+ | {
+ target: ($items[0].target // {}),
+ observation: ($items[-1].observation // {}),
+ measurementKind: ($items[-1].observation.measurementKind // null),
+ value: $median,
+ min: ($values | min),
+ max: ($values | max),
+ p25: ($values | percentile(0.25)),
+ p75: ($values | percentile(0.75)),
+ p95: ($values | percentile(0.95)),
+ # Median absolute deviation from the median.
+ mad: ($values | map(. - $median | if . < 0 then -. else . end) | median),
+ sourceCount: ($items | length),
+ sampleCount: $sampleCount,
+ pairedSampleCount: ($items | map(.observation.statistics.pairedSampleCount // .observation.comparison.pairedSampleCount // 0) | add // 0),
+ pairedBaselineValue: (if ($pairedBaselineValues | length) == 0 then null else ($pairedBaselineValues | median) end),
+ pairedDeltaMedianValue: (if ($pairedDeltaMedianValues | length) == 0 then null else ($pairedDeltaMedianValues | median) end),
+ pairedDeltaP25Value: (if ($pairedDeltaP25Values | length) == 0 then null else ($pairedDeltaP25Values | median) end),
+ pairedDeltaP75Value: (if ($pairedDeltaP75Values | length) == 0 then null else ($pairedDeltaP75Values | median) end),
+ pairedDeltaMadValue: (if ($pairedDeltaMadValues | length) == 0 then null else ($pairedDeltaMadValues | median) end),
+ pairedDeltaSampleValues: $pairedDeltaSampleValues,
+ generatedAt: ($items[-1].generatedAt // null)
+ };
+
+ # Default regression budget for a metric: warn/fail ratios, warn/fail absolute deltas and statistical tolerances.
+ # The three nix.closure metrics are matched by name first; any other metric is chosen by unit
+ # ("seconds" or the generic fallback).
+ # Byte constants: 52428800 = 50 MiB, 209715200 = 200 MiB, 10485760 = 10 MiB.
+ def budget($metric; $unit):
+ if $metric == "nix.closure.nar_size" then
+ {warnRatio:1.05, failRatio:1.10, warnAbs:52428800, failAbs:209715200, statisticalToleranceRatio:0.02, statisticalToleranceAbs:10485760}
+ elif $metric == "nix.closure.bucket.nar_size" then
+ {warnRatio:1.10, failRatio:1.20, warnAbs:52428800, failAbs:209715200, statisticalToleranceRatio:0.05, statisticalToleranceAbs:10485760}
+ elif $metric == "nix.closure.path_count" then
+ {warnRatio:1.05, failRatio:1.10, warnAbs:100, failAbs:500, statisticalToleranceRatio:0.02, statisticalToleranceAbs:10}
+ elif $unit == "seconds" then
+ {warnRatio:1.10, failRatio:1.20, warnAbs:0.25, failAbs:1, statisticalToleranceRatio:0.10, statisticalToleranceAbs:0.25}
+ else
+ {warnRatio:1.25, failRatio:1.50, warnAbs:1, failAbs:3, statisticalToleranceRatio:0.10, statisticalToleranceAbs:1}
+ end;
+
+ # Default noise floor for a metric: 10485760 bytes (10 MiB) for the two closure size metrics,
+ # 10 for the closure path count, 0.1 for any other metric measured in seconds, otherwise 0.
+ def noise_floor($metric; $unit):
+ if $metric == "nix.closure.path_count" then 10
+ elif $metric == "nix.closure.nar_size" then 10485760
+ elif $metric == "nix.closure.bucket.nar_size" then 10485760
+ elif $unit == "seconds" then 0.1
+ else 0
+ end;
+ # Default gate policy: the metric budget extended with gating defaults.
+ # The three nix.closure metrics and every metric not measured in seconds use comparisonMode "budget"
+ # with a single baseline source; the remaining metrics use "historical" with 10 baseline sources.
+ # Metrics measured in seconds additionally need 3 current samples and 5 paired samples.
+ def default_policy($metric; $unit):
+ ($metric == "nix.closure.nar_size" or $metric == "nix.closure.bucket.nar_size" or $metric == "nix.closure.path_count" or $unit != "seconds") as $budgetGated
+ | ($unit == "seconds") as $timed
+ | budget($metric; $unit) + {
+ enabled: true,
+ comparisonMode: (if $budgetGated then "budget" else "historical" end),
+ minBaselineSources: (if $budgetGated then 1 else 10 end),
+ minCurrentSamples: (if $timed then 3 else 1 end),
+ minPairedSamples: (if $timed then 5 else 0 end),
+ noiseFloor: noise_floor($metric; $unit)
+ };
+ # Effective policy of an observation: the default policy for its name and unit,
+ # with the keys of the observation policy object (if any) taking precedence.
+ def observation_policy($obs):
+ ($obs.policy // {}) as $overrides
+ | default_policy($obs.name // "unknown"; $obs.unit // "unknown") + $overrides;
+ # Value of the "enabled" key of $policy, or true when the key is absent.
+ def policy_enabled($policy):
+ $policy | if has("enabled") then .enabled else true end;
+
+ def classify($metric; $unit; $measurementKind; $policy; $current; $currentP25; $currentP75; $currentMad; $baseline; $baselineMin; $baselineMax; $baselineP25; $baselineP75; $baselineP95; $baselineMad; $currentSamples; $baselineSources; $pairedSamples; $pairedDeltaMedian; $pairedDeltaP25; $pairedDeltaP75; $pairedDeltaMad; $pairedDeltaValues):
+ $policy as $b
+ | ($policy.comparisonMode // (if $measurementKind == "deterministic" or $unit != "seconds" then "budget" elif $measurementKind == "diagnostic" then "diagnostic" else "historical" end)) as $comparisonMode
+ | ($policy.noiseFloor // noise_floor($metric; $unit)) as $noise
+ | ($current - $baseline) as $delta
+ | (if $comparisonMode == "paired" and $pairedDeltaMedian != null then $pairedDeltaMedian else $delta end) as $evidenceDelta
+ | (($policy.pairedEvidenceQuantile // 0.25) | tonumber) as $pairedEvidenceQuantile
+ | (if $baseline > 0 then ($current / $baseline) else null end) as $ratio
+ | (($baselineP75 // $baseline) - ($baselineP25 // $baseline)) as $iqr
+ | (($currentP75 // $current) - ($currentP25 // $current)) as $currentIqr
+ | (($pairedDeltaP75 // $evidenceDelta) - ($pairedDeltaP25 // $evidenceDelta)) as $pairedDeltaIqr
+ | ([
+ $noise,
+ (($policy.statisticalToleranceAbs // 0) | tonumber),
+ (if $baseline > 0 then ($baseline * (($policy.statisticalToleranceRatio // 0) | tonumber)) else 0 end),
+ (($baselineMad // 0) * 3),
+ (($iqr // 0) * 1.5)
+ ] | max) as $robustTolerance
+ | (if $currentSamples > 1 then ([
+ $noise,
+ (($policy.statisticalToleranceAbs // 0) | tonumber),
+ (if $current > 0 then ($current * (($policy.statisticalToleranceRatio // 0) | tonumber)) else 0 end),
+ (($currentMad // 0) * 3),
+ (($currentIqr // 0) * 1.5)
+ ] | max) else 0 end) as $currentRobustTolerance
+ | ([
+ $noise,
+ (($policy.statisticalToleranceAbs // 0) | tonumber),
+ (if $baseline > 0 then ($baseline * (($policy.statisticalToleranceRatio // 0) | tonumber)) else 0 end),
+ (($pairedDeltaMad // 0) * 3),
+ (($pairedDeltaIqr // 0) * 1.5)
+ ] | max) as $pairedDeltaTolerance
+ | ($baseline + $robustTolerance) as $robustUpper
+ | ($baseline - $robustTolerance) as $robustLower
+ | ($current + $currentRobustTolerance) as $currentRobustUpper
+ | ($current - $currentRobustTolerance) as $currentRobustLower
+ | (if $comparisonMode == "paired" and ($pairedDeltaValues | length) > 0 then ($pairedDeltaValues | percentile($pairedEvidenceQuantile)) else ($evidenceDelta - $pairedDeltaTolerance) end) as $evidenceDeltaLower
+ | (if $comparisonMode == "paired" and ($pairedDeltaValues | length) > 0 then ($pairedDeltaValues | percentile(1 - $pairedEvidenceQuantile)) else ($evidenceDelta + $pairedDeltaTolerance) end) as $evidenceDeltaUpper
+ | ([($b.warnAbs // 0), (if $baseline > 0 then ($baseline * (($b.warnRatio // 1) - 1)) else 0 end), $noise, 0.000000001] | max) as $warnBudget
+ | ([($b.failAbs // 0), (if $baseline > 0 then ($baseline * (($b.failRatio // 1) - 1)) else 0 end), $noise, 0.000000001] | max) as $failBudget
+ | ($comparisonMode != "paired") as $needsHistoricalBaselineCount
+ | (
+ ($current >= $robustLower and $current <= $robustUpper)
+ or ($currentRobustTolerance > 0 and $currentRobustLower <= $robustUpper and $currentRobustUpper >= $robustLower)
+ ) as $withinRobustBand
+ | ($comparisonMode == "historical" and $measurementKind != "deterministic") as $canUseRobustBandSuppression
+ | (
+ $baselineMin != null
+ and $baselineMax != null
+ and $current >= $baselineMin
+ and $current <= $baselineMax
+ ) as $withinBaselineRange
+ | (
+ if $baseline <= 0 then "unknown"
+ elif $comparisonMode == "paired" and $evidenceDeltaLower > $failBudget then "fail"
+ elif $comparisonMode == "paired" and $evidenceDeltaLower > $warnBudget then "warn"
+ elif $comparisonMode == "paired" then "pass"
+ elif ($delta > $b.failAbs and $current > ($baseline * $b.failRatio)) then "fail"
+ elif ($delta > $b.warnAbs and $current > ($baseline * $b.warnRatio)) then "warn"
+ else "pass"
+ end
+ ) as $thresholdStatus
+ | (
+ policy_enabled($policy) == true
+ and $baseline > 0
+ and (if $needsHistoricalBaselineCount then $baselineSources >= ($policy.minBaselineSources // 1) else true end)
+ and $currentSamples >= ($policy.minCurrentSamples // 1)
+ and (if $comparisonMode == "paired" then $pairedSamples >= ($policy.minPairedSamples // 1) else true end)
+ and (if $comparisonMode == "paired" then $pairedDeltaMedian != null else true end)
+ ) as $gateable
+ | (
+ if (policy_enabled($policy) != true) then "disabled"
+ elif $baseline <= 0 then "missing_baseline"
+ elif $needsHistoricalBaselineCount and $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
+ elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count"
+ elif $comparisonMode == "paired" and $pairedSamples < ($policy.minPairedSamples // 1) then "low_paired_sample_count"
+ elif $comparisonMode == "paired" and $pairedDeltaMedian == null then "missing_paired_delta"
+ else "eligible"
+ end
+ ) as $gateReason
+ | (
+ if $baseline <= 0 then "unknown"
+ elif (policy_enabled($policy) != true) then "diagnostic"
+ elif ($delta | abs_value) <= $noise then "noise_floor"
+ elif $needsHistoricalBaselineCount and $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
+ elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count"
+ elif $comparisonMode == "paired" and $pairedSamples < ($policy.minPairedSamples // 1) then "low_paired_sample_count"
+ elif $comparisonMode == "paired" and $pairedDeltaMedian == null then "missing_paired_delta"
+ elif $comparisonMode == "paired" and $thresholdStatus == "pass" and $evidenceDelta > $warnBudget then "paired_uncertain"
+ elif ($canUseRobustBandSuppression and $thresholdStatus != "pass" and $withinRobustBand) then "within_robust_band"
+ elif $thresholdStatus == "pass" then "within_budget"
+ else "threshold_exceeded"
+ end
+ ) as $confidence
+ | (
+ if ($gateable and $confidence == "threshold_exceeded") then $thresholdStatus
+ elif $thresholdStatus == "unknown" then "unknown"
+ else "pass"
+ end
+ ) as $status
+ | (
+ if $baseline <= 0 then "unknown"
+ elif $comparisonMode == "paired" and ($evidenceDelta | abs_value) <= $noise then "unchanged"
+ elif $comparisonMode == "paired" and $evidenceDeltaLower <= 0 and $evidenceDeltaUpper >= 0 then "unchanged"
+ elif $comparisonMode == "paired" and $evidenceDelta < 0 then "improved"
+ elif $comparisonMode == "paired" then "regressed"
+ elif ($delta | abs_value) <= $noise then "unchanged"
+ elif $canUseRobustBandSuppression and $withinRobustBand then "unchanged"
+ elif $delta < 0 then "improved"
+ else "regressed"
+ end
+ ) as $direction
+ | (
+ if $baseline <= 0 then null
+ elif (policy_enabled($policy) != true) then null
+ elif $comparisonMode == "paired" and ($evidenceDeltaLower <= 0 and $evidenceDeltaUpper >= 0) then 0
+ elif $comparisonMode == "paired" and ($evidenceDelta | abs_value) <= $noise then 0
+ elif $comparisonMode == "paired" and $evidenceDelta > 0 then ([0, $evidenceDeltaLower] | max) / $warnBudget
+ elif $comparisonMode == "paired" then -(([0, (-$evidenceDeltaUpper)] | max) / $warnBudget)
+ elif $canUseRobustBandSuppression and $withinRobustBand then 0
+ elif ($delta | abs_value) <= $noise then 0
+ elif ($confidence == "threshold_exceeded" and $delta > 0) then ([0, ($currentRobustLower - $robustUpper), $delta] | max) / $warnBudget
+ elif ($confidence == "threshold_exceeded" and $delta < 0) then -(([0, ($robustLower - $currentRobustUpper), (-$delta)] | max) / $warnBudget)
+ elif $delta > 0 then ([0, ($currentRobustLower - $robustUpper)] | max) / $warnBudget
+ else -(([0, ($robustLower - $currentRobustUpper)] | max) / $warnBudget)
+ end
+ ) as $semanticImpactScore
+ | (
+ if (policy_enabled($policy) != true) then "diagnostic"
+ elif $semanticImpactScore == null then "unknown"
+ elif $semanticImpactScore == 0 then "neutral"
+ elif $semanticImpactScore >= ($failBudget / $warnBudget) then "fail_boundary"
+ elif $semanticImpactScore >= 1 then "warn_boundary"
+ elif $semanticImpactScore > 0 then "below_warn_boundary"
+ else "improvement"
+ end
+ ) as $semanticImpactKind
+ | {status:$status,current:$current,baseline:$baseline,delta:$delta,ratio:$ratio,budget:$b,gatePolicy:$policy,comparisonMode:$comparisonMode,gateable:$gateable,gateReason:$gateReason,confidence:$confidence,direction:$direction,semanticImpactScore:$semanticImpactScore,semanticImpactKind:$semanticImpactKind,semanticWarnBudget:$warnBudget,semanticFailBudget:$failBudget,baselineRobustLower:$robustLower,baselineRobustUpper:$robustUpper,baselineRobustTolerance:$robustTolerance,currentRobustLower:$currentRobustLower,currentRobustUpper:$currentRobustUpper,currentRobustTolerance:$currentRobustTolerance,withinBaselineRange:$withinBaselineRange,pairedSamples:$pairedSamples,evidenceDelta:$evidenceDelta,evidenceDeltaLower:$evidenceDeltaLower,evidenceDeltaUpper:$evidenceDeltaUpper,evidenceDeltaTolerance:$pairedDeltaTolerance,pairedEvidenceQuantile:$pairedEvidenceQuantile,pairedEvidenceProtocol:(if $comparisonMode == "paired" and ($pairedDeltaValues | length) > 0 then "paired-delta-quantile-v1" elif $comparisonMode == "paired" then "paired-summary-robust-band-v1" else null end)};
+
+ (observations_by_key($current[0]) | with_entries(.value = observation_stats(.value))) as $currentObs
+ | (observations_by_key($baseline[0]) | with_entries(.value = observation_stats(.value))) as $baselineObs
+ | (
+ $currentObs
+ | to_entries
+ | map(
+ .key as $key
+ | .value as $currentValue
+ | ($baselineObs[$key] // null) as $baselineValue
+ | ($currentValue.observation | observation_policy(.)) as $policy
+ | ($policy.comparisonMode // (if ($currentValue.observation.measurementKind // $currentValue.measurementKind) == "deterministic" or ($currentValue.observation.unit // "") != "seconds" then "budget" elif ($currentValue.observation.measurementKind // $currentValue.measurementKind) == "diagnostic" then "diagnostic" else "historical" end)) as $comparisonMode
+ | ($currentValue.pairedBaselineValue // null) as $pairedBaselineValue
+ | (if $comparisonMode == "paired" and $pairedBaselineValue != null then {
+ value: $pairedBaselineValue,
+ min: $pairedBaselineValue,
+ max: $pairedBaselineValue,
+ p25: $pairedBaselineValue,
+ p75: $pairedBaselineValue,
+ p95: $pairedBaselineValue,
+ mad: 0,
+ sourceCount: $currentValue.pairedSampleCount
+ } else $baselineValue end) as $effectiveBaselineValue
+ | {
+ key: $key,
+ value: (
+ if $effectiveBaselineValue == null then
+ {
+ status: "missing_baseline",
+ target: $currentValue.target,
+ observation: $currentValue.observation,
+ current: $currentValue.value,
+ currentSamples: $currentValue.sampleCount,
+ baselineSources: 0,
+ gatePolicy: $policy,
+ comparisonMode: $comparisonMode,
+ gateable: false,
+ gateReason: "missing_baseline",
+ confidence: "missing_baseline",
+ direction: "unknown"
+ }
+ else
+ classify(
+ $currentValue.observation.name;
+ $currentValue.observation.unit;
+ ($currentValue.observation.measurementKind // $currentValue.measurementKind);
+ $policy;
+ $currentValue.value;
+ $currentValue.p25;
+ $currentValue.p75;
+ $currentValue.mad;
+ $effectiveBaselineValue.value;
+ $effectiveBaselineValue.min;
+ $effectiveBaselineValue.max;
+ $effectiveBaselineValue.p25;
+ $effectiveBaselineValue.p75;
+ $effectiveBaselineValue.p95;
+ $effectiveBaselineValue.mad;
+ $currentValue.sampleCount;
+ $effectiveBaselineValue.sourceCount;
+ $currentValue.pairedSampleCount;
+ $currentValue.pairedDeltaMedianValue;
+ $currentValue.pairedDeltaP25Value;
+ $currentValue.pairedDeltaP75Value;
+ $currentValue.pairedDeltaMadValue;
+ ($currentValue.pairedDeltaSampleValues // [])
+ ) + {
+ target: $currentValue.target,
+ observation: $currentValue.observation,
+ currentSamples: $currentValue.sampleCount,
+ baselineSources: $effectiveBaselineValue.sourceCount,
+ baselineMin: $effectiveBaselineValue.min,
+ baselineMax: $effectiveBaselineValue.max,
+ baselineP25: $effectiveBaselineValue.p25,
+ baselineP75: $effectiveBaselineValue.p75,
+ baselineP95: $effectiveBaselineValue.p95
+ ,baselineMad: $effectiveBaselineValue.mad
+ }
+ end
+ )
+ }
+ )
+ | from_entries
+ ) as $comparisons
+ | (
+ if any($comparisons[]?; .status == "fail") then "fail"
+ elif any($comparisons[]?; .status == "warn") then "warn"
+ elif any($comparisons[]?;
+ (if (.gatePolicy | has("enabled")) then .gatePolicy.enabled else true end)
+ and (.gateReason == "missing_baseline"
+ or .gateReason == "low_baseline_count"
+ or .gateReason == "low_current_sample_count"
+ or .gateReason == "low_paired_sample_count"
+ or .gateReason == "missing_paired_delta")
+ ) then "partial"
+ else "pass"
+ end
+ ) as $status
+ | (
+ [$comparisons[]?]
+ | {
+ enabledCount: (map(select((if (.gatePolicy | has("enabled")) then .gatePolicy.enabled else true end))) | length),
+ gateableCount: (map(select(.gateable == true)) | length),
+ missingBaselineCount: (map(select(.gateReason == "missing_baseline")) | length),
+ lowBaselineCount: (map(select(.gateReason == "low_baseline_count")) | length),
+ lowCurrentSampleCount: (map(select(.gateReason == "low_current_sample_count")) | length),
+ lowPairedSampleCount: (map(select(.gateReason == "low_paired_sample_count")) | length),
+ missingPairedDeltaCount: (map(select(.gateReason == "missing_paired_delta")) | length)
+ }
+ | . + {
+ nonGateableCount: (.enabledCount - .gateableCount),
+ enforceable: (.enabledCount == .gateableCount)
+ }
+ ) as $readiness
+ | {
+ schemaVersion:$schemaVersion,
+ status:$status,
+ mode:$mode,
+ readiness:$readiness,
+ currentDir:$currentDir,
+ baselineDir:$baselineDir,
+ comparisons:$comparisons
+ }
+ ' >"$comparison_file"
+
+ baseline_provenance_file="$baseline_dir/baseline-provenance.json"
+ if [ -f "$baseline_provenance_file" ]; then
+ comparison_with_provenance="$(mktemp)"
+ jq --slurpfile baselineProvenance "$baseline_provenance_file" \
+ '. + {baselineProvenance: ($baselineProvenance[0] // null)}' \
+ "$comparison_file" >"$comparison_with_provenance"
+ mv "$comparison_with_provenance" "$comparison_file"
+ fi
+
+ status="$(jq -r '.status' "$comparison_file")"
+ exit_code=0
+ case "$status:$mode" in
+ fail:fail)
+ echo "::error::CI measurement regression detected"
+ exit_code=1
+ ;;
+ fail:*|warn:*)
+ echo "::warning::CI measurement regression threshold exceeded"
+ ;;
+ partial:*)
+ echo "::notice::CI measurement comparison is partial because one or more enabled observations are not gateable"
+ ;;
+ esac
+
+ if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then
+   # Append a Markdown digest of the comparison to the job summary: a headline
+   # (status, gate mode, baseline dir) followed by at most 20 rows ordered
+   # fail, warn, missing_baseline, everything else.
+   #
+   # Cell values get every literal pipe escaped as backslash+pipe. The jq string
+   # "\\|" is exactly that pair; the earlier "\\\\|" emitted two backslashes,
+   # which GFM reads as an escaped backslash followed by a live cell delimiter.
+   {
+     echo "### ${CI_MEASUREMENT_PR_COMMENT_TITLE:-CI Measurements}"
+     echo ""
+     jq -r '"- Status: " + .status + "\n- Gate: " + (if .mode == "fail" then "enforced" elif .mode == "warn" then "advisory" elif .mode == "off" then "off" else (.mode // "unknown") end) + "\n- Baseline: " + .baselineDir' "$comparison_file"
+     echo ""
+     echo "| Status | Gate | Target | Observation | Current | Baseline | Delta | Ratio |"
+     echo "| --- | --- | --- | --- | ---: | ---: | ---: | ---: |"
+     jq -r '
+       .comparisons
+       | to_entries
+       | sort_by(
+           if .value.status == "fail" then 0
+           elif .value.status == "warn" then 1
+           elif .value.status == "missing_baseline" then 2
+           else 3
+           end
+         )
+       | .[:20]
+       | .[]
+       | .value as $v
+       | [
+           $v.status,
+           (if ($v.gateable // false) then "yes" else ($v.gateReason // "no") end),
+           (($v.target.kind // "unknown") + "/" + ($v.target.name // "unknown") + "/" + ($v.target.system // "unknown")),
+           ($v.observation.name // "unknown"),
+           (($v.current // $v.observation.value // 0) | tostring),
+           (($v.baseline // "") | tostring),
+           (($v.delta // "") | tostring),
+           (if $v.ratio == null or $v.ratio == "" then "" else (($v.ratio * 100 | round / 100) | tostring) end)
+         ]
+       | "| " + (map(gsub("\\|"; "\\|")) | join(" | ")) + " |"
+     ' "$comparison_file"
+   } >>"$GITHUB_STEP_SUMMARY"
+ fi
+
+ if [ "${CI_MEASUREMENT_PR_COMMENT_ENABLED:-false}" = "true" ] && { [ "${GITHUB_EVENT_NAME:-}" = "pull_request" ] || [ -n "${CI_MEASUREMENT_PR_COMMENT_PR_NUMBER:-}" ]; }; then
+ can_render_pr_comment=true
+
+ # Make sure a command is resolvable, falling back to a nixpkgs build when nix
+ # is installed.
+ #   $1  command name to look up on PATH
+ #   $2  nixpkgs attribute whose bin/ directory is prepended to PATH on success
+ # Returns 0 when the command can be resolved afterwards, non-zero otherwise.
+ ensure_ci_measurement_tool() {
+   # Function-scoped so the helper does not overwrite script-level variables.
+   local tool_name="$1"
+   local nix_attr="$2"
+   local tool_out
+   if command -v "$tool_name" >/dev/null 2>&1; then
+     return 0
+   fi
+   if ! command -v nix >/dev/null 2>&1; then
+     return 1
+   fi
+   # Best effort: a failed build is silent here; the final lookup decides.
+   if tool_out="$(nix build --no-link --print-out-paths "nixpkgs#$nix_attr" 2>/dev/null)"; then
+     export PATH="$tool_out/bin:$PATH"
+   fi
+   command -v "$tool_name" >/dev/null 2>&1
+ }
+
+ if ! ensure_ci_measurement_tool gh gh; then
+ echo "::notice::gh is not available; skipping CI measurement PR comment"
+ can_render_pr_comment=false
+ fi
+ if ! ensure_ci_measurement_tool node nodejs; then
+ echo "::notice::node is not available; skipping CI measurement PR comment"
+ can_render_pr_comment=false
+ fi
+ if ! command -v jq >/dev/null 2>&1; then
+ if ensure_ci_measurement_tool jq jq; then
+ :
+ else
+ echo "::notice::jq is not available; skipping CI measurement PR comment"
+ can_render_pr_comment=false
+ fi
+ fi
+ if [ -z "${GH_TOKEN:-${GITHUB_TOKEN:-}}" ]; then
+ echo "::notice::GH_TOKEN/GITHUB_TOKEN is not set; skipping CI measurement PR comment"
+ can_render_pr_comment=false
+ fi
+
+ event_path="${GITHUB_EVENT_PATH:-}"
+ pr_number="${CI_MEASUREMENT_PR_COMMENT_PR_NUMBER:-}"
+ if [ "$can_render_pr_comment" = "true" ] && [ -n "$event_path" ] && [ -f "$event_path" ]; then
+ pr_number="${pr_number:-$(jq -r '.pull_request.number // empty' "$event_path")}"
+ fi
+ if [ "$can_render_pr_comment" = "true" ] && [ -z "$pr_number" ]; then
+ echo "::notice::pull request number is unavailable; skipping CI measurement PR comment"
+ can_render_pr_comment=false
+ fi
+
+ if [ "$can_render_pr_comment" = "true" ]; then
+ repo="${GITHUB_REPOSITORY:?GITHUB_REPOSITORY not set}"
+ comment_tmp_dir="$(mktemp -d)"
+ comments_json="$comment_tmp_dir/comments.json"
+ comment_body="$comment_tmp_dir/comment.md"
+ comment_id_file="$comment_tmp_dir/comment-id.txt"
+ chart_file="$comment_tmp_dir/perf-change-vs-baseline.svg"
+ chart_dark_file="$comment_tmp_dir/perf-change-vs-baseline-dark.svg"
+ chart_png_file="$comment_tmp_dir/perf-change-vs-baseline.png"
+ chart_dark_png_file="$comment_tmp_dir/perf-change-vs-baseline-dark.png"
+ renderer_script="$comment_tmp_dir/render-ci-measurement-comment.mjs"
+
+ if ! gh api "repos/$repo/issues/$pr_number/comments" --paginate >"$comments_json"; then
+ echo "::notice::unable to list PR comments; skipping CI measurement PR comment"
+ can_render_pr_comment=false
+ fi
+
+ if [ "$can_render_pr_comment" = "true" ]; then
+ asset_branch="${CI_MEASUREMENT_PR_COMMENT_ASSET_BRANCH:-ci-measurement-assets}"
+ asset_title="$(printf '%s' "${CI_MEASUREMENT_PR_COMMENT_TITLE:-ci-measurements}" | tr '[:upper:]' '[:lower:]' | sed -E 's/[^a-z0-9]+/-/g; s/^-+//; s/-+$//')"
+ if [ -z "$asset_title" ]; then
+ asset_title="ci-measurements"
+ fi
+ asset_head_sha="${CI_MEASUREMENT_SUBJECT_SHA:-${GITHUB_HEAD_SHA:-${GITHUB_SHA:-unknown}}}"
+ asset_run_id="${GITHUB_RUN_ID:-local}"
+ asset_run_attempt="${GITHUB_RUN_ATTEMPT:-0}"
+ asset_svg_path="ci-measurements/pr-$pr_number/${asset_head_sha}/run-${asset_run_id}-attempt-${asset_run_attempt}/${asset_title}.svg"
+ asset_png_path="ci-measurements/pr-$pr_number/${asset_head_sha}/run-${asset_run_id}-attempt-${asset_run_attempt}/${asset_title}.png"
+ asset_dark_png_path="ci-measurements/pr-$pr_number/${asset_head_sha}/run-${asset_run_id}-attempt-${asset_run_attempt}/${asset_title}-dark.png"
+ public_asset_command="${CI_MEASUREMENT_PR_COMMENT_PUBLIC_ASSET_COMMAND:-}"
+ repo_private="$(gh api "repos/$repo" --jq '.private // false' 2>/dev/null || printf 'true')"
+ require_public_asset=false
+ if [ "$repo_private" = "true" ]; then
+ require_public_asset=true
+ fi
+ if [ "${GITHUB_SERVER_URL:-https://github.com}" = "https://github.com" ]; then
+ github_raw_chart_url="https://raw.githubusercontent.com/$repo/$asset_branch/$asset_png_path"
+ github_raw_chart_dark_url="https://raw.githubusercontent.com/$repo/$asset_branch/$asset_dark_png_path"
+ github_raw_chart_source_url="https://raw.githubusercontent.com/$repo/$asset_branch/$asset_svg_path"
+ else
+ github_raw_chart_url="${GITHUB_SERVER_URL:-https://github.com}/$repo/raw/$asset_branch/$asset_png_path"
+ github_raw_chart_dark_url="${GITHUB_SERVER_URL:-https://github.com}/$repo/raw/$asset_branch/$asset_dark_png_path"
+ github_raw_chart_source_url="${GITHUB_SERVER_URL:-https://github.com}/$repo/raw/$asset_branch/$asset_svg_path"
+ fi
+ if [ "$repo_private" = "true" ]; then
+ chart_url=""
+ chart_dark_url=""
+ chart_source_url=""
+ else
+ chart_url="$github_raw_chart_url"
+ chart_dark_url="$github_raw_chart_dark_url"
+ chart_source_url="$github_raw_chart_source_url"
+ fi
+ export CI_MEASUREMENT_PR_COMMENT_CHART_URL="$chart_url"
+ export CI_MEASUREMENT_PR_COMMENT_CHART_DARK_URL="$chart_dark_url"
+ export CI_MEASUREMENT_PR_COMMENT_CHART_SOURCE_URL="$chart_source_url"
+
+ cat > "$renderer_script" <<'EOF'
+ import { readFileSync, writeFileSync } from 'node:fs'
+
+ const [comparisonPath, commentsPath, bodyPath, commentIdPath, chartPath, chartDarkPath] = process.argv.slice(2)
+ const title = process.env.CI_MEASUREMENT_PR_COMMENT_TITLE || 'CI Measurements'
+ const maxRows = Number.parseInt(process.env.CI_MEASUREMENT_PR_COMMENT_MAX_ROWS || '10', 10)
+ const maxHistory = Number.parseInt(process.env.CI_MEASUREMENT_PR_COMMENT_MAX_HISTORY || '20', 10)
+ const repo = process.env.GITHUB_REPOSITORY || 'unknown'
+ const runId = process.env.GITHUB_RUN_ID || ''
+ const runAttempt = process.env.GITHUB_RUN_ATTEMPT || ''
+ const sha = process.env.GITHUB_SHA || ''
+ const headSha = process.env.CI_MEASUREMENT_SUBJECT_SHA || process.env.GITHUB_HEAD_SHA || sha
+ const serverUrl = process.env.GITHUB_SERVER_URL || 'https://github.com'
+ const workflow = process.env.GITHUB_WORKFLOW || 'CI'
+ const job = process.env.GITHUB_JOB || ''
+ const chartUrl = process.env.CI_MEASUREMENT_PR_COMMENT_CHART_URL || ''
+ const chartDarkUrl = process.env.CI_MEASUREMENT_PR_COMMENT_CHART_DARK_URL || ''
+ const chartSourceUrl = process.env.CI_MEASUREMENT_PR_COMMENT_CHART_SOURCE_URL || ''
+
+ // Slug derived from the explicit marker override or, failing that, the title.
+ const markerScope = (process.env.CI_MEASUREMENT_PR_COMMENT_MARKER || title)
+   .toLowerCase()
+   .replace(/[^a-z0-9]+/g, '-')
+   .replace(/^-+|-+$/g, '') || 'default'
+ // Hidden HTML-comment markers used to find the comment this job owns. They
+ // must be non-empty: String.prototype.includes('') is true for every body, so
+ // an empty marker would adopt whichever PR comment happens to be listed first.
+ // NOTE(review): the literal marker text below is reconstructed - confirm it
+ // matches the markers inside comments that were already posted.
+ const marker = '<!-- ci-measurement-comment:' + markerScope + ' -->'
+ const legacyMarker = '<!-- ci-measurement-comment -->'
+ // Delimiters around the JSON state blob that extractState reads back.
+ const statePrefix = '<!-- ci-measurement-comment-state:'
+ const stateSuffix = ' -->'
+ const stateTag = 'ci-measurement-comment-state'
+ const schemaVersion = 1
+
+ const comparison = JSON.parse(readFileSync(comparisonPath, 'utf8'))
+ const comments = JSON.parse(readFileSync(commentsPath, 'utf8'))
+ if (!Array.isArray(comments)) throw new Error('comments response must be an array')
+
+ const existing = comments.find((comment) => {
+ if (typeof comment?.body !== 'string') return false
+ return comment.body.includes(marker) ||
+ (comment.body.includes(legacyMarker) && comment.body.includes('## ' + title))
+ })
+
+ const extractState = (body) => {
+ if (typeof body !== 'string') return undefined
+ const start = body.indexOf(statePrefix)
+ if (start === -1) return undefined
+ const end = body.indexOf(stateSuffix, start + statePrefix.length)
+ if (end === -1) return undefined
+ try {
+ const parsed = JSON.parse(body.slice(start + statePrefix.length, end))
+ if (parsed && parsed._tag === stateTag && Array.isArray(parsed.runs)) return parsed
+ } catch {
+ return undefined
+ }
+ return undefined
+ }
+
+ // Renders a number for display: integers verbatim, anything else rounded to
+ // three decimals; null, undefined and NaN become 'n/a'.
+ const formatNumber = (value) => {
+   const missing = value == null || Number.isNaN(value)
+   if (missing) return 'n/a'
+   const shown = Number.isInteger(value) ? value : Math.round(value * 1000) / 1000
+   return String(shown)
+ }
+
+ // Formats a measurement with its unit. Bytes are scaled to the largest binary
+ // unit they reach (one decimal), seconds get an ' s' suffix, any other unit is
+ // appended verbatim, and a missing value renders as 'n/a'.
+ const formatValue = (value, unit) => {
+   if (value == null) return 'n/a'
+   if (unit === 'bytes') {
+     const scales = [[1073741824, ' GiB'], [1048576, ' MiB'], [1024, ' KiB']]
+     for (const [divisor, suffix] of scales) {
+       if (value >= divisor) return formatNumber(Math.round((value / divisor) * 10) / 10) + suffix
+     }
+     return formatNumber(value) + ' B'
+   }
+   const suffix = unit === 'seconds' ? ' s' : unit ? ' ' + unit : ''
+   return formatNumber(value) + suffix
+ }
+
+ // Formats a signed change: explicit '+' for zero and above, '-' below, followed
+ // by the magnitude rendered with formatValue. Missing values render as 'n/a'.
+ const formatDelta = (value, unit) => {
+   if (value == null) return 'n/a'
+   const magnitude = formatValue(Math.abs(value), unit)
+   return (value >= 0 ? '+' : '-') + magnitude
+ }
+
+ // Turns a current/baseline ratio into a percentage change with one decimal
+ // (1.05 -> '5%'); missing values render as 'n/a'.
+ const formatRatio = (value) => {
+   if (value == null) return 'n/a'
+   const percent = Math.round((value - 1) * 1000) / 10
+   return formatNumber(percent) + '%'
+ }
+
+ // Renders an impact score as a signed multiple with two decimals ('+1.25x').
+ // Scores whose magnitude is below 0.005 collapse to '0.00x'; null, undefined
+ // and NaN render as 'n/a'.
+ const formatSemanticImpact = (value) => {
+   if (value == null || Number.isNaN(value)) return 'n/a'
+   if (Math.abs(value) < 0.005) return '0.00x'
+   const rounded = formatNumber(Math.round(value * 100) / 100)
+   return (value > 0 ? '+' : '') + rounded + 'x'
+ }
+
+ // Impact cell for a row: the literal 'diagnostic' for rows flagged diagnostic
+ // or whose gate is disabled, otherwise the formatted impact score.
+ const formatRowImpact = (row) => {
+   const diagnosticOnly = [row.confidence, row.semanticImpactKind].includes('diagnostic')
+     || row.gateReason === 'disabled'
+   return diagnosticOnly ? 'diagnostic' : formatSemanticImpact(row.semanticImpactScore)
+ }
+
+ // Evidence cell for a row: the confidence label, then on a second line
+ // (an HTML <br>, since Markdown table cells cannot hold raw newlines) either the
+ // paired sample count with the quantile band of paired deltas, or the baseline
+ // and current sample counts.
+ // NOTE(review): an inline wrapper tag around the second line may have been lost
+ // together with the <br>; confirm against the rendered comment.
+ const formatEvidence = (row) => {
+   const unit = row.observation?.unit
+   if (row.comparisonMode === 'paired' && typeof row.evidenceDeltaLower === 'number' && typeof row.evidenceDeltaUpper === 'number') {
+     // Quantile arrives as a fraction; fall back to 25 when it is not a number.
+     const quantile = typeof row.pairedEvidenceQuantile === 'number'
+       ? Math.round(row.pairedEvidenceQuantile * 100)
+       : 25
+     return (row.confidence || 'unknown')
+       + '<br>paired n=' + (row.pairedSamples ?? 0)
+       + ', ' + quantile + '-' + (100 - quantile) + '% delta '
+       + formatValue(row.evidenceDeltaLower, unit)
+       + ' - ' + formatValue(row.evidenceDeltaUpper, unit)
+   }
+   return (row.confidence || 'unknown') + '<br>baseline n=' + (row.baselineSources ?? 0) + ', current samples=' + (row.currentSamples ?? 1)
+ }
+
+ const interpretation = (row) => {
+ if (row.confidence === 'low_baseline_count') return {
+ label: 'Needs more baseline',
+ detail: 'Not enough compatible baseline runs to make this gate trustworthy.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
+ if (row.confidence === 'low_current_sample_count') return {
+ label: 'Needs repeat',
+ detail: 'Current run has too few successful measured samples.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
+ if (row.confidence === 'low_paired_sample_count') return {
+ label: 'Needs paired evidence',
+ detail: 'Wall-clock gates require same-run base/head samples before they can block merges.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
+ if (row.confidence === 'missing_paired_delta') return {
+ label: 'Needs paired delta stats',
+ detail: 'Wall-clock gates require per-pair delta statistics, not only paired medians.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
+ if (row.confidence === 'paired_uncertain') return {
+ label: 'Uncertain wall-clock movement',
+ detail: 'The paired median moved, but the paired delta band still crosses the configured budget.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
+ if (row.confidence === 'diagnostic') return {
+ label: 'Diagnostic only',
+ detail: 'Shown for investigation, but intentionally excluded from gating.',
+ tone: 'diagnostic',
+ color: '#a78bfa',
+ }
+ if (row.status === 'fail') return {
+ label: 'Regression - blocks merge',
+ detail: 'Worse than the configured fail threshold with enough samples.',
+ tone: 'bad',
+ color: '#ef4444',
+ }
+ if (row.status === 'warn') return {
+ label: 'Regression - review',
+ detail: 'Worse than the configured warning threshold.',
+ tone: 'warn',
+ color: '#f59e0b',
+ }
+ if (row.status === 'missing_baseline') return {
+ label: 'No baseline yet',
+ detail: 'Current value is measured, but no comparable baseline exists.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
+ if (row.confidence === 'noise_floor') return {
+ label: 'Too small to matter',
+ detail: 'The absolute change is below the noise floor for this metric.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
+ if (row.confidence === 'within_baseline_range') return {
+ label: 'Historical range only',
+ detail: 'Inside the full historical min/max range, but this range is not used to pass a gate.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
+ if (row.confidence === 'within_robust_band' || row.confidence === 'within_baseline_distribution') return {
+ label: 'Within noise band',
+ detail: 'Current and baseline robust noise bands overlap.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
+ if (row.direction === 'improved' && typeof row.semanticImpactScore === 'number' && row.semanticImpactScore <= -1) return {
+ label: 'Meaningfully lower',
+ detail: 'Lower than baseline by enough to cross the configured review threshold.',
+ tone: 'good',
+ color: '#10b981',
+ }
+ if (row.direction === 'improved') return {
+ label: 'Slightly lower, ok',
+ detail: 'Lower than baseline, but still inside the configured review budget.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
+ if (row.direction === 'regressed') return {
+ label: 'Slightly higher, ok',
+ detail: 'Higher than baseline but still inside the configured budget.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
+ return {
+ label: 'Unchanged',
+ detail: 'No meaningful movement from baseline.',
+ tone: 'neutral',
+ color: '#94a3b8',
+ }
+ }
+
+ // Gate cell for a row: 'yes' when the row is gateable, otherwise 'no' followed
+ // on a second line (HTML <br>) by the reason it is excluded.
+ const formatGate = (row) => {
+   if (row.gateable) return 'yes'
+   const reason = row.gateReason || row.confidence || 'unknown'
+   return 'no<br>' + reason
+ }
+
+ // Makes a value safe inside a Markdown table cell: nullish becomes '-', pipes
+ // are backslash-escaped, and newlines become <br> so the row stays on one line.
+ const escapeCell = (value) => String(value ?? '-').replaceAll('|', '\\|').replaceAll('\n', '<br>')
+ // Escapes the characters that are significant in XML text and double-quoted
+ // attribute values. '&' is replaced first so the entities produced by the later
+ // replacements are not escaped a second time.
+ const escapeXml = (value) => String(value)
+   .replaceAll('&', '&amp;')
+   .replaceAll('<', '&lt;')
+   .replaceAll('>', '&gt;')
+   .replaceAll('"', '&quot;')
+
+ const humanProbe = (row) => {
+ if (row.observation?.label) return row.observation.label
+ const probe = row.observation?.dimensions?.probe
+ const name = row.observation?.name || 'unknown'
+ const labels = {
+ shell_eval_traced: 'Shell eval with OTEL trace',
+ shell_eval_warm: 'Warm shell eval',
+ tasks_list: 'devenv tasks list',
+ processes_help: 'devenv processes --help',
+ task_pnpm_install: 'pnpm:install',
+ task_genie_run: 'genie:run',
+ task_check_quick: 'check:quick',
+ task_check_quick_warm: 'Warm cached check:quick',
+ task_check_quick_forced: 'Forced check:quick',
+ }
+ if (probe && labels[probe]) return labels[probe]
+ if (name.startsWith('devenv.') && name.endsWith('.duration')) {
+ return name.slice('devenv.'.length, -'.duration'.length).replaceAll('_', ' ')
+ }
+ return name
+ }
+
+ // Builds the ' / '-joined group path of a row from the target path and group
+ // followed by the observation path and group, keeping only non-empty strings
+ // and the first occurrence of each segment. Returns '-' when nothing remains.
+ const semanticPath = (row) => {
+   const listOf = (value) => (Array.isArray(value) ? value : [])
+   const segments = [
+     ...listOf(row.target?.path),
+     row.target?.group,
+     ...listOf(row.observation?.path),
+     row.observation?.group,
+   ].filter((segment) => typeof segment === 'string' && segment.length > 0)
+   const distinct = [...new Set(segments)]
+   return distinct.length > 0 ? distinct.join(' / ') : '-'
+ }
+
+ const chartProbe = (row) => {
+ if (row.observation?.label) return row.observation.label
+ const probe = row.observation?.dimensions?.probe
+ const labels = {
+ shell_eval_traced: 'Shell eval with OTEL trace',
+ shell_eval_warm: 'Warm shell eval',
+ tasks_list: 'devenv tasks list',
+ processes_help: 'processes --help',
+ task_pnpm_install: 'pnpm:install',
+ task_genie_run: 'genie:run',
+ task_check_quick: 'check:quick',
+ task_check_quick_warm: 'Warm cached check:quick',
+ task_check_quick_forced: 'Forced check:quick',
+ }
+ if (probe && labels[probe]) return labels[probe]
+ return humanProbe(row)
+ }
+
+ // Lists a row's observation dimensions as key=value pairs sorted by key, one
+ // per line (HTML <br>, because the result lands in a Markdown table cell).
+ // Returns '-' when the row has no dimensions.
+ const dimensions = (row) => {
+   const entries = Object.entries(row.observation?.dimensions || {})
+   if (entries.length === 0) return '-'
+   return entries
+     .sort(([left], [right]) => left.localeCompare(right))
+     .map(([key, value]) => key + '=' + String(value))
+     .join('<br>')
+ }
+
+ // Sort priority by status: fail first, then warn, then every other status,
+ // with missing_baseline rows last.
+ const rank = (row) => {
+   switch (row.status) {
+     case 'fail': return 0
+     case 'warn': return 1
+     case 'missing_baseline': return 3
+     default: return 2
+   }
+ }
+
+ const allRows = Object.values(comparison.comparisons || {}).sort((left, right) => {
+ const byRank = rank(left) - rank(right)
+ if (byRank !== 0) return byRank
+ const leftImpact = typeof left.semanticImpactScore === 'number' ? Math.abs(left.semanticImpactScore) : 0
+ const rightImpact = typeof right.semanticImpactScore === 'number' ? Math.abs(right.semanticImpactScore) : 0
+ if (rightImpact !== leftImpact) return rightImpact - leftImpact
+ const leftDelta = typeof left.delta === 'number' ? Math.abs(left.delta) : 0
+ const rightDelta = typeof right.delta === 'number' ? Math.abs(right.delta) : 0
+ if (rightDelta !== leftDelta) return rightDelta - leftDelta
+ return humanProbe(left).localeCompare(humanProbe(right))
+ })
+ const protocolLabel = (() => {
+ const protocols = new Set(
+ allRows
+ .map((row) => row.observation?.dimensions?.measurementProtocol)
+ .filter((value) => typeof value === 'string' && value.length > 0),
+ )
+ return protocols.size > 0 ? Array.from(protocols).join(', ') : 'legacy'
+ })()
+ const visibleLimit = Number.isFinite(maxRows) && maxRows > 0 ? maxRows : 10
+ const comparableRows = allRows.filter((row) => typeof row.baseline === 'number')
+ const hasComparableBaseline = comparableRows.length > 0
+ const visibleRows = (hasComparableBaseline
+ ? allRows.filter((row) => typeof row.baseline === 'number')
+ : allRows.slice().sort((left, right) => (right.current || 0) - (left.current || 0))
+ ).slice(0, visibleLimit)
+
+ // Renders the main Markdown comparison table, one line per row. Multi-line
+ // cells use HTML <br> because a raw newline would end the table row.
+ // Returns a plain sentence instead of a table when there are no rows.
+ // NOTE(review): inline wrapper tags around the secondary lines may have been
+ // lost together with the <br>; confirm against the rendered comment.
+ const comparisonTable = (rows) => {
+   if (rows.length === 0) return 'No measurement regressions detected.'
+   return [
+     '| Group | Measurement | Baseline | Current | Raw change | Impact | Meaning | Gate | Evidence |',
+     '| --- | --- | ---: | ---: | ---: | ---: | --- | --- | --- |',
+     ...rows.map((row) => {
+       const unit = row.observation?.unit
+       // Prefer the robust noise band; fall back to the raw min/max range. Either
+       // is shown only when its two bounds are numbers and differ.
+       const baselineRange = typeof row.baselineRobustLower === 'number' && typeof row.baselineRobustUpper === 'number' && row.baselineRobustLower !== row.baselineRobustUpper
+         ? '<br>noise band ' + formatValue(row.baselineRobustLower, unit) + ' - ' + formatValue(row.baselineRobustUpper, unit)
+         : typeof row.baselineMin === 'number' && typeof row.baselineMax === 'number' && row.baselineMin !== row.baselineMax
+           ? '<br>range ' + formatValue(row.baselineMin, unit) + ' - ' + formatValue(row.baselineMax, unit)
+           : ''
+       const meaning = interpretation(row)
+       return '| ' + [
+         semanticPath(row),
+         humanProbe(row),
+         formatValue(row.baseline, unit) + baselineRange,
+         formatValue(row.current, unit),
+         formatDelta(row.delta, unit) + ' / ' + formatRatio(row.ratio),
+         formatRowImpact(row),
+         meaning.label + '<br>' + meaning.detail,
+         formatGate(row),
+         formatEvidence(row),
+       ].map(escapeCell).join(' | ') + ' |'
+     }),
+   ].join('\n')
+ }
+
+ const currentOnlyTable = (rows) => {
+ if (rows.length === 0) return 'No current measurements found.'
+ return [
+ '| Group | Measurement | Current |',
+ '| --- | --- | ---: |',
+ ...rows.map((row) => {
+ return '| ' + [semanticPath(row), humanProbe(row), formatValue(row.current, row.observation?.unit)].map(escapeCell).join(' | ') + ' |'
+ }),
+ ].join('\n')
+ }
+
+ const allMeasurementsTable = (rows) => {
+ if (rows.length === 0) return 'No measurement regressions detected.'
+ return [
+ '| Status | Gate | Target | Observation | Dimensions | Baseline | Current | Delta | Ratio | Impact |',
+ '| --- | --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: |',
+ ...rows.map((row) => {
+ const unit = row.observation?.unit
+ return '| ' + [
+ row.status,
+ row.gateable ? 'yes' : (row.gateReason || 'no'),
+ row.target?.label || row.target?.name || 'unknown',
+ row.observation?.label || row.observation?.name || 'unknown',
+ dimensions(row),
+ formatValue(row.baseline, unit),
+ formatValue(row.current, unit),
+ formatDelta(row.delta, unit),
+ formatRatio(row.ratio),
+ formatRowImpact(row),
+ ].map(escapeCell).join(' | ') + ' |'
+ }),
+ ].join('\n')
+ }
+
+ // Shortens the string form of a value to at most maxLength characters, ending
+ // in '...' when something was cut. Limits too small to hold the ellipsis plus
+ // text (below 3) hard-cut instead, so the result never exceeds the limit; a
+ // negative limit yields an empty string.
+ const truncate = (value, maxLength) => {
+   const text = String(value)
+   if (text.length <= maxLength) return text
+   if (maxLength < 3) return text.slice(0, Math.max(0, maxLength))
+   return text.slice(0, maxLength - 3) + '...'
+ }
+
+ const renderPerfChangeSvg = (rows, theme = 'adaptive') => { // Builds the chart text for gateable rows with numeric baseline, current and semanticImpactScore; returns an empty string when none qualify.
+ const chartRows = rows
+ .filter((row) => typeof row.current === 'number' && typeof row.baseline === 'number')
+ .filter((row) => row.gateable === true)
+ .filter((row) => typeof row.semanticImpactScore === 'number')
+ .sort((left, right) => (left.semanticImpactScore || 0) - (right.semanticImpactScore || 0)) // Ascending, so the most negative scores come first. NOTE(review): the slice below keeps the lowest scores when more than visibleLimit rows qualify - confirm that the highest scores may be dropped.
+ .slice(0, visibleLimit)
+ if (chartRows.length === 0) return ''
+
+ const impactScores = chartRows.map((row) => row.semanticImpactScore || 0) // NaN scores are treated as 0.
+ const minImpact = Math.min(-1, ...impactScores) // The axis always reaches at least -1 on the left.
+ const maxImpact = Math.max(1, ...impactScores) // The axis always reaches at least 1 on the right.
+ const lower = Math.floor(minImpact)
+ const upper = Math.ceil(maxImpact)
+ const span = upper - lower || 1 // The fallback to 1 is defensive; lower <= -1 and upper >= 1 already keep the span at 2 or more for finite scores.
+ const width = 1040
+ const rowHeight = 46
+ const height = 112 + chartRows.length * rowHeight + 34 // Fixed space above and below plus one band per row.
+ const labelX = 230
+ const plotX = 252
+ const plotWidth = 320
+ const impactX = 596
+ const nominalX = 672
+ const meaningX = 804
+ const topY = 92
+ const barHeight = 18
+ const zeroX = plotX + ((0 - lower) / span) * plotWidth // Horizontal position of impact 0 inside the plot area.
+ const themeCss = theme === 'dark' // dark gets a fixed dark palette; any other theme gets the light palette, and adaptive also appends a prefers-color-scheme dark override.
+ ? [
+ ' .chart-bg { fill: #0d1117; }',
+ ' .chart-border { fill: none; stroke: #30363d; }',
+ ' .chart-title { fill: #f0f6fc; }',
+ ' .chart-muted { fill: #8b949e; }',
+ ' .chart-axis { stroke: #8b949e; }',
+ ' .chart-label { fill: #c9d1d9; }',
+ ' .chart-value { fill: #8b949e; }',
+ ' .chart-track { fill: #21262d; }',
+ ]
+ : [
+ ' .chart-bg { fill: #ffffff; }',
+ ' .chart-border { fill: none; stroke: #d0d7de; }',
+ ' .chart-title { fill: #24292f; }',
+ ' .chart-muted { fill: #57606a; }',
+ ' .chart-axis { stroke: #8c959f; }',
+ ' .chart-label { fill: #24292f; }',
+ ' .chart-value { fill: #57606a; }',
+ ' .chart-track { fill: #f6f8fa; }',
+ ...(theme === 'adaptive'
+ ? [
+ ' @media (prefers-color-scheme: dark) {',
+ ' .chart-bg { fill: #0d1117; }',
+ ' .chart-border { stroke: #30363d; }',
+ ' .chart-title { fill: #f0f6fc; }',
+ ' .chart-muted { fill: #8b949e; }',
+ ' .chart-axis { stroke: #8b949e; }',
+ ' .chart-label { fill: #c9d1d9; }',
+ ' .chart-value { fill: #8b949e; }',
+ ' .chart-track { fill: #21262d; }',
+ ' }',
+ ]
+ : []),
+ ]
+
+ const svg = [
+ '',
+ '',
+ '',
+ '',
+ '',
+ 'Actionable measurement impact',
+ '0 means no actionable PR impact; 1x reaches the warning budget.',
+ 'improved',
+ 'regressed',
+ 'impact',
+ 'baseline -> current',
+ 'meaning',
+ '',
+ ]
+
+ for (const [index, row] of chartRows.entries()) {
+ const impact = row.semanticImpactScore || 0
+ const y = topY + index * rowHeight
+ const valueWidth = Math.max(2, Math.abs(impact) / span * plotWidth) // Bars are at least 2 units wide so near-zero impacts stay visible.
+ const x = impact < 0 ? zeroX - valueWidth : zeroX // Negative impacts extend left from the zero line, all others extend right.
+ const meaning = interpretation(row)
+ const color = meaning.color
+ const formattedImpact = formatSemanticImpact(impact)
+ const label = chartProbe(row)
+ const nominal = formatValue(row.baseline, row.observation?.unit).replaceAll(' ', '') + ' -> ' + formatValue(row.current, row.observation?.unit).replaceAll(' ', '') // Spaces are stripped from each formatted value to keep the baseline -> current text compact.
+ const barOpacity = meaning.tone === 'neutral' ? '0.65' : '1' // Neutral rows are drawn with reduced opacity.
+ const dash = meaning.tone === 'diagnostic' ? ' stroke-dasharray="3 3"' : '' // Diagnostic rows get a dashed stroke attribute.
+ svg.push(
+ '' + escapeXml(label) + '' + escapeXml(truncate(label, 28)) + '',
+ '',
+ '',
+ '' + escapeXml(formattedImpact) + '',
+ '' + escapeXml(nominal) + '' + escapeXml(truncate(nominal, 21)) + '',
+ '' + escapeXml(meaning.detail) + '' + escapeXml(truncate(meaning.label, 30)) + '',
+ )
+ }
+
+ svg.push(
+ '0',
+ '',
+ )
+ return svg.join('\n')
+ }
+
+ const statusWord = comparison.status || 'unknown' // A missing comparison status is reported as unknown.
+ const readiness = comparison.readiness || {}
+ const readinessLabel = readiness.enforceable // Either the word enforceable or a partial label with gateable and enabled counts.
+ ? 'enforceable'
+ : 'partial (' + (readiness.gateableCount ?? 0) + '/' + (readiness.enabledCount ?? 0) + ' enabled observations gateable)'
+ const runUrl = runId ? serverUrl + '/' + repo + '/actions/runs/' + runId : undefined // Stays undefined when runId is not set.
+ const shortSha = (headSha || sha || 'unknown').slice(0, 7) // First seven characters of headSha, else sha, else the word unknown.
+ const existingState = extractState(existing?.body) // State recovered from the existing comment body, if any; extractState is defined outside this view.
+ const currentRun = { // Snapshot of this run that is embedded in the comment state and later rendered as a history row.
+ commitSha: headSha || sha || 'unknown',
+ shortSha,
+ generatedAt: new Date().toISOString(),
+ status: statusWord,
+ mode: comparison.mode || 'unknown',
+ runUrl,
+ runAttempt,
+ workflow,
+ job,
+ visibleRows: visibleRows.map((row) => ({ // Rows are stored as pre-formatted strings, so history rendering does not need the raw numbers.
+ status: row.status,
+ target: row.target?.label || row.target?.name || 'unknown',
+ observation: row.observation?.label || row.observation?.name || 'unknown',
+ meaning: interpretation(row).label,
+ dimensions: dimensions(row).replaceAll('
', ', '),
+ baseline: formatValue(row.baseline, row.observation?.unit),
+ current: formatValue(row.current, row.observation?.unit),
+ delta: formatDelta(row.delta, row.observation?.unit),
+ ratio: formatRatio(row.ratio),
+ impact: formatSemanticImpact(row.semanticImpactScore),
+ })),
+ }
+ const hasComparableHistory = (run) => Array.isArray(run.visibleRows) && run.visibleRows.some((row) => // True when at least one stored row had baseline data.
+ row.status !== 'missing_baseline' &&
+ row.baseline !== 'n/a' && // Stored rows hold formatted strings, so this compares against the n/a text; presumably that is what the formatters emit for missing values - confirm.
+ row.ratio !== 'n/a'
+ )
+ const previousRuns = (existingState?.runs || []).filter((run) => run.commitSha !== currentRun.commitSha && hasComparableHistory(run)) // Drops earlier entries for the current commit and entries without comparable rows.
+ const historyLimit = Number.isFinite(maxHistory) && maxHistory > 0 ? maxHistory : 20 // Falls back to 20 when maxHistory is not a positive finite number.
+ const state = { _tag: stateTag, schemaVersion, title, runs: [currentRun, ...previousRuns].slice(0, historyLimit) } // The current run is always index 0; older runs follow in their stored order.
+ const gateModeLabel = (mode) => { // Maps gate modes to display words; unrecognised modes pass through and falsy ones become unknown.
+ if (mode === 'fail') return 'enforced'
+ if (mode === 'warn') return 'advisory'
+ if (mode === 'off') return 'off'
+ return mode || 'unknown'
+ }
+ const historyRows = state.runs.slice(1).map((run) => { // slice(1) skips the current run, which the summary lines already describe.
+ const link = run.runUrl ? '[' + run.shortSha + '](' + run.runUrl + ')' : run.shortSha // Plain short sha when the stored run has no URL.
+ const top = Array.isArray(run.visibleRows) && run.visibleRows.length > 0 // At most three rows are summarised per past run.
+ ? run.visibleRows.slice(0, 3).map((row) => (row.meaning || row.status) + ' ' + row.target + ' ' + row.observation + ' ' + row.delta + ' / ' + row.ratio).join('
')
+ : 'No regressions'
+ return '| ' + [link, run.status, gateModeLabel(run.mode), top].map(escapeCell).join(' | ') + ' |'
+ })
+
+ const runLink = runUrl ? '[workflow run](' + runUrl + ')' : 'workflow run unavailable'
+ const baselineProvenance = comparison.baselineProvenance
+ const baselineLabel = baselineProvenance?.runId // Links the baseline run and, when several runs contributed, notes how many older ones there were.
+ ? '[main run ' + baselineProvenance.runId + '](' + serverUrl + '/' + repo + '/actions/runs/' + baselineProvenance.runId + ')' +
+ (Array.isArray(baselineProvenance.runs) && baselineProvenance.runs.length > 1 ? ' + ' + (baselineProvenance.runs.length - 1) + ' older baseline runs' : '')
+ : 'not available'
+ const chartSvg = hasComparableBaseline ? renderPerfChangeSvg(visibleRows.length > 0 ? visibleRows : allRows) : '' // Charts are rendered only with a comparable baseline; allRows is the fallback when visibleRows is empty.
+ const chartDarkSvg = hasComparableBaseline ? renderPerfChangeSvg(visibleRows.length > 0 ? visibleRows : allRows, 'dark') : ''
+ if (chartPath && chartSvg) writeFileSync(chartPath, chartSvg) // Files are written only when a path is configured and the chart text is non-empty.
+ if (chartDarkPath && chartDarkSvg) writeFileSync(chartDarkPath, chartDarkSvg)
+ const chartImageMarkdown = chartUrl && chartSvg
+ ? (chartDarkUrl
+ ? '\n' +
+ ' \n' +
+ ' \n' +
+ '
\n' +
+ ''
+ : '')
+ : ''
+ const chartMarkdown = chartImageMarkdown // The SVG source link is appended only when an image is embedded and chartSourceUrl is set.
+ ? chartImageMarkdown +
+ (chartSourceUrl ? '\n\n[SVG source](' + chartSourceUrl + ')' : '')
+ : ''
+
+
+ const summaryLines = [ // Comment body, assembled as an array of Markdown lines and joined with newlines at the end.
+ '## ' + title,
+ '',
+ '- Status: ' + statusWord,
+ '- Gate: ' + gateModeLabel(comparison.mode),
+ '- Commit: ' + shortSha,
+ '- Run: ' + runLink,
+ '- Baseline: ' + baselineLabel,
+ '- Readiness: ' + readinessLabel,
+ '- Protocol: ' + protocolLabel,
+ '',
+ hasComparableBaseline
+ ? 'Chart: bars show semantic impact. A value of 0 means the raw change is not actionable for this PR; raw percentage and nominal values stay in the table.'
+ : 'No compatible baseline was available, so this run shows current measurements only.',
+ '',
+ chartMarkdown,
+ '',
+ hasComparableBaseline ? comparisonTable(visibleRows) : currentOnlyTable(visibleRows), // Without a comparable baseline only current values are tabulated.
+ '',
+ '',
+ 'All measurements
',
+ '',
+ allMeasurementsTable(allRows),
+ '',
+ ' ',
+ ]
+
+ if (historyRows.length > 0) { // The history table is emitted only when earlier runs exist.
+ summaryLines.push(
+ '',
+ '',
+ 'Previous runs
',
+ '',
+ '| Commit | Status | Gate | Top changes |',
+ '| --- | --- | --- | --- |',
+ ...historyRows,
+ '',
+ ' ',
+ )
+ }
+
+ summaryLines.push('', marker, statePrefix + JSON.stringify(state, null, 2) + stateSuffix) // Appends the marker plus the JSON state wrapped in statePrefix and stateSuffix; extractState reads state back from an existing body.
+ writeFileSync(bodyPath, summaryLines.join('\n') + '\n')
+ writeFileSync(commentIdPath, existing?.id ? String(existing.id) : '') // Writes the existing comment id, or an empty string when there is none.
+ EOF
+
+ node "$renderer_script" "$comparison_file" "$comments_json" "$comment_body" "$comment_id_file" "$chart_file" "$chart_dark_file"
+
+ if [ -s "$chart_file" ]; then # Asset publishing only runs when the renderer wrote a non-empty chart SVG.
+ if [ "$require_public_asset" = "true" ] && [ -z "$public_asset_command" ]; then # Fail fast: a chart that needs a public asset host cannot be published without a publish command.
+ echo "::error::CI measurement chart was rendered for a private repository, but CI_MEASUREMENT_PR_COMMENT_PUBLIC_ASSET_COMMAND is not configured. Private raw GitHub URLs cannot be embedded in PR comments."
+ exit 1
+ fi
+
+ if ensure_ci_measurement_tool resvg resvg; then # PNG rasterisation is best effort; failures below only emit notices.
+ resvg_font_args=()
+ if command -v nix >/dev/null 2>&1; then
+ if font_out="$(nix build --no-link --print-out-paths nixpkgs#dejavu_fonts 2>/dev/null)"; then
+ resvg_font_args+=(--use-fonts-dir "$font_out/share/fonts/truetype")
+ fi
+ fi
+ if ! resvg --background '#ffffff' "${resvg_font_args[@]}" "$chart_file" "$chart_png_file"; then
+ echo "::notice::unable to render CI measurement chart PNG"
+ rm -f "$chart_png_file" # Remove any partial output so the later -s checks treat the PNG as absent.
+ fi
+ if [ -s "$chart_dark_file" ] && ! resvg --background '#0d1117' "${resvg_font_args[@]}" "$chart_dark_file" "$chart_dark_png_file"; then
+ echo "::notice::unable to render dark CI measurement chart PNG"
+ rm -f "$chart_dark_png_file"
+ fi
+ else
+ echo "::notice::resvg is not available; skipping embedded CI measurement chart PNG"
+ fi
+
+ if ! gh api "repos/$repo/git/ref/heads/$asset_branch" >/dev/null 2>&1; then # Create the asset branch when it does not exist yet.
+ default_branch_sha="$(gh api "repos/$repo/git/ref/heads/${GITHUB_BASE_REF:-main}" --jq '.object.sha' 2>/dev/null || true)"
+ if [ -z "$default_branch_sha" ]; then # Fall back to the commit under test when the base ref lookup fails.
+ default_branch_sha="${GITHUB_SHA:-}"
+ fi
+ if [ -n "$default_branch_sha" ]; then
+ gh api "repos/$repo/git/refs" --method POST --field ref="refs/heads/$asset_branch" --field sha="$default_branch_sha" >/dev/null || true
+ fi
+ fi
+ chart_content="$(base64 <"$chart_file" | tr -d '\n')"
+ if ! gh api "repos/$repo/contents/$asset_svg_path" --method PUT --field message="Update CI measurement chart SVG for PR #$pr_number" --field content="$chart_content" --field branch="$asset_branch" >/dev/null; then # NOTE(review): no sha field is sent; the GitHub contents API requires sha when updating an existing file, so this presumably relies on asset paths that are new for every upload - confirm.
+ echo "::notice::unable to upload CI measurement chart SVG asset"
+ if [ -z "$public_asset_command" ]; then # Without a public asset command, drop the SVG source link line from the comment body.
+ sed -i.bak '/\[SVG source\]/d' "$comment_body"
+ fi
+ fi
+ if [ -s "$chart_png_file" ]; then
+ chart_png_content="$(base64 <"$chart_png_file" | tr -d '\n')"
+ if ! gh api "repos/$repo/contents/$asset_png_path" --method PUT --field message="Update CI measurement chart PNG for PR #$pr_number" --field content="$chart_png_content" --field branch="$asset_branch" >/dev/null; then
+ echo "::notice::unable to upload CI measurement chart PNG asset"
+ if [ -z "$public_asset_command" ]; then
+ sed -i.bak '/!\[Measurement change vs baseline chart\]/d; /!\[Perf change vs baseline chart\]/d; //,/<\\/picture>/d' "$comment_body"
+ fi
+ fi
+ else # No PNG was produced, so the image markup is stripped from the comment body.
+ sed -i.bak '/!\[Measurement change vs baseline chart\]/d; /!\[Perf change vs baseline chart\]/d; //,/<\\/picture>/d' "$comment_body"
+ fi
+ if [ -s "$chart_dark_png_file" ]; then
+ chart_dark_png_content="$(base64 <"$chart_dark_png_file" | tr -d '\n')"
+ if ! gh api "repos/$repo/contents/$asset_dark_png_path" --method PUT --field message="Update dark CI measurement chart PNG for PR #$pr_number" --field content="$chart_dark_png_content" --field branch="$asset_branch" >/dev/null; then
+ echo "::notice::unable to upload dark CI measurement chart PNG asset"
+ if [ -z "$public_asset_command" ]; then # Re-render the comment with the dark chart URL cleared.
+ export CI_MEASUREMENT_PR_COMMENT_CHART_DARK_URL=""
+ node "$renderer_script" "$comparison_file" "$comments_json" "$comment_body" "$comment_id_file" "$chart_file" "$chart_dark_file"
+ fi
+ fi
+ fi
+
+ if [ -n "$public_asset_command" ] && [ -s "$chart_png_file" ]; then # Optional second channel: publish through the configured command, which prints the resulting URL on stdout.
+ if public_chart_url="$(bash -c "$public_asset_command" _ "$chart_png_file" png)" && [ -n "$public_chart_url" ]; then
+ chart_url="$public_chart_url"
+ export CI_MEASUREMENT_PR_COMMENT_CHART_URL="$chart_url"
+ else
+ echo "::notice::unable to publish CI measurement chart PNG to public asset host"
+ export CI_MEASUREMENT_PR_COMMENT_CHART_URL=""
+ fi
+ if [ -s "$chart_dark_png_file" ] && public_chart_dark_url="$(bash -c "$public_asset_command" _ "$chart_dark_png_file" png)" && [ -n "$public_chart_dark_url" ]; then # NOTE(review): when no dark PNG exists the else branch still prints the unable-to-publish notice - confirm that is acceptable.
+ chart_dark_url="$public_chart_dark_url"
+ export CI_MEASUREMENT_PR_COMMENT_CHART_DARK_URL="$chart_dark_url"
+ else
+ echo "::notice::unable to publish dark CI measurement chart PNG to public asset host"
+ export CI_MEASUREMENT_PR_COMMENT_CHART_DARK_URL=""
+ fi
+ if public_chart_source_url="$(bash -c "$public_asset_command" _ "$chart_file" svg)" && [ -n "$public_chart_source_url" ]; then
+ chart_source_url="$public_chart_source_url"
+ export CI_MEASUREMENT_PR_COMMENT_CHART_SOURCE_URL="$chart_source_url"
+ else
+ echo "::notice::unable to publish CI measurement chart SVG to public asset host"
+ export CI_MEASUREMENT_PR_COMMENT_CHART_SOURCE_URL=""
+ fi
+ if [ "$require_public_asset" = "true" ] && [ -z "$chart_url" ]; then
+ echo "::error::unable to publish CI measurement chart PNG to a public asset host for private repository $repo"
+ exit 1
+ fi
+ if [ "$require_public_asset" = "true" ] && [ -s "$chart_dark_png_file" ] && [ -z "$chart_dark_url" ]; then
+ echo "::error::unable to publish dark CI measurement chart PNG to a public asset host for private repository $repo"
+ exit 1
+ fi
+ node "$renderer_script" "$comparison_file" "$comments_json" "$comment_body" "$comment_id_file" "$chart_file" "$chart_dark_file" # Final re-render after the URL variables were exported; presumably the renderer reads them from the environment - confirm.
+ fi
+ fi
+
+ comment_id="$(cat "$comment_id_file")" # Empty when the renderer found no existing comment.
+ comment_payload_file="$comment_body.payload.json"
+ node -e "const fs=require('node:fs'); fs.writeFileSync(process.argv[2], JSON.stringify({ body: fs.readFileSync(process.argv[1], 'utf8') }))" "$comment_body" "$comment_payload_file"
+ if [ -n "$comment_id" ]; then # PATCH the existing comment when an id is known, otherwise POST a new one; failures only emit notices.
+ if ! gh api "repos/$repo/issues/comments/$comment_id" --method PATCH --input "$comment_payload_file" >/dev/null; then
+ echo "::notice::unable to update CI measurement PR comment"
+ fi
+ else
+ if ! gh api "repos/$repo/issues/$pr_number/comments" --method POST --input "$comment_payload_file" >/dev/null; then
+ echo "::notice::unable to create CI measurement PR comment"
+ fi
+ fi
+ fi
+ fi
+ fi
+
+ if [ "$exit_code" -ne 0 ]; then
+ exit "$exit_code"
+ fi
+
+ - name: 'Upload CI measurements: nix-closure-measurements'
+ if: always() # Runs regardless of the outcome of earlier steps.
+ uses: actions/upload-artifact@v4
+ with: # The negated path pattern below keeps tmp/nix-closure-ci/baseline out of the artifact.
+ name: nix-closure-measurements
+ path: |
+ tmp/nix-closure-ci
+ !tmp/nix-closure-ci/baseline/**
+ if-no-files-found: error # A run that produced no measurement files fails this step.
+ retention-days: 30
+ - name: Save pnpm state
+ if: ${{ success() && steps.restore-pnpm-state.outputs.cache-hit != 'true' }} # Saves only after success and only when the restore step did not report a cache hit.
+ uses: actions/cache/save@v4
+ with:
+ path: |
+ ${{ github.workspace }}/.pnpm-home
+ ${{ runner.temp }}/pnpm-store/${{ github.job }}
+ key: "pnpm-state-v1-${{ runner.os }}-${{ runner.arch }}-${{ hashFiles('**/pnpm-lock.yaml') }}"
+ - name: Nix diagnostics summary
+ if: failure() # Only runs after an earlier step failed.
+ shell: bash
+ run: |
+ diag_dir="${NIX_STORE_DIAGNOSTICS_DIR:-}"
+ if [ -z "$diag_dir" ] || [ ! -d "$diag_dir" ]; then # No diagnostics were captured: write a short note to the step summary and stop successfully.
+ echo "## Nix Store Diagnostics" >> "$GITHUB_STEP_SUMMARY"
+ echo "" >> "$GITHUB_STEP_SUMMARY"
+ echo "No diagnostics directory found (validation may have failed before capture)." >> "$GITHUB_STEP_SUMMARY"
+ exit 0
+ fi
+
+ {
+ echo "## Nix Store Diagnostics"
+ echo ""
+ echo "Temporary instrumentation for #272; remove after root cause is confirmed and CI is stable."
+ echo ""
+ echo "- Diagnostics directory: \`$diag_dir\`"
+ echo "- Tracking issue: https://github.com/overengineeringstudio/effect-utils/issues/272"
+ } >> "$GITHUB_STEP_SUMMARY"
+
+ markers_file="${RUNNER_TEMP:-/tmp}/nix-store-signature-markers.txt"
+ grep -R -n -E "config\\.cachix|cachix\\.package|error: path '/nix/store/.+ is not valid" --exclude="$(basename "$markers_file")" "$diag_dir" > "$markers_file" || true
+
+ if [ -s "$markers_file" ]; then # Matches were found: include at most the first 120 lines in the summary.
+ {
+ echo ""
+ echo "### Signature markers"
+ echo '```text'
+ head -n 120 "$markers_file"
+ echo '```'
+ } >> "$GITHUB_STEP_SUMMARY"
+ else
+ echo "" >> "$GITHUB_STEP_SUMMARY"
+ echo "- No signature markers found in captured diagnostics." >> "$GITHUB_STEP_SUMMARY"
+ fi
+ - name: Upload Nix diagnostics artifact
+ if: failure() && env.NIX_STORE_DIAGNOSTICS_DIR != '' # Only after a failure and only when a diagnostics directory is configured.
+ uses: actions/upload-artifact@v4
+ with:
+ name: 'nix-store-diagnostics-${{ github.job }}-${{ runner.os }}-run-${{ github.run_id }}-attempt-${{ github.run_attempt }}'
+ path: ${{ env.NIX_STORE_DIAGNOSTICS_DIR }}
+ if-no-files-found: ignore # An empty diagnostics directory does not fail the step.
+ retention-days: 14
+ - name: Failure note
+ if: failure() # Prints a pointer to the tracking issue whenever the job failed.
+ shell: bash
+ run: |
+ echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:"
+ echo " https://github.com/overengineeringstudio/effect-utils/issues/201"
source-shape:
runs-on:
[namespace-profile-linux-x86-64, 'namespace-features:github.run-id=${{ github.run_id }}']
diff --git a/.github/workflows/ci.yml.genie.ts b/.github/workflows/ci.yml.genie.ts
index f66b9215d..f3dcb128a 100644
--- a/.github/workflows/ci.yml.genie.ts
+++ b/.github/workflows/ci.yml.genie.ts
@@ -22,9 +22,11 @@ import {
ciMeasurementsCommentPermissions,
ciMeasurementsArtifactStep,
compareCiMeasurementsStep,
+ defaultNixClosureMeasurementBuckets,
devenvPerfJob,
downloadPreviousGitHubArtifactStep,
namespaceRunner,
+ nixClosureMeasurementSteps,
sourceShapeMeasurementStep,
validateColdPnpmDepsStep,
nixDiagnosticsArtifactStep,
@@ -277,6 +279,39 @@ const jobs: Record | ReturnType = {
@@ -407,6 +442,38 @@ const extraJobs: Record = {
}),
'timeout-minutes': jobTimeoutMinutes,
},
+ 'nix-closure-sizes': { // Job definition that runs the Nix closure measurement steps and their PR comment.
+ if: normalCiIf,
+ 'runs-on': namespaceRunner({
+ profile: 'namespace-profile-linux-x86-64',
+ runId: '${{ github.run_id }}',
+ }),
+ 'timeout-minutes': jobTimeoutMinutes,
+ defaults: bashShellDefaults,
+ permissions: ciMeasurementsCommentPermissions,
+ env: ciMeasurementSubjectEnv,
+ steps: [
+ ...baseSteps,
+ ...nixClosureMeasurementSteps({
+ artifactName: 'nix-closure-measurements', // Same name as the upload-artifact step in the generated ci.yml.
+ artifactDir: nixClosureMeasurementsDir,
+ baselineMaxRuns: 20, // Same limit as the BASELINE_MAX_RUNS bump to 20 elsewhere in this patch.
+ targets: nixClosureMeasurementTargets,
+ buckets: defaultNixClosureMeasurementBuckets,
+ regressionMode: 'warn', // The comment renderer labels warn mode as advisory rather than enforced.
+ prComment: {
+ enabled: true,
+ title: 'Nix Closure Measurements',
+ maxRows: 8,
+ maxHistory: 20,
+ },
+ }),
+ savePnpmStateStep(), // These trailing steps correspond to the Save pnpm state, Nix diagnostics and Failure note steps in the generated YAML.
+ nixDiagnosticsSummaryStep,
+ nixDiagnosticsArtifactStep(),
+ failureReminderStep,
+ ],
+ },
'source-shape': {
'runs-on': namespaceRunner({
profile: 'namespace-profile-linux-x86-64',
From 21d7273abdd3339f7523d7d7fb2cf5bdfb8a0205 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Tue, 19 May 2026 12:17:23 +0200
Subject: [PATCH 56/81] Split zero-impact measurement rows
---
.github/workflows/ci.yml | 96 +++++++++++++++++++++++++++++--
genie/ci-workflow/measurements.ts | 32 ++++++++++-
2 files changed, 120 insertions(+), 8 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 42754ce29..784a805d7 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -4150,6 +4150,13 @@ jobs:
? allRows.filter((row) => typeof row.baseline === 'number')
: allRows.slice().sort((left, right) => (right.current || 0) - (left.current || 0))
).slice(0, visibleLimit)
+ const isZeroImpactRow = (row) => // True only for numeric, non-NaN scores whose magnitude is below 0.005.
+ typeof row.semanticImpactScore === 'number' &&
+ !Number.isNaN(row.semanticImpactScore) &&
+ Math.abs(row.semanticImpactScore) < 0.005 // Presumably the cutoff for a score that displays as 0.00x - confirm against formatSemanticImpact.
+ const nonZeroImpactRows = comparableRows.filter((row) => !isZeroImpactRow(row)) // Rows with a missing or NaN score also land here.
+ const zeroImpactRows = comparableRows.filter(isZeroImpactRow)
+ const visibleNonZeroImpactRows = nonZeroImpactRows.slice(0, visibleLimit) // Only the main table is capped; zeroImpactRows is passed to comparisonTable uncapped.
const comparisonTable = (rows) => {
if (rows.length === 0) return 'No measurement regressions detected.'
@@ -4424,7 +4431,28 @@ jobs:
'',
chartMarkdown,
'',
- hasComparableBaseline ? comparisonTable(visibleRows) : currentOnlyTable(visibleRows),
+ hasComparableBaseline
+ ? (visibleNonZeroImpactRows.length > 0
+ ? comparisonTable(visibleNonZeroImpactRows)
+ : 'No non-zero actionable measurement impact detected.')
+ : currentOnlyTable(visibleRows),
+ ]
+
+ if (hasComparableBaseline && zeroImpactRows.length > 0) {
+ summaryLines.push(
+ '',
+ '',
+ 'Zero-impact measurements (' + zeroImpactRows.length + ')
',
+ '',
+ 'These rows had compatible baseline data, but their semantic impact rounded to 0.00x because the movement was below the configured budget, below the noise floor, or inside the robust noise band.',
+ '',
+ comparisonTable(zeroImpactRows),
+ '',
+ ' ',
+ )
+ }
+
+ summaryLines.push(
'',
'',
'All measurements
',
@@ -4432,7 +4460,7 @@ jobs:
allMeasurementsTable(allRows),
'',
' ',
- ]
+ )
if (historyRows.length > 0) {
summaryLines.push(
@@ -6333,6 +6361,13 @@ jobs:
? allRows.filter((row) => typeof row.baseline === 'number')
: allRows.slice().sort((left, right) => (right.current || 0) - (left.current || 0))
).slice(0, visibleLimit)
+ const isZeroImpactRow = (row) => // True only for numeric, non-NaN scores whose magnitude is below 0.005.
+ typeof row.semanticImpactScore === 'number' &&
+ !Number.isNaN(row.semanticImpactScore) &&
+ Math.abs(row.semanticImpactScore) < 0.005 // Presumably the cutoff for a score that displays as 0.00x - confirm against formatSemanticImpact.
+ const nonZeroImpactRows = comparableRows.filter((row) => !isZeroImpactRow(row)) // Rows with a missing or NaN score also land here.
+ const zeroImpactRows = comparableRows.filter(isZeroImpactRow)
+ const visibleNonZeroImpactRows = nonZeroImpactRows.slice(0, visibleLimit) // Only the main table is capped; zeroImpactRows is passed to comparisonTable uncapped.
const comparisonTable = (rows) => {
if (rows.length === 0) return 'No measurement regressions detected.'
@@ -6607,7 +6642,28 @@ jobs:
'',
chartMarkdown,
'',
- hasComparableBaseline ? comparisonTable(visibleRows) : currentOnlyTable(visibleRows),
+ hasComparableBaseline
+ ? (visibleNonZeroImpactRows.length > 0
+ ? comparisonTable(visibleNonZeroImpactRows)
+ : 'No non-zero actionable measurement impact detected.')
+ : currentOnlyTable(visibleRows),
+ ]
+
+ if (hasComparableBaseline && zeroImpactRows.length > 0) {
+ summaryLines.push(
+ '',
+ '',
+ 'Zero-impact measurements (' + zeroImpactRows.length + ')
',
+ '',
+ 'These rows had compatible baseline data, but their semantic impact rounded to 0.00x because the movement was below the configured budget, below the noise floor, or inside the robust noise band.',
+ '',
+ comparisonTable(zeroImpactRows),
+ '',
+ ' ',
+ )
+ }
+
+ summaryLines.push(
'',
'',
'All measurements
',
@@ -6615,7 +6671,7 @@ jobs:
allMeasurementsTable(allRows),
'',
' ',
- ]
+ )
if (historyRows.length > 0) {
summaryLines.push(
@@ -8167,6 +8223,13 @@ jobs:
? allRows.filter((row) => typeof row.baseline === 'number')
: allRows.slice().sort((left, right) => (right.current || 0) - (left.current || 0))
).slice(0, visibleLimit)
+ const isZeroImpactRow = (row) => // True only for numeric, non-NaN scores whose magnitude is below 0.005.
+ typeof row.semanticImpactScore === 'number' &&
+ !Number.isNaN(row.semanticImpactScore) &&
+ Math.abs(row.semanticImpactScore) < 0.005 // Presumably the cutoff for a score that displays as 0.00x - confirm against formatSemanticImpact.
+ const nonZeroImpactRows = comparableRows.filter((row) => !isZeroImpactRow(row)) // Rows with a missing or NaN score also land here.
+ const zeroImpactRows = comparableRows.filter(isZeroImpactRow)
+ const visibleNonZeroImpactRows = nonZeroImpactRows.slice(0, visibleLimit) // Only the main table is capped; zeroImpactRows is passed to comparisonTable uncapped.
const comparisonTable = (rows) => {
if (rows.length === 0) return 'No measurement regressions detected.'
@@ -8441,7 +8504,28 @@ jobs:
'',
chartMarkdown,
'',
- hasComparableBaseline ? comparisonTable(visibleRows) : currentOnlyTable(visibleRows),
+ hasComparableBaseline
+ ? (visibleNonZeroImpactRows.length > 0
+ ? comparisonTable(visibleNonZeroImpactRows)
+ : 'No non-zero actionable measurement impact detected.')
+ : currentOnlyTable(visibleRows),
+ ]
+
+ if (hasComparableBaseline && zeroImpactRows.length > 0) {
+ summaryLines.push(
+ '',
+ '',
+ 'Zero-impact measurements (' + zeroImpactRows.length + ')
',
+ '',
+ 'These rows had compatible baseline data, but their semantic impact rounded to 0.00x because the movement was below the configured budget, below the noise floor, or inside the robust noise band.',
+ '',
+ comparisonTable(zeroImpactRows),
+ '',
+ ' ',
+ )
+ }
+
+ summaryLines.push(
'',
'',
'All measurements
',
@@ -8449,7 +8533,7 @@ jobs:
allMeasurementsTable(allRows),
'',
' ',
- ]
+ )
if (historyRows.length > 0) {
summaryLines.push(
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index 8a8943da9..62373b745 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -2773,6 +2773,13 @@ const visibleRows = (hasComparableBaseline
? allRows.filter((row) => typeof row.baseline === 'number')
: allRows.slice().sort((left, right) => (right.current || 0) - (left.current || 0))
).slice(0, visibleLimit)
+const isZeroImpactRow = (row) => // True only for numeric, non-NaN scores whose magnitude is below 0.005.
+ typeof row.semanticImpactScore === 'number' &&
+ !Number.isNaN(row.semanticImpactScore) &&
+ Math.abs(row.semanticImpactScore) < 0.005 // Presumably the cutoff for a score that displays as 0.00x - confirm against formatSemanticImpact.
+const nonZeroImpactRows = comparableRows.filter((row) => !isZeroImpactRow(row)) // Rows with a missing or NaN score also land here.
+const zeroImpactRows = comparableRows.filter(isZeroImpactRow)
+const visibleNonZeroImpactRows = nonZeroImpactRows.slice(0, visibleLimit) // Only the main table is capped; zeroImpactRows is passed to comparisonTable uncapped.
const comparisonTable = (rows) => {
if (rows.length === 0) return 'No measurement regressions detected.'
@@ -3047,7 +3054,28 @@ const summaryLines = [
'',
chartMarkdown,
'',
- hasComparableBaseline ? comparisonTable(visibleRows) : currentOnlyTable(visibleRows),
+ hasComparableBaseline
+ ? (visibleNonZeroImpactRows.length > 0
+ ? comparisonTable(visibleNonZeroImpactRows)
+ : 'No non-zero actionable measurement impact detected.')
+ : currentOnlyTable(visibleRows),
+]
+
+if (hasComparableBaseline && zeroImpactRows.length > 0) {
+ summaryLines.push(
+ '',
+ '',
+ 'Zero-impact measurements (' + zeroImpactRows.length + ')
',
+ '',
+ 'These rows had compatible baseline data, but their semantic impact rounded to 0.00x because the movement was below the configured budget, below the noise floor, or inside the robust noise band.',
+ '',
+ comparisonTable(zeroImpactRows),
+ '',
+ ' ',
+ )
+}
+
+summaryLines.push(
'',
'',
'All measurements
',
@@ -3055,7 +3083,7 @@ const summaryLines = [
allMeasurementsTable(allRows),
'',
' ',
-]
+)
if (historyRows.length > 0) {
summaryLines.push(
From 2df4450a84b3d2eb999d0263aedc2a95be529f8a Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Tue, 19 May 2026 12:22:44 +0200
Subject: [PATCH 57/81] Test zero-impact measurement table split
---
.../runtime/github-workflow/ci-workflow-helpers.unit.test.ts | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
index 297bc7018..ee06604b4 100644
--- a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
+++ b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
@@ -508,6 +508,11 @@ describe('ci workflow devenv perf helpers', () => {
expect(ciWorkflowSource).toContain(
'| Group | Measurement | Baseline | Current | Raw change | Impact | Meaning | Gate | Evidence |',
)
+ expect(ciWorkflowSource).toContain('const zeroImpactRows = comparableRows.filter(isZeroImpactRow)')
+ expect(ciWorkflowSource).toContain('Zero-impact measurements (')
+ expect(ciWorkflowSource).toContain(
+ 'No non-zero actionable measurement impact detected.',
+ )
expect(ciWorkflowSource).toContain("'- Readiness: ' + readinessLabel")
expect(ciWorkflowSource).toContain('renderPerfChangeSvg')
expect(ciWorkflowSource).toContain('Actionable measurement impact')
From 0c55ca8a13bb19af8cd1dabbb5a09e5d98ab0cd1 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Wed, 20 May 2026 03:15:19 +0200
Subject: [PATCH 58/81] Unblock current CI measurement runs
---
.github/workflows/ci.yml | 4 ++--
genie/ci-workflow/shared.ts | 21 ++++++++++++++-----
.../ci-workflow-helpers.unit.test.ts | 8 +++++++
3 files changed, 26 insertions(+), 7 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 784a805d7..7cae0aaed 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2,8 +2,8 @@
# Source: ci.yml.genie.ts
concurrency:
- group: '${{ github.workflow }}-${{ github.event.pull_request.number || github.event.inputs.measurement_baseline_ref || github.ref }}'
- cancel-in-progress: true
+ group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code') }}"
+ cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
name: CI
diff --git a/genie/ci-workflow/shared.ts b/genie/ci-workflow/shared.ts
index 29ba9deee..135c7d107 100644
--- a/genie/ci-workflow/shared.ts
+++ b/genie/ci-workflow/shared.ts
@@ -49,19 +49,30 @@ export const standardCIEnv = {
} as const
/**
- * Cancel superseded CI workflow runs for the same PR or branch.
+ * Cancel superseded CI workflow runs for the same event and ref.
*
* The group key intentionally does not include the job name so a new push
* cancels the entire older workflow run rather than letting stale sibling jobs
* continue consuming runner capacity.
*
- * Measurement baseline backfills are keyed by their subject ref so several
- * historical refs can be backfilled without canceling each other.
+ * Code validation is a branch-protection signal for the latest PR head. Keeping
+ * older code-triggered pull_request runs alive can wedge the concurrency bucket
+ * behind a stale queued self-hosted job and prevent the current head from
+ * materializing any jobs.
+ *
+ * Measurement baseline backfills are keyed by their subject ref and do not
+ * cancel in-progress runs so several historical refs can be backfilled without
+ * canceling each other.
+ *
+ * Merge-queue label churn is different: only the mq:ci-admitted label event is
+ * allowed to materialize full PR CI. Other label events do not change the
+ * commit under test and must not cancel an already-running validation run.
*/
export const ciWorkflowConcurrency = {
group:
- '${{ github.workflow }}-${{ github.event.pull_request.number || github.event.inputs.measurement_baseline_ref || github.ref }}',
- 'cancel-in-progress': true,
+ "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code') }}",
+ 'cancel-in-progress':
+ "${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}",
} as const
/**
diff --git a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
index ee06604b4..11c4e8e84 100644
--- a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
+++ b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
@@ -505,6 +505,14 @@ describe('ci workflow devenv perf helpers', () => {
expect(ciWorkflowSource).toContain(
'Chart: bars show semantic impact. A value of 0 means the raw change is not actionable for this PR; raw percentage and nominal values stay in the table.',
)
+ expect(generatedCiWorkflowYamlSource).toContain(
+ "github.workflow }}-${{ github.event_name }}-${{ github.ref }}",
+ )
+ expect(generatedCiWorkflowYamlSource).toContain("format('measurement-baseline-{0}'")
+ expect(generatedCiWorkflowYamlSource).toContain("format('label-{0}', github.event.label.name)")
+ expect(generatedCiWorkflowYamlSource).toContain(
+ "inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request'",
+ )
expect(ciWorkflowSource).toContain(
'| Group | Measurement | Baseline | Current | Raw change | Impact | Meaning | Gate | Evidence |',
)
From dc9fada05b8c72e382f064963ff5b1a0ad175d9a Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Wed, 20 May 2026 03:21:40 +0200
Subject: [PATCH 59/81] Support manual measurement PR comment refresh
---
.github/workflows/ci.yml | 8 ++++++++
genie/ci-workflow/measurements.ts | 8 ++++++++
.../github-workflow/ci-workflow-helpers.unit.test.ts | 2 ++
3 files changed, 18 insertions(+)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 7cae0aaed..5800d8af5 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -24,6 +24,11 @@ on:
required: false
default: ''
type: string
+ measurement_pr_number:
+ description: Optional pull request number to update with CI measurement comments during manual measurement runs.
+ required: false
+ default: ''
+ type: string
debug_force_nix_diagnostics_failure:
description: 'Temporary debug switch (#272): force post-validation failure to verify diagnostics artifact + summary'
required: false
@@ -3232,6 +3237,7 @@ jobs:
CI_MEASUREMENT_PR_COMMENT_MAX_ROWS: '8'
CI_MEASUREMENT_PR_COMMENT_MAX_HISTORY: '20'
CI_MEASUREMENT_PR_COMMENT_ASSET_BRANCH: ci-measurement-assets
+ CI_MEASUREMENT_PR_COMMENT_PR_NUMBER: ${{ inputs.measurement_pr_number }}
GH_TOKEN: ${{ github.token }}
run: |
set -euo pipefail
@@ -5443,6 +5449,7 @@ jobs:
CI_MEASUREMENT_PR_COMMENT_MAX_ROWS: '8'
CI_MEASUREMENT_PR_COMMENT_MAX_HISTORY: '20'
CI_MEASUREMENT_PR_COMMENT_ASSET_BRANCH: ci-measurement-assets
+ CI_MEASUREMENT_PR_COMMENT_PR_NUMBER: ${{ inputs.measurement_pr_number }}
GH_TOKEN: ${{ github.token }}
run: |
set -euo pipefail
@@ -7305,6 +7312,7 @@ jobs:
CI_MEASUREMENT_PR_COMMENT_MAX_ROWS: '12'
CI_MEASUREMENT_PR_COMMENT_MAX_HISTORY: '20'
CI_MEASUREMENT_PR_COMMENT_ASSET_BRANCH: ci-measurement-assets
+ CI_MEASUREMENT_PR_COMMENT_PR_NUMBER: ${{ inputs.measurement_pr_number }}
GH_TOKEN: ${{ github.token }}
run: |
set -euo pipefail
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index 62373b745..ba44a9886 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -291,6 +291,13 @@ export const ciMeasurementBaselineWorkflowDispatchInputs = {
default: '',
type: 'string',
},
+ measurement_pr_number: {
+ description:
+ 'Optional pull request number to update with CI measurement comments during manual measurement runs.',
+ required: false,
+ default: '',
+ type: 'string',
+ },
} as const
export const ciMeasurementBaselineBackfillPredicate =
@@ -1849,6 +1856,7 @@ export const compareCiMeasurementsStep = (opts?: CiMeasurementsComparisonStepOpt
CI_MEASUREMENT_PR_COMMENT_MAX_HISTORY: String(opts?.prComment?.maxHistory ?? 20),
CI_MEASUREMENT_PR_COMMENT_ASSET_BRANCH:
opts?.prComment?.assetBranch ?? 'ci-measurement-assets',
+ CI_MEASUREMENT_PR_COMMENT_PR_NUMBER: '${{ inputs.measurement_pr_number }}',
...(opts?.prComment?.publicAssetCommand === undefined
? {}
: { CI_MEASUREMENT_PR_COMMENT_PUBLIC_ASSET_COMMAND: opts.prComment.publicAssetCommand }),
diff --git a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
index 11c4e8e84..3696072b2 100644
--- a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
+++ b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
@@ -490,6 +490,8 @@ describe('ci workflow devenv perf helpers', () => {
expect(ciWorkflowSource).toContain('seedRunIds?: readonly string[]')
expect(ciWorkflowSource).toContain('baselineSeedRuns?: readonly CiMeasurementBaselineSeedRun[]')
expect(ciWorkflowSource).toContain('baselineSeedRunIds?: readonly string[]')
+ expect(ciWorkflowSource).toContain('measurement_pr_number:')
+ expect(ciWorkflowSource).toContain("CI_MEASUREMENT_PR_COMMENT_PR_NUMBER: '${{ inputs.measurement_pr_number }}'")
expect(ciWorkflowSource).toContain('seedRuns: ($seedRuns[0] // [])')
expect(ciWorkflowSource).toContain('baselineProvenance: ($baselineProvenance[0] // null)')
expect(ciWorkflowSource).toContain(
From 0369f9761b2ca8731cd0d0af419fe2a28008bd59 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Wed, 20 May 2026 03:30:05 +0200
Subject: [PATCH 60/81] Trigger CI measurement comment refresh
From b366b857364f41e61f72b7222a7e86c3395cadcc Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Wed, 20 May 2026 08:52:40 +0200
Subject: [PATCH 61/81] Improve CI measurement comment scanability
---
.github/workflows/ci.yml | 612 ++++++++++++++++--
genie/ci-workflow/measurements.ts | 204 +++++-
.../ci-workflow-helpers.unit.test.ts | 10 +-
3 files changed, 762 insertions(+), 64 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 5800d8af5..774f5e96f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -4163,6 +4163,103 @@ jobs:
const nonZeroImpactRows = comparableRows.filter((row) => !isZeroImpactRow(row))
const zeroImpactRows = comparableRows.filter(isZeroImpactRow)
const visibleNonZeroImpactRows = nonZeroImpactRows.slice(0, visibleLimit)
+ const diagnosticRows = allRows.filter((row) => // rows listed in the "Diagnostic / ungated measurements" section
+ row.status === 'missing_baseline' ||
+ row.confidence === 'diagnostic' ||
+ row.gateReason === 'disabled' ||
+ row.semanticImpactKind === 'diagnostic' ||
+ (!row.gateable && typeof row.baseline !== 'number') // ungated row without a numeric baseline
+ )
+
+ const baselineToCurrent = (row) => { // formats a row as "<baseline> -> <current>"
+ const unit = row.observation?.unit // may be undefined; passed to formatValue unchanged
+ return formatValue(row.baseline, unit) + ' -> ' + formatValue(row.current, unit)
+ }
+
+ const rawChange = (row) => { // formats a row as "<delta> / <ratio>"
+ const unit = row.observation?.unit // may be undefined; passed to formatDelta unchanged
+ return formatDelta(row.delta, unit) + ' / ' + formatRatio(row.ratio)
+ }
+
+ const confidenceSummary = (row) => { // one-line evidence description for a row
+ const unit = row.observation?.unit
+ if (row.comparisonMode === 'paired' && typeof row.evidenceDeltaLower === 'number' && typeof row.evidenceDeltaUpper === 'number') { // paired rows with numeric evidence bounds report the quantile band
+ const quantile = typeof row.pairedEvidenceQuantile === 'number'
+ ? Math.round(row.pairedEvidenceQuantile * 100)
+ : 25 // no recorded quantile: the label becomes "25-75% delta"
+ return 'paired n=' + (row.pairedSamples ?? 0)
+ + ', ' + quantile + '-' + (100 - quantile) + '% delta '
+ + formatValue(row.evidenceDeltaLower, unit)
+ + '..' + formatValue(row.evidenceDeltaUpper, unit)
+ }
+ return (row.confidence || 'unknown') + ', baseline n=' + (row.baselineSources ?? 0) + ', current n=' + (row.currentSamples ?? 1) // fallback: confidence label plus sample counts (defaults 0 and 1)
+ }
+
+ const scanDecision = (row) => { // label for the "What changed?" column; status is checked before direction
+ if (row.status === 'fail') return 'regression blocks'
+ if (row.status === 'warn') return 'regression review'
+ if (row.status === 'missing_baseline') return 'needs baseline'
+ if (row.direction === 'improved') return 'faster'
+ if (row.direction === 'regressed') return 'no material impact' // regressed direction, but status is not fail, warn, or missing_baseline
+ return 'unchanged'
+ }
+
+ const scanTable = (rows) => { // six-column markdown table for the given rows
+ if (rows.length === 0) return 'No non-zero actionable measurement impact detected.' // empty input yields a sentence instead of a header-only table
+ return [
+ '| What changed? | Probe | Baseline -> current | Raw change | Impact | Confidence |',
+ '| --- | --- | --- | ---: | ---: | --- |',
+ ...rows.map((row) => {
+ return '| ' + [
+ scanDecision(row),
+ humanProbe(row),
+ baselineToCurrent(row),
+ rawChange(row),
+ formatRowImpact(row),
+ confidenceSummary(row),
+ ].map(escapeCell).join(' | ') + ' |' // every cell goes through escapeCell before the row is joined
+ }),
+ ].join('\n')
+ }
+
+ const zeroImpactTable = (rows) => { // seven-column markdown table for the given rows
+ if (rows.length === 0) return 'No zero-impact measurements.' // empty input yields a sentence instead of a header-only table
+ return [
+ '| Probe | Baseline -> current | Raw change | Impact | Gate | Evidence | Why hidden |',
+ '| --- | --- | ---: | ---: | --- | --- | --- |',
+ ...rows.map((row) => {
+ const meaning = interpretation(row) // only its label is used, in the "Why hidden" column
+ return '| ' + [
+ humanProbe(row),
+ baselineToCurrent(row),
+ rawChange(row),
+ formatRowImpact(row),
+ row.gateable ? 'yes' : (row.gateReason || 'no'), // Gate column: yes, else the gate reason, else no
+ confidenceSummary(row),
+ meaning.label,
+ ].map(escapeCell).join(' | ') + ' |' // every cell goes through escapeCell before the row is joined
+ }),
+ ].join('\n')
+ }
+
+ const diagnosticTable = (rows) => { // seven-column markdown table for the given rows
+ if (rows.length === 0) return 'No diagnostic or ungated measurements.' // empty input yields a sentence instead of a header-only table
+ return [
+ '| Probe | Current | Baseline | Impact | Gate | Reason | Evidence |',
+ '| --- | ---: | ---: | ---: | --- | --- | --- |',
+ ...rows.map((row) => {
+ return '| ' + [
+ humanProbe(row),
+ formatValue(row.current, row.observation?.unit),
+ formatValue(row.baseline, row.observation?.unit),
+ formatRowImpact(row),
+ row.gateable ? 'yes' : (row.gateReason || row.status || 'no'), // Gate column: yes, else gate reason, else status, else no
+ interpretation(row).label,
+ confidenceSummary(row),
+ ].map(escapeCell).join(' | ') + ' |' // every cell goes through escapeCell before the row is joined
+ }),
+ ].join('\n')
+ }
const comparisonTable = (rows) => {
if (rows.length === 0) return 'No measurement regressions detected.'
@@ -4226,6 +4323,32 @@ jobs:
].join('\n')
}
+ const sourceMeasurement = (row) => ({ // flattens a comparison row into one plain record
+ id: row.observation?.dimensions?.probe || row.observation?.name || humanProbe(row), // first truthy of: probe dimension, observation name, human label
+ label: humanProbe(row),
+ group: semanticPath(row),
+ status: row.status,
+ direction: row.direction,
+ gateable: row.gateable,
+ gateReason: row.gateReason,
+ confidence: row.confidence,
+ comparisonMode: row.comparisonMode,
+ unit: row.observation?.unit,
+ baseline: row.baseline ?? null, // from here on, null/undefined values are written as explicit null
+ current: row.current ?? null,
+ delta: row.delta ?? null,
+ ratio: row.ratio ?? null,
+ semanticImpactScore: row.semanticImpactScore ?? null,
+ semanticImpactKind: row.semanticImpactKind ?? null,
+ baselineSources: row.baselineSources ?? null,
+ currentSamples: row.currentSamples ?? null,
+ pairedSamples: row.pairedSamples ?? null,
+ evidenceDeltaLower: row.evidenceDeltaLower ?? null,
+ evidenceDeltaUpper: row.evidenceDeltaUpper ?? null,
+ pairedEvidenceQuantile: row.pairedEvidenceQuantile ?? null,
+ dimensions: row.observation?.dimensions || {}, // missing dimensions become an empty object
+ })
+
const truncate = (value, maxLength) => {
const text = String(value)
if (text.length <= maxLength) return text
@@ -4402,6 +4525,32 @@ jobs:
? '[main run ' + baselineProvenance.runId + '](' + serverUrl + '/' + repo + '/actions/runs/' + baselineProvenance.runId + ')' +
(Array.isArray(baselineProvenance.runs) && baselineProvenance.runs.length > 1 ? ' + ' + (baselineProvenance.runs.length - 1) + ' older baseline runs' : '')
: 'not available'
+ const sourceOfTruth = { // structured summary of this comparison: header fields, chart links, and one record per row
+ schemaVersion,
+ title,
+ status: statusWord,
+ gate: gateModeLabel(comparison.mode),
+ readiness: readinessLabel,
+ commit: {
+ shortSha,
+ sha: headSha || sha || 'unknown', // falls back to sha, then to the literal 'unknown'
+ },
+ run: {
+ id: runId || null, // falsy run fields are written as null
+ attempt: runAttempt || null,
+ url: runUrl || null,
+ },
+ baseline: baselineProvenance || null,
+ protocol: protocolLabel,
+ chart: {
+ meaning: 'semantic-impact',
+ zeroImpactMeaning: 'no actionable PR impact after budgets, noise floor, and robust evidence checks',
+ svg: chartSourceUrl || null, // falsy chart URLs are written as null
+ lightPng: chartUrl || null,
+ darkPng: chartDarkUrl || null,
+ },
+ measurements: allRows.map(sourceMeasurement), // every row, not only the visible ones
+ }
const chartSvg = hasComparableBaseline ? renderPerfChangeSvg(visibleRows.length > 0 ? visibleRows : allRows) : ''
const chartDarkSvg = hasComparableBaseline ? renderPerfChangeSvg(visibleRows.length > 0 ? visibleRows : allRows, 'dark') : ''
if (chartPath && chartSvg) writeFileSync(chartPath, chartSvg)
@@ -4420,27 +4569,28 @@ jobs:
(chartSourceUrl ? '\n\n[SVG source](' + chartSourceUrl + ')' : '')
: ''
+ const regressionCount = allRows.filter((row) => row.status === 'fail' || row.status === 'warn').length
+ const improvementCount = comparableRows.filter((row) => row.direction === 'improved' && !isZeroImpactRow(row)).length
+ const neutralCount = zeroImpactRows.length + diagnosticRows.length
+ const humanSummary = hasComparableBaseline
+ ? regressionCount > 0
+ ? String(regressionCount) + ' regression' + (regressionCount === 1 ? '' : 's') + ' need review.'
+ : improvementCount > 0
+ ? 'No regressions. ' + String(improvementCount) + ' probe' + (improvementCount === 1 ? '' : 's') + ' got faster; ' + String(neutralCount) + ' neutral or ungated row' + (neutralCount === 1 ? '' : 's') + ' are collapsed below.'
+ : 'No regressions. Comparable movement is below the semantic impact threshold; neutral rows are collapsed below.'
+ : 'No compatible baseline was available, so this run shows current measurements only.'
+
const summaryLines = [
'## ' + title,
'',
- '- Status: ' + statusWord,
- '- Gate: ' + gateModeLabel(comparison.mode),
- '- Commit: ' + shortSha,
- '- Run: ' + runLink,
- '- Baseline: ' + baselineLabel,
- '- Readiness: ' + readinessLabel,
- '- Protocol: ' + protocolLabel,
+ '**' + statusWord + '** - ' + gateModeLabel(comparison.mode) + ' gate - readiness ' + readinessLabel + ' - commit ' + shortSha + ' - protocol ' + protocolLabel + '',
'',
- hasComparableBaseline
- ? 'Chart: bars show semantic impact. A value of 0 means the raw change is not actionable for this PR; raw percentage and nominal values stay in the table.'
- : 'No compatible baseline was available, so this run shows current measurements only.',
+ '> ' + humanSummary,
'',
chartMarkdown,
'',
hasComparableBaseline
- ? (visibleNonZeroImpactRows.length > 0
- ? comparisonTable(visibleNonZeroImpactRows)
- : 'No non-zero actionable measurement impact detected.')
+ ? scanTable(visibleNonZeroImpactRows)
: currentOnlyTable(visibleRows),
]
@@ -4448,11 +4598,23 @@ jobs:
summaryLines.push(
'',
'<details>',
- '<summary>Zero-impact measurements (' + zeroImpactRows.length + ')</summary>',
+ '<summary>Unchanged / 0-impact measurements (' + zeroImpactRows.length + ')</summary>',
'',
'These rows had compatible baseline data, but their semantic impact rounded to 0.00x because the movement was below the configured budget, below the noise floor, or inside the robust noise band.',
'',
- comparisonTable(zeroImpactRows),
+ zeroImpactTable(zeroImpactRows),
+ '',
+ '</details>',
+ )
+ }
+
+ if (diagnosticRows.length > 0) {
+ summaryLines.push(
+ '',
+ '<details>',
+ '<summary>Diagnostic / ungated measurements (' + diagnosticRows.length + ')</summary>',
+ '',
+ diagnosticTable(diagnosticRows),
'',
'</details>',
)
@@ -4482,6 +4644,18 @@ jobs:
)
}
+ summaryLines.push(
+ '',
+ '<details>',
+ '<summary>Source-of-truth JSON</summary>',
+ '',
+ '~~~json',
+ JSON.stringify(sourceOfTruth, null, 2),
+ '~~~',
+ '',
+ '</details>',
+ )
+
summaryLines.push('', marker, statePrefix + JSON.stringify(state, null, 2) + stateSuffix)
writeFileSync(bodyPath, summaryLines.join('\n') + '\n')
writeFileSync(commentIdPath, existing?.id ? String(existing.id) : '')
@@ -6375,6 +6549,103 @@ jobs:
const nonZeroImpactRows = comparableRows.filter((row) => !isZeroImpactRow(row))
const zeroImpactRows = comparableRows.filter(isZeroImpactRow)
const visibleNonZeroImpactRows = nonZeroImpactRows.slice(0, visibleLimit)
+ const diagnosticRows = allRows.filter((row) => // rows listed in the "Diagnostic / ungated measurements" section
+ row.status === 'missing_baseline' ||
+ row.confidence === 'diagnostic' ||
+ row.gateReason === 'disabled' ||
+ row.semanticImpactKind === 'diagnostic' ||
+ (!row.gateable && typeof row.baseline !== 'number') // ungated row without a numeric baseline
+ )
+
+ const baselineToCurrent = (row) => { // formats a row as "<baseline> -> <current>"
+ const unit = row.observation?.unit // may be undefined; passed to formatValue unchanged
+ return formatValue(row.baseline, unit) + ' -> ' + formatValue(row.current, unit)
+ }
+
+ const rawChange = (row) => { // formats a row as "<delta> / <ratio>"
+ const unit = row.observation?.unit // may be undefined; passed to formatDelta unchanged
+ return formatDelta(row.delta, unit) + ' / ' + formatRatio(row.ratio)
+ }
+
+ const confidenceSummary = (row) => { // one-line evidence description for a row
+ const unit = row.observation?.unit
+ if (row.comparisonMode === 'paired' && typeof row.evidenceDeltaLower === 'number' && typeof row.evidenceDeltaUpper === 'number') { // paired rows with numeric evidence bounds report the quantile band
+ const quantile = typeof row.pairedEvidenceQuantile === 'number'
+ ? Math.round(row.pairedEvidenceQuantile * 100)
+ : 25 // no recorded quantile: the label becomes "25-75% delta"
+ return 'paired n=' + (row.pairedSamples ?? 0)
+ + ', ' + quantile + '-' + (100 - quantile) + '% delta '
+ + formatValue(row.evidenceDeltaLower, unit)
+ + '..' + formatValue(row.evidenceDeltaUpper, unit)
+ }
+ return (row.confidence || 'unknown') + ', baseline n=' + (row.baselineSources ?? 0) + ', current n=' + (row.currentSamples ?? 1) // fallback: confidence label plus sample counts (defaults 0 and 1)
+ }
+
+ const scanDecision = (row) => { // label for the "What changed?" column; status is checked before direction
+ if (row.status === 'fail') return 'regression blocks'
+ if (row.status === 'warn') return 'regression review'
+ if (row.status === 'missing_baseline') return 'needs baseline'
+ if (row.direction === 'improved') return 'faster'
+ if (row.direction === 'regressed') return 'no material impact' // regressed direction, but status is not fail, warn, or missing_baseline
+ return 'unchanged'
+ }
+
+ const scanTable = (rows) => { // six-column markdown table for the given rows
+ if (rows.length === 0) return 'No non-zero actionable measurement impact detected.' // empty input yields a sentence instead of a header-only table
+ return [
+ '| What changed? | Probe | Baseline -> current | Raw change | Impact | Confidence |',
+ '| --- | --- | --- | ---: | ---: | --- |',
+ ...rows.map((row) => {
+ return '| ' + [
+ scanDecision(row),
+ humanProbe(row),
+ baselineToCurrent(row),
+ rawChange(row),
+ formatRowImpact(row),
+ confidenceSummary(row),
+ ].map(escapeCell).join(' | ') + ' |' // every cell goes through escapeCell before the row is joined
+ }),
+ ].join('\n')
+ }
+
+ const zeroImpactTable = (rows) => { // seven-column markdown table for the given rows
+ if (rows.length === 0) return 'No zero-impact measurements.' // empty input yields a sentence instead of a header-only table
+ return [
+ '| Probe | Baseline -> current | Raw change | Impact | Gate | Evidence | Why hidden |',
+ '| --- | --- | ---: | ---: | --- | --- | --- |',
+ ...rows.map((row) => {
+ const meaning = interpretation(row) // only its label is used, in the "Why hidden" column
+ return '| ' + [
+ humanProbe(row),
+ baselineToCurrent(row),
+ rawChange(row),
+ formatRowImpact(row),
+ row.gateable ? 'yes' : (row.gateReason || 'no'), // Gate column: yes, else the gate reason, else no
+ confidenceSummary(row),
+ meaning.label,
+ ].map(escapeCell).join(' | ') + ' |' // every cell goes through escapeCell before the row is joined
+ }),
+ ].join('\n')
+ }
+
+ const diagnosticTable = (rows) => { // seven-column markdown table for the given rows
+ if (rows.length === 0) return 'No diagnostic or ungated measurements.' // empty input yields a sentence instead of a header-only table
+ return [
+ '| Probe | Current | Baseline | Impact | Gate | Reason | Evidence |',
+ '| --- | ---: | ---: | ---: | --- | --- | --- |',
+ ...rows.map((row) => {
+ return '| ' + [
+ humanProbe(row),
+ formatValue(row.current, row.observation?.unit),
+ formatValue(row.baseline, row.observation?.unit),
+ formatRowImpact(row),
+ row.gateable ? 'yes' : (row.gateReason || row.status || 'no'), // Gate column: yes, else gate reason, else status, else no
+ interpretation(row).label,
+ confidenceSummary(row),
+ ].map(escapeCell).join(' | ') + ' |' // every cell goes through escapeCell before the row is joined
+ }),
+ ].join('\n')
+ }
const comparisonTable = (rows) => {
if (rows.length === 0) return 'No measurement regressions detected.'
@@ -6438,6 +6709,32 @@ jobs:
].join('\n')
}
+ const sourceMeasurement = (row) => ({ // flattens a comparison row into one plain record
+ id: row.observation?.dimensions?.probe || row.observation?.name || humanProbe(row), // first truthy of: probe dimension, observation name, human label
+ label: humanProbe(row),
+ group: semanticPath(row),
+ status: row.status,
+ direction: row.direction,
+ gateable: row.gateable,
+ gateReason: row.gateReason,
+ confidence: row.confidence,
+ comparisonMode: row.comparisonMode,
+ unit: row.observation?.unit,
+ baseline: row.baseline ?? null, // from here on, null/undefined values are written as explicit null
+ current: row.current ?? null,
+ delta: row.delta ?? null,
+ ratio: row.ratio ?? null,
+ semanticImpactScore: row.semanticImpactScore ?? null,
+ semanticImpactKind: row.semanticImpactKind ?? null,
+ baselineSources: row.baselineSources ?? null,
+ currentSamples: row.currentSamples ?? null,
+ pairedSamples: row.pairedSamples ?? null,
+ evidenceDeltaLower: row.evidenceDeltaLower ?? null,
+ evidenceDeltaUpper: row.evidenceDeltaUpper ?? null,
+ pairedEvidenceQuantile: row.pairedEvidenceQuantile ?? null,
+ dimensions: row.observation?.dimensions || {}, // missing dimensions become an empty object
+ })
+
const truncate = (value, maxLength) => {
const text = String(value)
if (text.length <= maxLength) return text
@@ -6614,6 +6911,32 @@ jobs:
? '[main run ' + baselineProvenance.runId + '](' + serverUrl + '/' + repo + '/actions/runs/' + baselineProvenance.runId + ')' +
(Array.isArray(baselineProvenance.runs) && baselineProvenance.runs.length > 1 ? ' + ' + (baselineProvenance.runs.length - 1) + ' older baseline runs' : '')
: 'not available'
+ const sourceOfTruth = { // structured summary of this comparison: header fields, chart links, and one record per row
+ schemaVersion,
+ title,
+ status: statusWord,
+ gate: gateModeLabel(comparison.mode),
+ readiness: readinessLabel,
+ commit: {
+ shortSha,
+ sha: headSha || sha || 'unknown', // falls back to sha, then to the literal 'unknown'
+ },
+ run: {
+ id: runId || null, // falsy run fields are written as null
+ attempt: runAttempt || null,
+ url: runUrl || null,
+ },
+ baseline: baselineProvenance || null,
+ protocol: protocolLabel,
+ chart: {
+ meaning: 'semantic-impact',
+ zeroImpactMeaning: 'no actionable PR impact after budgets, noise floor, and robust evidence checks',
+ svg: chartSourceUrl || null, // falsy chart URLs are written as null
+ lightPng: chartUrl || null,
+ darkPng: chartDarkUrl || null,
+ },
+ measurements: allRows.map(sourceMeasurement), // every row, not only the visible ones
+ }
const chartSvg = hasComparableBaseline ? renderPerfChangeSvg(visibleRows.length > 0 ? visibleRows : allRows) : ''
const chartDarkSvg = hasComparableBaseline ? renderPerfChangeSvg(visibleRows.length > 0 ? visibleRows : allRows, 'dark') : ''
if (chartPath && chartSvg) writeFileSync(chartPath, chartSvg)
@@ -6632,27 +6955,28 @@ jobs:
(chartSourceUrl ? '\n\n[SVG source](' + chartSourceUrl + ')' : '')
: ''
+ const regressionCount = allRows.filter((row) => row.status === 'fail' || row.status === 'warn').length
+ const improvementCount = comparableRows.filter((row) => row.direction === 'improved' && !isZeroImpactRow(row)).length
+ const neutralCount = zeroImpactRows.length + diagnosticRows.length
+ const humanSummary = hasComparableBaseline
+ ? regressionCount > 0
+ ? String(regressionCount) + ' regression' + (regressionCount === 1 ? '' : 's') + ' need review.'
+ : improvementCount > 0
+ ? 'No regressions. ' + String(improvementCount) + ' probe' + (improvementCount === 1 ? '' : 's') + ' got faster; ' + String(neutralCount) + ' neutral or ungated row' + (neutralCount === 1 ? '' : 's') + ' are collapsed below.'
+ : 'No regressions. Comparable movement is below the semantic impact threshold; neutral rows are collapsed below.'
+ : 'No compatible baseline was available, so this run shows current measurements only.'
+
const summaryLines = [
'## ' + title,
'',
- '- Status: ' + statusWord,
- '- Gate: ' + gateModeLabel(comparison.mode),
- '- Commit: ' + shortSha,
- '- Run: ' + runLink,
- '- Baseline: ' + baselineLabel,
- '- Readiness: ' + readinessLabel,
- '- Protocol: ' + protocolLabel,
+ '**' + statusWord + '** - ' + gateModeLabel(comparison.mode) + ' gate - readiness ' + readinessLabel + ' - commit ' + shortSha + ' - protocol ' + protocolLabel + '',
'',
- hasComparableBaseline
- ? 'Chart: bars show semantic impact. A value of 0 means the raw change is not actionable for this PR; raw percentage and nominal values stay in the table.'
- : 'No compatible baseline was available, so this run shows current measurements only.',
+ '> ' + humanSummary,
'',
chartMarkdown,
'',
hasComparableBaseline
- ? (visibleNonZeroImpactRows.length > 0
- ? comparisonTable(visibleNonZeroImpactRows)
- : 'No non-zero actionable measurement impact detected.')
+ ? scanTable(visibleNonZeroImpactRows)
: currentOnlyTable(visibleRows),
]
@@ -6660,11 +6984,23 @@ jobs:
summaryLines.push(
'',
'<details>',
- '<summary>Zero-impact measurements (' + zeroImpactRows.length + ')</summary>',
+ '<summary>Unchanged / 0-impact measurements (' + zeroImpactRows.length + ')</summary>',
'',
'These rows had compatible baseline data, but their semantic impact rounded to 0.00x because the movement was below the configured budget, below the noise floor, or inside the robust noise band.',
'',
- comparisonTable(zeroImpactRows),
+ zeroImpactTable(zeroImpactRows),
+ '',
+ '</details>',
+ )
+ }
+
+ if (diagnosticRows.length > 0) {
+ summaryLines.push(
+ '',
+ '<details>',
+ '<summary>Diagnostic / ungated measurements (' + diagnosticRows.length + ')</summary>',
+ '',
+ diagnosticTable(diagnosticRows),
'',
'</details>',
)
@@ -6694,6 +7030,18 @@ jobs:
)
}
+ summaryLines.push(
+ '',
+ '<details>',
+ '<summary>Source-of-truth JSON</summary>',
+ '',
+ '~~~json',
+ JSON.stringify(sourceOfTruth, null, 2),
+ '~~~',
+ '',
+ '</details>',
+ )
+
summaryLines.push('', marker, statePrefix + JSON.stringify(state, null, 2) + stateSuffix)
writeFileSync(bodyPath, summaryLines.join('\n') + '\n')
writeFileSync(commentIdPath, existing?.id ? String(existing.id) : '')
@@ -8238,6 +8586,103 @@ jobs:
const nonZeroImpactRows = comparableRows.filter((row) => !isZeroImpactRow(row))
const zeroImpactRows = comparableRows.filter(isZeroImpactRow)
const visibleNonZeroImpactRows = nonZeroImpactRows.slice(0, visibleLimit)
+ const diagnosticRows = allRows.filter((row) => // rows listed in the "Diagnostic / ungated measurements" section
+ row.status === 'missing_baseline' ||
+ row.confidence === 'diagnostic' ||
+ row.gateReason === 'disabled' ||
+ row.semanticImpactKind === 'diagnostic' ||
+ (!row.gateable && typeof row.baseline !== 'number') // ungated row without a numeric baseline
+ )
+
+ const baselineToCurrent = (row) => { // formats a row as "<baseline> -> <current>"
+ const unit = row.observation?.unit // may be undefined; passed to formatValue unchanged
+ return formatValue(row.baseline, unit) + ' -> ' + formatValue(row.current, unit)
+ }
+
+ const rawChange = (row) => { // formats a row as "<delta> / <ratio>"
+ const unit = row.observation?.unit // may be undefined; passed to formatDelta unchanged
+ return formatDelta(row.delta, unit) + ' / ' + formatRatio(row.ratio)
+ }
+
+ const confidenceSummary = (row) => { // one-line evidence description for a row
+ const unit = row.observation?.unit
+ if (row.comparisonMode === 'paired' && typeof row.evidenceDeltaLower === 'number' && typeof row.evidenceDeltaUpper === 'number') { // paired rows with numeric evidence bounds report the quantile band
+ const quantile = typeof row.pairedEvidenceQuantile === 'number'
+ ? Math.round(row.pairedEvidenceQuantile * 100)
+ : 25 // no recorded quantile: the label becomes "25-75% delta"
+ return 'paired n=' + (row.pairedSamples ?? 0)
+ + ', ' + quantile + '-' + (100 - quantile) + '% delta '
+ + formatValue(row.evidenceDeltaLower, unit)
+ + '..' + formatValue(row.evidenceDeltaUpper, unit)
+ }
+ return (row.confidence || 'unknown') + ', baseline n=' + (row.baselineSources ?? 0) + ', current n=' + (row.currentSamples ?? 1) // fallback: confidence label plus sample counts (defaults 0 and 1)
+ }
+
+ const scanDecision = (row) => { // label for the "What changed?" column; status is checked before direction
+ if (row.status === 'fail') return 'regression blocks'
+ if (row.status === 'warn') return 'regression review'
+ if (row.status === 'missing_baseline') return 'needs baseline'
+ if (row.direction === 'improved') return 'faster'
+ if (row.direction === 'regressed') return 'no material impact' // regressed direction, but status is not fail, warn, or missing_baseline
+ return 'unchanged'
+ }
+
+ const scanTable = (rows) => { // six-column markdown table for the given rows
+ if (rows.length === 0) return 'No non-zero actionable measurement impact detected.' // empty input yields a sentence instead of a header-only table
+ return [
+ '| What changed? | Probe | Baseline -> current | Raw change | Impact | Confidence |',
+ '| --- | --- | --- | ---: | ---: | --- |',
+ ...rows.map((row) => {
+ return '| ' + [
+ scanDecision(row),
+ humanProbe(row),
+ baselineToCurrent(row),
+ rawChange(row),
+ formatRowImpact(row),
+ confidenceSummary(row),
+ ].map(escapeCell).join(' | ') + ' |' // every cell goes through escapeCell before the row is joined
+ }),
+ ].join('\n')
+ }
+
+ const zeroImpactTable = (rows) => { // seven-column markdown table for the given rows
+ if (rows.length === 0) return 'No zero-impact measurements.' // empty input yields a sentence instead of a header-only table
+ return [
+ '| Probe | Baseline -> current | Raw change | Impact | Gate | Evidence | Why hidden |',
+ '| --- | --- | ---: | ---: | --- | --- | --- |',
+ ...rows.map((row) => {
+ const meaning = interpretation(row) // only its label is used, in the "Why hidden" column
+ return '| ' + [
+ humanProbe(row),
+ baselineToCurrent(row),
+ rawChange(row),
+ formatRowImpact(row),
+ row.gateable ? 'yes' : (row.gateReason || 'no'), // Gate column: yes, else the gate reason, else no
+ confidenceSummary(row),
+ meaning.label,
+ ].map(escapeCell).join(' | ') + ' |' // every cell goes through escapeCell before the row is joined
+ }),
+ ].join('\n')
+ }
+
+ const diagnosticTable = (rows) => { // seven-column markdown table for the given rows
+ if (rows.length === 0) return 'No diagnostic or ungated measurements.' // empty input yields a sentence instead of a header-only table
+ return [
+ '| Probe | Current | Baseline | Impact | Gate | Reason | Evidence |',
+ '| --- | ---: | ---: | ---: | --- | --- | --- |',
+ ...rows.map((row) => {
+ return '| ' + [
+ humanProbe(row),
+ formatValue(row.current, row.observation?.unit),
+ formatValue(row.baseline, row.observation?.unit),
+ formatRowImpact(row),
+ row.gateable ? 'yes' : (row.gateReason || row.status || 'no'), // Gate column: yes, else gate reason, else status, else no
+ interpretation(row).label,
+ confidenceSummary(row),
+ ].map(escapeCell).join(' | ') + ' |' // every cell goes through escapeCell before the row is joined
+ }),
+ ].join('\n')
+ }
const comparisonTable = (rows) => {
if (rows.length === 0) return 'No measurement regressions detected.'
@@ -8301,6 +8746,32 @@ jobs:
].join('\n')
}
+ const sourceMeasurement = (row) => ({ // flattens a comparison row into one plain record
+ id: row.observation?.dimensions?.probe || row.observation?.name || humanProbe(row), // first truthy of: probe dimension, observation name, human label
+ label: humanProbe(row),
+ group: semanticPath(row),
+ status: row.status,
+ direction: row.direction,
+ gateable: row.gateable,
+ gateReason: row.gateReason,
+ confidence: row.confidence,
+ comparisonMode: row.comparisonMode,
+ unit: row.observation?.unit,
+ baseline: row.baseline ?? null, // from here on, null/undefined values are written as explicit null
+ current: row.current ?? null,
+ delta: row.delta ?? null,
+ ratio: row.ratio ?? null,
+ semanticImpactScore: row.semanticImpactScore ?? null,
+ semanticImpactKind: row.semanticImpactKind ?? null,
+ baselineSources: row.baselineSources ?? null,
+ currentSamples: row.currentSamples ?? null,
+ pairedSamples: row.pairedSamples ?? null,
+ evidenceDeltaLower: row.evidenceDeltaLower ?? null,
+ evidenceDeltaUpper: row.evidenceDeltaUpper ?? null,
+ pairedEvidenceQuantile: row.pairedEvidenceQuantile ?? null,
+ dimensions: row.observation?.dimensions || {}, // missing dimensions become an empty object
+ })
+
const truncate = (value, maxLength) => {
const text = String(value)
if (text.length <= maxLength) return text
@@ -8477,6 +8948,32 @@ jobs:
? '[main run ' + baselineProvenance.runId + '](' + serverUrl + '/' + repo + '/actions/runs/' + baselineProvenance.runId + ')' +
(Array.isArray(baselineProvenance.runs) && baselineProvenance.runs.length > 1 ? ' + ' + (baselineProvenance.runs.length - 1) + ' older baseline runs' : '')
: 'not available'
+ const sourceOfTruth = { // structured summary of this comparison: header fields, chart links, and one record per row
+ schemaVersion,
+ title,
+ status: statusWord,
+ gate: gateModeLabel(comparison.mode),
+ readiness: readinessLabel,
+ commit: {
+ shortSha,
+ sha: headSha || sha || 'unknown', // falls back to sha, then to the literal 'unknown'
+ },
+ run: {
+ id: runId || null, // falsy run fields are written as null
+ attempt: runAttempt || null,
+ url: runUrl || null,
+ },
+ baseline: baselineProvenance || null,
+ protocol: protocolLabel,
+ chart: {
+ meaning: 'semantic-impact',
+ zeroImpactMeaning: 'no actionable PR impact after budgets, noise floor, and robust evidence checks',
+ svg: chartSourceUrl || null, // falsy chart URLs are written as null
+ lightPng: chartUrl || null,
+ darkPng: chartDarkUrl || null,
+ },
+ measurements: allRows.map(sourceMeasurement), // every row, not only the visible ones
+ }
const chartSvg = hasComparableBaseline ? renderPerfChangeSvg(visibleRows.length > 0 ? visibleRows : allRows) : ''
const chartDarkSvg = hasComparableBaseline ? renderPerfChangeSvg(visibleRows.length > 0 ? visibleRows : allRows, 'dark') : ''
if (chartPath && chartSvg) writeFileSync(chartPath, chartSvg)
@@ -8495,27 +8992,28 @@ jobs:
(chartSourceUrl ? '\n\n[SVG source](' + chartSourceUrl + ')' : '')
: ''
+ const regressionCount = allRows.filter((row) => row.status === 'fail' || row.status === 'warn').length
+ const improvementCount = comparableRows.filter((row) => row.direction === 'improved' && !isZeroImpactRow(row)).length
+ const neutralCount = zeroImpactRows.length + diagnosticRows.length
+ const humanSummary = hasComparableBaseline
+ ? regressionCount > 0
+ ? String(regressionCount) + ' regression' + (regressionCount === 1 ? '' : 's') + ' need review.'
+ : improvementCount > 0
+ ? 'No regressions. ' + String(improvementCount) + ' probe' + (improvementCount === 1 ? '' : 's') + ' got faster; ' + String(neutralCount) + ' neutral or ungated row' + (neutralCount === 1 ? '' : 's') + ' are collapsed below.'
+ : 'No regressions. Comparable movement is below the semantic impact threshold; neutral rows are collapsed below.'
+ : 'No compatible baseline was available, so this run shows current measurements only.'
+
const summaryLines = [
'## ' + title,
'',
- '- Status: ' + statusWord,
- '- Gate: ' + gateModeLabel(comparison.mode),
- '- Commit: ' + shortSha,
- '- Run: ' + runLink,
- '- Baseline: ' + baselineLabel,
- '- Readiness: ' + readinessLabel,
- '- Protocol: ' + protocolLabel,
+ '**' + statusWord + '** - ' + gateModeLabel(comparison.mode) + ' gate - readiness ' + readinessLabel + ' - commit ' + shortSha + ' - protocol ' + protocolLabel + '',
'',
- hasComparableBaseline
- ? 'Chart: bars show semantic impact. A value of 0 means the raw change is not actionable for this PR; raw percentage and nominal values stay in the table.'
- : 'No compatible baseline was available, so this run shows current measurements only.',
+ '> ' + humanSummary,
'',
chartMarkdown,
'',
hasComparableBaseline
- ? (visibleNonZeroImpactRows.length > 0
- ? comparisonTable(visibleNonZeroImpactRows)
- : 'No non-zero actionable measurement impact detected.')
+ ? scanTable(visibleNonZeroImpactRows)
: currentOnlyTable(visibleRows),
]
@@ -8523,11 +9021,23 @@ jobs:
summaryLines.push(
'',
'',
- 'Zero-impact measurements (' + zeroImpactRows.length + ')
',
+ 'Unchanged / 0-impact measurements (' + zeroImpactRows.length + ')
',
'',
'These rows had compatible baseline data, but their semantic impact rounded to 0.00x because the movement was below the configured budget, below the noise floor, or inside the robust noise band.',
'',
- comparisonTable(zeroImpactRows),
+ zeroImpactTable(zeroImpactRows),
+ '',
+ ' ',
+ )
+ }
+
+ if (diagnosticRows.length > 0) {
+ summaryLines.push(
+ '',
+ '',
+ 'Diagnostic / ungated measurements (' + diagnosticRows.length + ')
',
+ '',
+ diagnosticTable(diagnosticRows),
'',
' ',
)
@@ -8557,6 +9067,18 @@ jobs:
)
}
+ summaryLines.push(
+ '',
+ '',
+ 'Source-of-truth JSON
',
+ '',
+ '~~~json',
+ JSON.stringify(sourceOfTruth, null, 2),
+ '~~~',
+ '',
+ ' ',
+ )
+
summaryLines.push('', marker, statePrefix + JSON.stringify(state, null, 2) + stateSuffix)
writeFileSync(bodyPath, summaryLines.join('\n') + '\n')
writeFileSync(commentIdPath, existing?.id ? String(existing.id) : '')
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index ba44a9886..35f3d6eef 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -2788,6 +2788,103 @@ const isZeroImpactRow = (row) =>
const nonZeroImpactRows = comparableRows.filter((row) => !isZeroImpactRow(row))
const zeroImpactRows = comparableRows.filter(isZeroImpactRow)
const visibleNonZeroImpactRows = nonZeroImpactRows.slice(0, visibleLimit)
+const diagnosticRows = allRows.filter((row) =>
+ row.status === 'missing_baseline' ||
+ row.confidence === 'diagnostic' ||
+ row.gateReason === 'disabled' ||
+ row.semanticImpactKind === 'diagnostic' ||
+ (!row.gateable && typeof row.baseline !== 'number')
+)
+
+const baselineToCurrent = (row) => {
+ const unit = row.observation?.unit
+ return formatValue(row.baseline, unit) + ' -> ' + formatValue(row.current, unit)
+}
+
+const rawChange = (row) => {
+ const unit = row.observation?.unit
+ return formatDelta(row.delta, unit) + ' / ' + formatRatio(row.ratio)
+}
+
+const confidenceSummary = (row) => {
+ const unit = row.observation?.unit
+ if (row.comparisonMode === 'paired' && typeof row.evidenceDeltaLower === 'number' && typeof row.evidenceDeltaUpper === 'number') {
+ const quantile = typeof row.pairedEvidenceQuantile === 'number'
+ ? Math.round(row.pairedEvidenceQuantile * 100)
+ : 25
+ return 'paired n=' + (row.pairedSamples ?? 0)
+ + ', ' + quantile + '-' + (100 - quantile) + '% delta '
+ + formatValue(row.evidenceDeltaLower, unit)
+ + '..' + formatValue(row.evidenceDeltaUpper, unit)
+ }
+ return (row.confidence || 'unknown') + ', baseline n=' + (row.baselineSources ?? 0) + ', current n=' + (row.currentSamples ?? 1)
+}
+
+const scanDecision = (row) => {
+ if (row.status === 'fail') return 'regression blocks'
+ if (row.status === 'warn') return 'regression review'
+ if (row.status === 'missing_baseline') return 'needs baseline'
+ if (row.direction === 'improved') return 'faster'
+ if (row.direction === 'regressed') return 'no material impact'
+ return 'unchanged'
+}
+
+const scanTable = (rows) => {
+ if (rows.length === 0) return 'No non-zero actionable measurement impact detected.'
+ return [
+ '| What changed? | Probe | Baseline -> current | Raw change | Impact | Confidence |',
+ '| --- | --- | --- | ---: | ---: | --- |',
+ ...rows.map((row) => {
+ return '| ' + [
+ scanDecision(row),
+ humanProbe(row),
+ baselineToCurrent(row),
+ rawChange(row),
+ formatRowImpact(row),
+ confidenceSummary(row),
+ ].map(escapeCell).join(' | ') + ' |'
+ }),
+ ].join('\n')
+}
+
+const zeroImpactTable = (rows) => {
+ if (rows.length === 0) return 'No zero-impact measurements.'
+ return [
+ '| Probe | Baseline -> current | Raw change | Impact | Gate | Evidence | Why hidden |',
+ '| --- | --- | ---: | ---: | --- | --- | --- |',
+ ...rows.map((row) => {
+ const meaning = interpretation(row)
+ return '| ' + [
+ humanProbe(row),
+ baselineToCurrent(row),
+ rawChange(row),
+ formatRowImpact(row),
+ row.gateable ? 'yes' : (row.gateReason || 'no'),
+ confidenceSummary(row),
+ meaning.label,
+ ].map(escapeCell).join(' | ') + ' |'
+ }),
+ ].join('\n')
+}
+
+const diagnosticTable = (rows) => {
+ if (rows.length === 0) return 'No diagnostic or ungated measurements.'
+ return [
+ '| Probe | Current | Baseline | Impact | Gate | Reason | Evidence |',
+ '| --- | ---: | ---: | ---: | --- | --- | --- |',
+ ...rows.map((row) => {
+ return '| ' + [
+ humanProbe(row),
+ formatValue(row.current, row.observation?.unit),
+ formatValue(row.baseline, row.observation?.unit),
+ formatRowImpact(row),
+ row.gateable ? 'yes' : (row.gateReason || row.status || 'no'),
+ interpretation(row).label,
+ confidenceSummary(row),
+ ].map(escapeCell).join(' | ') + ' |'
+ }),
+ ].join('\n')
+}
const comparisonTable = (rows) => {
if (rows.length === 0) return 'No measurement regressions detected.'
@@ -2851,6 +2948,32 @@ const allMeasurementsTable = (rows) => {
].join('\n')
}
+const sourceMeasurement = (row) => ({
+ id: row.observation?.dimensions?.probe || row.observation?.name || humanProbe(row),
+ label: humanProbe(row),
+ group: semanticPath(row),
+ status: row.status,
+ direction: row.direction,
+ gateable: row.gateable,
+ gateReason: row.gateReason,
+ confidence: row.confidence,
+ comparisonMode: row.comparisonMode,
+ unit: row.observation?.unit,
+ baseline: row.baseline ?? null,
+ current: row.current ?? null,
+ delta: row.delta ?? null,
+ ratio: row.ratio ?? null,
+ semanticImpactScore: row.semanticImpactScore ?? null,
+ semanticImpactKind: row.semanticImpactKind ?? null,
+ baselineSources: row.baselineSources ?? null,
+ currentSamples: row.currentSamples ?? null,
+ pairedSamples: row.pairedSamples ?? null,
+ evidenceDeltaLower: row.evidenceDeltaLower ?? null,
+ evidenceDeltaUpper: row.evidenceDeltaUpper ?? null,
+ pairedEvidenceQuantile: row.pairedEvidenceQuantile ?? null,
+ dimensions: row.observation?.dimensions || {},
+})
+
const truncate = (value, maxLength) => {
const text = String(value)
if (text.length <= maxLength) return text
@@ -3027,6 +3150,32 @@ const baselineLabel = baselineProvenance?.runId
? '[main run ' + baselineProvenance.runId + '](' + serverUrl + '/' + repo + '/actions/runs/' + baselineProvenance.runId + ')' +
(Array.isArray(baselineProvenance.runs) && baselineProvenance.runs.length > 1 ? ' + ' + (baselineProvenance.runs.length - 1) + ' older baseline runs' : '')
: 'not available'
+const sourceOfTruth = {
+ schemaVersion,
+ title,
+ status: statusWord,
+ gate: gateModeLabel(comparison.mode),
+ readiness: readinessLabel,
+ commit: {
+ shortSha,
+ sha: headSha || sha || 'unknown',
+ },
+ run: {
+ id: runId || null,
+ attempt: runAttempt || null,
+ url: runUrl || null,
+ },
+ baseline: baselineProvenance || null,
+ protocol: protocolLabel,
+ chart: {
+ meaning: 'semantic-impact',
+ zeroImpactMeaning: 'no actionable PR impact after budgets, noise floor, and robust evidence checks',
+ svg: chartSourceUrl || null,
+ lightPng: chartUrl || null,
+ darkPng: chartDarkUrl || null,
+ },
+ measurements: allRows.map(sourceMeasurement),
+}
const chartSvg = hasComparableBaseline ? renderPerfChangeSvg(visibleRows.length > 0 ? visibleRows : allRows) : ''
const chartDarkSvg = hasComparableBaseline ? renderPerfChangeSvg(visibleRows.length > 0 ? visibleRows : allRows, 'dark') : ''
if (chartPath && chartSvg) writeFileSync(chartPath, chartSvg)
@@ -3045,27 +3194,28 @@ const chartMarkdown = chartImageMarkdown
(chartSourceUrl ? '\n\n[SVG source](' + chartSourceUrl + ')' : '')
: ''
+const regressionCount = allRows.filter((row) => row.status === 'fail' || row.status === 'warn').length
+const improvementCount = comparableRows.filter((row) => row.direction === 'improved' && !isZeroImpactRow(row)).length
+const neutralCount = zeroImpactRows.length + diagnosticRows.length
+const humanSummary = hasComparableBaseline
+ ? regressionCount > 0
+ ? String(regressionCount) + ' regression' + (regressionCount === 1 ? '' : 's') + ' need review.'
+ : improvementCount > 0
+ ? 'No regressions. ' + String(improvementCount) + ' probe' + (improvementCount === 1 ? '' : 's') + ' got faster; ' + String(neutralCount) + ' neutral or ungated row' + (neutralCount === 1 ? '' : 's') + ' are collapsed below.'
+ : 'No regressions. Comparable movement is below the semantic impact threshold; neutral rows are collapsed below.'
+ : 'No compatible baseline was available, so this run shows current measurements only.'
+
const summaryLines = [
'## ' + title,
'',
- '- Status: ' + statusWord,
- '- Gate: ' + gateModeLabel(comparison.mode),
- '- Commit: ' + shortSha,
- '- Run: ' + runLink,
- '- Baseline: ' + baselineLabel,
- '- Readiness: ' + readinessLabel,
- '- Protocol: ' + protocolLabel,
+ '**' + statusWord + '** - ' + gateModeLabel(comparison.mode) + ' gate - readiness ' + readinessLabel + ' - commit ' + shortSha + ' - protocol ' + protocolLabel + '',
'',
- hasComparableBaseline
- ? 'Chart: bars show semantic impact. A value of 0 means the raw change is not actionable for this PR; raw percentage and nominal values stay in the table.'
- : 'No compatible baseline was available, so this run shows current measurements only.',
+ '> ' + humanSummary,
'',
chartMarkdown,
'',
hasComparableBaseline
- ? (visibleNonZeroImpactRows.length > 0
- ? comparisonTable(visibleNonZeroImpactRows)
- : 'No non-zero actionable measurement impact detected.')
+ ? scanTable(visibleNonZeroImpactRows)
: currentOnlyTable(visibleRows),
]
@@ -3073,11 +3223,23 @@ if (hasComparableBaseline && zeroImpactRows.length > 0) {
summaryLines.push(
'',
'',
- 'Zero-impact measurements (' + zeroImpactRows.length + ')
',
+ 'Unchanged / 0-impact measurements (' + zeroImpactRows.length + ')
',
'',
'These rows had compatible baseline data, but their semantic impact rounded to 0.00x because the movement was below the configured budget, below the noise floor, or inside the robust noise band.',
'',
- comparisonTable(zeroImpactRows),
+ zeroImpactTable(zeroImpactRows),
+ '',
+ ' ',
+ )
+}
+
+if (diagnosticRows.length > 0) {
+ summaryLines.push(
+ '',
+ '',
+ 'Diagnostic / ungated measurements (' + diagnosticRows.length + ')
',
+ '',
+ diagnosticTable(diagnosticRows),
'',
' ',
)
@@ -3107,6 +3269,18 @@ if (historyRows.length > 0) {
)
}
+summaryLines.push(
+ '',
+ '',
+ 'Source-of-truth JSON
',
+ '',
+ '~~~json',
+ JSON.stringify(sourceOfTruth, null, 2),
+ '~~~',
+ '',
+ ' ',
+)
+
summaryLines.push('', marker, statePrefix + JSON.stringify(state, null, 2) + stateSuffix)
writeFileSync(bodyPath, summaryLines.join('\n') + '\n')
writeFileSync(commentIdPath, existing?.id ? String(existing.id) : '')
diff --git a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
index 3696072b2..1628ef243 100644
--- a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
+++ b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
@@ -505,7 +505,7 @@ describe('ci workflow devenv perf helpers', () => {
'chart_dark_png_file="$comment_tmp_dir/perf-change-vs-baseline-dark.png"',
)
expect(ciWorkflowSource).toContain(
- 'Chart: bars show semantic impact. A value of 0 means the raw change is not actionable for this PR; raw percentage and nominal values stay in the table.',
+ 'No regressions. Comparable movement is below the semantic impact threshold; neutral rows are collapsed below.',
)
expect(generatedCiWorkflowYamlSource).toContain(
"github.workflow }}-${{ github.event_name }}-${{ github.ref }}",
@@ -516,14 +516,16 @@ describe('ci workflow devenv perf helpers', () => {
"inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request'",
)
expect(ciWorkflowSource).toContain(
- '| Group | Measurement | Baseline | Current | Raw change | Impact | Meaning | Gate | Evidence |',
+ '| What changed? | Probe | Baseline -> current | Raw change | Impact | Confidence |',
)
expect(ciWorkflowSource).toContain('const zeroImpactRows = comparableRows.filter(isZeroImpactRow)')
- expect(ciWorkflowSource).toContain('Zero-impact measurements (')
+ expect(ciWorkflowSource).toContain('Unchanged / 0-impact measurements (')
+ expect(ciWorkflowSource).toContain('Source-of-truth JSON
')
+ expect(ciWorkflowSource).toContain('const sourceOfTruth = {')
expect(ciWorkflowSource).toContain(
'No non-zero actionable measurement impact detected.',
)
- expect(ciWorkflowSource).toContain("'- Readiness: ' + readinessLabel")
+ expect(ciWorkflowSource).toContain('readiness ')
expect(ciWorkflowSource).toContain('renderPerfChangeSvg')
expect(ciWorkflowSource).toContain('Actionable measurement impact')
expect(ciWorkflowSource).toContain(
From 72d0690932823c17085fbe8b26a282e17a4c3a5b Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Wed, 20 May 2026 09:01:50 +0200
Subject: [PATCH 62/81] Hide diagnostic measurements from scan table
---
.github/workflows/ci.yml | 93 ++++++++++++++++---------------
genie/ci-workflow/measurements.ts | 31 ++++++-----
2 files changed, 64 insertions(+), 60 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 774f5e96f..1d3865158 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -4152,24 +4152,25 @@ jobs:
const visibleLimit = Number.isFinite(maxRows) && maxRows > 0 ? maxRows : 10
const comparableRows = allRows.filter((row) => typeof row.baseline === 'number')
const hasComparableBaseline = comparableRows.length > 0
- const visibleRows = (hasComparableBaseline
- ? allRows.filter((row) => typeof row.baseline === 'number')
- : allRows.slice().sort((left, right) => (right.current || 0) - (left.current || 0))
- ).slice(0, visibleLimit)
- const isZeroImpactRow = (row) =>
- typeof row.semanticImpactScore === 'number' &&
- !Number.isNaN(row.semanticImpactScore) &&
- Math.abs(row.semanticImpactScore) < 0.005
- const nonZeroImpactRows = comparableRows.filter((row) => !isZeroImpactRow(row))
- const zeroImpactRows = comparableRows.filter(isZeroImpactRow)
- const visibleNonZeroImpactRows = nonZeroImpactRows.slice(0, visibleLimit)
- const diagnosticRows = allRows.filter((row) =>
+ const isDiagnosticRow = (row) =>
row.status === 'missing_baseline' ||
row.confidence === 'diagnostic' ||
row.gateReason === 'disabled' ||
row.semanticImpactKind === 'diagnostic' ||
(!row.gateable && typeof row.baseline !== 'number')
- )
+ const isZeroImpactRow = (row) =>
+ typeof row.semanticImpactScore === 'number' &&
+ !Number.isNaN(row.semanticImpactScore) &&
+ Math.abs(row.semanticImpactScore) < 0.005
+ const actionableComparableRows = comparableRows.filter((row) => !isDiagnosticRow(row))
+ const visibleRows = (hasComparableBaseline
+ ? actionableComparableRows
+ : allRows.filter((row) => !isDiagnosticRow(row)).sort((left, right) => (right.current || 0) - (left.current || 0))
+ ).slice(0, visibleLimit)
+ const nonZeroImpactRows = actionableComparableRows.filter((row) => !isZeroImpactRow(row))
+ const zeroImpactRows = actionableComparableRows.filter(isZeroImpactRow)
+ const visibleNonZeroImpactRows = nonZeroImpactRows.slice(0, visibleLimit)
+ const diagnosticRows = allRows.filter(isDiagnosticRow)
const baselineToCurrent = (row) => {
const unit = row.observation?.unit
@@ -4551,8 +4552,8 @@ jobs:
},
measurements: allRows.map(sourceMeasurement),
}
- const chartSvg = hasComparableBaseline ? renderPerfChangeSvg(visibleRows.length > 0 ? visibleRows : allRows) : ''
- const chartDarkSvg = hasComparableBaseline ? renderPerfChangeSvg(visibleRows.length > 0 ? visibleRows : allRows, 'dark') : ''
+ const chartSvg = hasComparableBaseline && visibleRows.length > 0 ? renderPerfChangeSvg(visibleRows) : ''
+ const chartDarkSvg = hasComparableBaseline && visibleRows.length > 0 ? renderPerfChangeSvg(visibleRows, 'dark') : ''
if (chartPath && chartSvg) writeFileSync(chartPath, chartSvg)
if (chartDarkPath && chartDarkSvg) writeFileSync(chartDarkPath, chartDarkSvg)
const chartImageMarkdown = chartUrl && chartSvg
@@ -6538,24 +6539,25 @@ jobs:
const visibleLimit = Number.isFinite(maxRows) && maxRows > 0 ? maxRows : 10
const comparableRows = allRows.filter((row) => typeof row.baseline === 'number')
const hasComparableBaseline = comparableRows.length > 0
- const visibleRows = (hasComparableBaseline
- ? allRows.filter((row) => typeof row.baseline === 'number')
- : allRows.slice().sort((left, right) => (right.current || 0) - (left.current || 0))
- ).slice(0, visibleLimit)
- const isZeroImpactRow = (row) =>
- typeof row.semanticImpactScore === 'number' &&
- !Number.isNaN(row.semanticImpactScore) &&
- Math.abs(row.semanticImpactScore) < 0.005
- const nonZeroImpactRows = comparableRows.filter((row) => !isZeroImpactRow(row))
- const zeroImpactRows = comparableRows.filter(isZeroImpactRow)
- const visibleNonZeroImpactRows = nonZeroImpactRows.slice(0, visibleLimit)
- const diagnosticRows = allRows.filter((row) =>
+ const isDiagnosticRow = (row) =>
row.status === 'missing_baseline' ||
row.confidence === 'diagnostic' ||
row.gateReason === 'disabled' ||
row.semanticImpactKind === 'diagnostic' ||
(!row.gateable && typeof row.baseline !== 'number')
- )
+ const isZeroImpactRow = (row) =>
+ typeof row.semanticImpactScore === 'number' &&
+ !Number.isNaN(row.semanticImpactScore) &&
+ Math.abs(row.semanticImpactScore) < 0.005
+ const actionableComparableRows = comparableRows.filter((row) => !isDiagnosticRow(row))
+ const visibleRows = (hasComparableBaseline
+ ? actionableComparableRows
+ : allRows.filter((row) => !isDiagnosticRow(row)).sort((left, right) => (right.current || 0) - (left.current || 0))
+ ).slice(0, visibleLimit)
+ const nonZeroImpactRows = actionableComparableRows.filter((row) => !isZeroImpactRow(row))
+ const zeroImpactRows = actionableComparableRows.filter(isZeroImpactRow)
+ const visibleNonZeroImpactRows = nonZeroImpactRows.slice(0, visibleLimit)
+ const diagnosticRows = allRows.filter(isDiagnosticRow)
const baselineToCurrent = (row) => {
const unit = row.observation?.unit
@@ -6937,8 +6939,8 @@ jobs:
},
measurements: allRows.map(sourceMeasurement),
}
- const chartSvg = hasComparableBaseline ? renderPerfChangeSvg(visibleRows.length > 0 ? visibleRows : allRows) : ''
- const chartDarkSvg = hasComparableBaseline ? renderPerfChangeSvg(visibleRows.length > 0 ? visibleRows : allRows, 'dark') : ''
+ const chartSvg = hasComparableBaseline && visibleRows.length > 0 ? renderPerfChangeSvg(visibleRows) : ''
+ const chartDarkSvg = hasComparableBaseline && visibleRows.length > 0 ? renderPerfChangeSvg(visibleRows, 'dark') : ''
if (chartPath && chartSvg) writeFileSync(chartPath, chartSvg)
if (chartDarkPath && chartDarkSvg) writeFileSync(chartDarkPath, chartDarkSvg)
const chartImageMarkdown = chartUrl && chartSvg
@@ -8575,24 +8577,25 @@ jobs:
const visibleLimit = Number.isFinite(maxRows) && maxRows > 0 ? maxRows : 10
const comparableRows = allRows.filter((row) => typeof row.baseline === 'number')
const hasComparableBaseline = comparableRows.length > 0
- const visibleRows = (hasComparableBaseline
- ? allRows.filter((row) => typeof row.baseline === 'number')
- : allRows.slice().sort((left, right) => (right.current || 0) - (left.current || 0))
- ).slice(0, visibleLimit)
- const isZeroImpactRow = (row) =>
- typeof row.semanticImpactScore === 'number' &&
- !Number.isNaN(row.semanticImpactScore) &&
- Math.abs(row.semanticImpactScore) < 0.005
- const nonZeroImpactRows = comparableRows.filter((row) => !isZeroImpactRow(row))
- const zeroImpactRows = comparableRows.filter(isZeroImpactRow)
- const visibleNonZeroImpactRows = nonZeroImpactRows.slice(0, visibleLimit)
- const diagnosticRows = allRows.filter((row) =>
+ const isDiagnosticRow = (row) =>
row.status === 'missing_baseline' ||
row.confidence === 'diagnostic' ||
row.gateReason === 'disabled' ||
row.semanticImpactKind === 'diagnostic' ||
(!row.gateable && typeof row.baseline !== 'number')
- )
+ const isZeroImpactRow = (row) =>
+ typeof row.semanticImpactScore === 'number' &&
+ !Number.isNaN(row.semanticImpactScore) &&
+ Math.abs(row.semanticImpactScore) < 0.005
+ const actionableComparableRows = comparableRows.filter((row) => !isDiagnosticRow(row))
+ const visibleRows = (hasComparableBaseline
+ ? actionableComparableRows
+ : allRows.filter((row) => !isDiagnosticRow(row)).sort((left, right) => (right.current || 0) - (left.current || 0))
+ ).slice(0, visibleLimit)
+ const nonZeroImpactRows = actionableComparableRows.filter((row) => !isZeroImpactRow(row))
+ const zeroImpactRows = actionableComparableRows.filter(isZeroImpactRow)
+ const visibleNonZeroImpactRows = nonZeroImpactRows.slice(0, visibleLimit)
+ const diagnosticRows = allRows.filter(isDiagnosticRow)
const baselineToCurrent = (row) => {
const unit = row.observation?.unit
@@ -8974,8 +8977,8 @@ jobs:
},
measurements: allRows.map(sourceMeasurement),
}
- const chartSvg = hasComparableBaseline ? renderPerfChangeSvg(visibleRows.length > 0 ? visibleRows : allRows) : ''
- const chartDarkSvg = hasComparableBaseline ? renderPerfChangeSvg(visibleRows.length > 0 ? visibleRows : allRows, 'dark') : ''
+ const chartSvg = hasComparableBaseline && visibleRows.length > 0 ? renderPerfChangeSvg(visibleRows) : ''
+ const chartDarkSvg = hasComparableBaseline && visibleRows.length > 0 ? renderPerfChangeSvg(visibleRows, 'dark') : ''
if (chartPath && chartSvg) writeFileSync(chartPath, chartSvg)
if (chartDarkPath && chartDarkSvg) writeFileSync(chartDarkPath, chartDarkSvg)
const chartImageMarkdown = chartUrl && chartSvg
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index 35f3d6eef..79c74c19c 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -2777,24 +2777,25 @@ const protocolLabel = (() => {
const visibleLimit = Number.isFinite(maxRows) && maxRows > 0 ? maxRows : 10
const comparableRows = allRows.filter((row) => typeof row.baseline === 'number')
const hasComparableBaseline = comparableRows.length > 0
-const visibleRows = (hasComparableBaseline
- ? allRows.filter((row) => typeof row.baseline === 'number')
- : allRows.slice().sort((left, right) => (right.current || 0) - (left.current || 0))
-).slice(0, visibleLimit)
-const isZeroImpactRow = (row) =>
- typeof row.semanticImpactScore === 'number' &&
- !Number.isNaN(row.semanticImpactScore) &&
- Math.abs(row.semanticImpactScore) < 0.005
-const nonZeroImpactRows = comparableRows.filter((row) => !isZeroImpactRow(row))
-const zeroImpactRows = comparableRows.filter(isZeroImpactRow)
-const visibleNonZeroImpactRows = nonZeroImpactRows.slice(0, visibleLimit)
-const diagnosticRows = allRows.filter((row) =>
+const isDiagnosticRow = (row) =>
row.status === 'missing_baseline' ||
row.confidence === 'diagnostic' ||
row.gateReason === 'disabled' ||
row.semanticImpactKind === 'diagnostic' ||
(!row.gateable && typeof row.baseline !== 'number')
-)
+const isZeroImpactRow = (row) =>
+ typeof row.semanticImpactScore === 'number' &&
+ !Number.isNaN(row.semanticImpactScore) &&
+ Math.abs(row.semanticImpactScore) < 0.005
+const actionableComparableRows = comparableRows.filter((row) => !isDiagnosticRow(row))
+const visibleRows = (hasComparableBaseline
+ ? actionableComparableRows
+ : allRows.filter((row) => !isDiagnosticRow(row)).sort((left, right) => (right.current || 0) - (left.current || 0))
+).slice(0, visibleLimit)
+const nonZeroImpactRows = actionableComparableRows.filter((row) => !isZeroImpactRow(row))
+const zeroImpactRows = actionableComparableRows.filter(isZeroImpactRow)
+const visibleNonZeroImpactRows = nonZeroImpactRows.slice(0, visibleLimit)
+const diagnosticRows = allRows.filter(isDiagnosticRow)
const baselineToCurrent = (row) => {
const unit = row.observation?.unit
@@ -3176,8 +3177,8 @@ const sourceOfTruth = {
},
measurements: allRows.map(sourceMeasurement),
}
-const chartSvg = hasComparableBaseline ? renderPerfChangeSvg(visibleRows.length > 0 ? visibleRows : allRows) : ''
-const chartDarkSvg = hasComparableBaseline ? renderPerfChangeSvg(visibleRows.length > 0 ? visibleRows : allRows, 'dark') : ''
+const chartSvg = hasComparableBaseline && visibleRows.length > 0 ? renderPerfChangeSvg(visibleRows) : ''
+const chartDarkSvg = hasComparableBaseline && visibleRows.length > 0 ? renderPerfChangeSvg(visibleRows, 'dark') : ''
if (chartPath && chartSvg) writeFileSync(chartPath, chartSvg)
if (chartDarkPath && chartDarkSvg) writeFileSync(chartDarkPath, chartDarkSvg)
const chartImageMarkdown = chartUrl && chartSvg
From 19ebea8d205a9c330e5d070f2679d145bc975803 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Wed, 20 May 2026 09:17:46 +0200
Subject: [PATCH 63/81] Unblock manual measurement dispatches
---
.github/workflows/ci.yml | 2 +-
genie/ci-workflow/shared.ts | 7 ++++++-
.../github-workflow/ci-workflow-helpers.unit.test.ts | 6 +++++-
3 files changed, 12 insertions(+), 3 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 1d3865158..2589485ce 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2,7 +2,7 @@
# Source: ci.yml.genie.ts
concurrency:
- group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code') }}"
+ group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && inputs.measurement_pr_number != '' && format('measurement-pr-{0}-run-{1}', inputs.measurement_pr_number, github.run_id) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code'))) }}"
cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
name: CI
diff --git a/genie/ci-workflow/shared.ts b/genie/ci-workflow/shared.ts
index 135c7d107..5f24a8f59 100644
--- a/genie/ci-workflow/shared.ts
+++ b/genie/ci-workflow/shared.ts
@@ -64,13 +64,18 @@ export const standardCIEnv = {
* cancel in-progress runs so several historical refs can be backfilled without
* canceling each other.
*
+ * Manual PR measurement refreshes are intentionally keyed by run id. They are
+ * used as operational probes to update managed PR comments, and stale queued
+ * GitHub workflow_dispatch runs can otherwise hold the shared branch bucket
+ * indefinitely when GitHub refuses cancellation.
+ *
* Merge-queue label churn is different: only the mq:ci-admitted label event is
* allowed to materialize full PR CI. Other label events do not change the
* commit under test and must not cancel an already-running validation run.
*/
export const ciWorkflowConcurrency = {
group:
- "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code') }}",
+ "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && inputs.measurement_pr_number != '' && format('measurement-pr-{0}-run-{1}', inputs.measurement_pr_number, github.run_id) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code'))) }}",
'cancel-in-progress':
"${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}",
} as const
diff --git a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
index 1628ef243..4a46514ad 100644
--- a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
+++ b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
@@ -511,6 +511,8 @@ describe('ci workflow devenv perf helpers', () => {
"github.workflow }}-${{ github.event_name }}-${{ github.ref }}",
)
expect(generatedCiWorkflowYamlSource).toContain("format('measurement-baseline-{0}'")
+ expect(generatedCiWorkflowYamlSource).toContain("format('measurement-pr-{0}-run-{1}'")
+ expect(generatedCiWorkflowYamlSource).toContain("format('manual-run-{0}', github.run_id)")
expect(generatedCiWorkflowYamlSource).toContain("format('label-{0}', github.event.label.name)")
expect(generatedCiWorkflowYamlSource).toContain(
"inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request'",
@@ -518,7 +520,9 @@ describe('ci workflow devenv perf helpers', () => {
expect(ciWorkflowSource).toContain(
'| What changed? | Probe | Baseline -> current | Raw change | Impact | Confidence |',
)
- expect(ciWorkflowSource).toContain('const zeroImpactRows = comparableRows.filter(isZeroImpactRow)')
+ expect(ciWorkflowSource).toContain(
+ 'const zeroImpactRows = actionableComparableRows.filter(isZeroImpactRow)',
+ )
expect(ciWorkflowSource).toContain('Unchanged / 0-impact measurements (')
expect(ciWorkflowSource).toContain('Source-of-truth JSON
')
expect(ciWorkflowSource).toContain('const sourceOfTruth = {')
From 7ac62d63789f8ac60547fca17684d748fc4f08b8 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Wed, 20 May 2026 09:34:23 +0200
Subject: [PATCH 64/81] Use job-level CI concurrency
---
.github/workflows/ci.yml | 43 ++++++++++++++++--
genie/ci-workflow/shared.ts | 45 +++++++++++++------
.../ci-workflow-helpers.unit.test.ts | 3 ++
3 files changed, 74 insertions(+), 17 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 2589485ce..2bd0bd7ef 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -1,10 +1,6 @@
# Generated file - DO NOT EDIT
# Source: ci.yml.genie.ts
-concurrency:
- group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && inputs.measurement_pr_number != '' && format('measurement-pr-{0}-run-{1}', inputs.measurement_pr_number, github.run_id) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code'))) }}"
- cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
-
name: CI
on:
@@ -506,6 +502,9 @@ jobs:
run: |
echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:"
echo " https://github.com/overengineeringstudio/effect-utils/issues/201"
+ concurrency:
+ group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && inputs.measurement_pr_number != '' && format('measurement-pr-{0}-run-{1}', inputs.measurement_pr_number, github.run_id) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code'))) }}-typecheck"
+ cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
lint:
if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
runs-on:
@@ -854,6 +853,9 @@ jobs:
run: |
echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:"
echo " https://github.com/overengineeringstudio/effect-utils/issues/201"
+ concurrency:
+ group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && inputs.measurement_pr_number != '' && format('measurement-pr-{0}-run-{1}', inputs.measurement_pr_number, github.run_id) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code'))) }}-lint"
+ cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
test:
if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
strategy:
@@ -1205,6 +1207,9 @@ jobs:
run: |
echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:"
echo " https://github.com/overengineeringstudio/effect-utils/issues/201"
+ concurrency:
+ group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && inputs.measurement_pr_number != '' && format('measurement-pr-{0}-run-{1}', inputs.measurement_pr_number, github.run_id) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code'))) }}-test"
+ cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
nix-check:
if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
strategy:
@@ -1556,6 +1561,9 @@ jobs:
run: |
echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:"
echo " https://github.com/overengineeringstudio/effect-utils/issues/201"
+ concurrency:
+ group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && inputs.measurement_pr_number != '' && format('measurement-pr-{0}-run-{1}', inputs.measurement_pr_number, github.run_id) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code'))) }}-nix-check"
+ cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
nix-fod-check:
if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
strategy:
@@ -1719,6 +1727,9 @@ jobs:
run: |
echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:"
echo " https://github.com/overengineeringstudio/effect-utils/issues/201"
+ concurrency:
+ group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && inputs.measurement_pr_number != '' && format('measurement-pr-{0}-run-{1}', inputs.measurement_pr_number, github.run_id) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code'))) }}-nix-fod-check"
+ cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
pnpm-builder-contract:
if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
runs-on:
@@ -1984,6 +1995,9 @@ jobs:
run: |
echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:"
echo " https://github.com/overengineeringstudio/effect-utils/issues/201"
+ concurrency:
+ group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && inputs.measurement_pr_number != '' && format('measurement-pr-{0}-run-{1}', inputs.measurement_pr_number, github.run_id) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code'))) }}-pnpm-builder-contract"
+ cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
pnpm-regression:
if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
runs-on:
@@ -2227,6 +2241,9 @@ jobs:
run: |
echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:"
echo " https://github.com/overengineeringstudio/effect-utils/issues/201"
+ concurrency:
+ group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && inputs.measurement_pr_number != '' && format('measurement-pr-{0}-run-{1}', inputs.measurement_pr_number, github.run_id) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code'))) }}-pnpm-regression"
+ cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
devenv-perf:
runs-on:
[namespace-profile-linux-x86-64, 'namespace-features:github.run-id=${{ github.run_id }}']
@@ -4792,6 +4809,9 @@ jobs:
if-no-files-found: error
retention-days: 30
timeout-minutes: 30
+ concurrency:
+ group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && inputs.measurement_pr_number != '' && format('measurement-pr-{0}-run-{1}', inputs.measurement_pr_number, github.run_id) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code'))) }}-devenv-perf"
+ cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
nix-closure-sizes:
if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
runs-on:
@@ -7236,6 +7256,9 @@ jobs:
run: |
echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:"
echo " https://github.com/overengineeringstudio/effect-utils/issues/201"
+ concurrency:
+ group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && inputs.measurement_pr_number != '' && format('measurement-pr-{0}-run-{1}', inputs.measurement_pr_number, github.run_id) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code'))) }}-nix-closure-sizes"
+ cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
source-shape:
runs-on:
[namespace-profile-linux-x86-64, 'namespace-features:github.run-id=${{ github.run_id }}']
@@ -9217,6 +9240,9 @@ jobs:
!tmp/source-shape-ci/baseline/**
if-no-files-found: error
retention-days: 30
+ concurrency:
+ group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && inputs.measurement_pr_number != '' && format('measurement-pr-{0}-run-{1}', inputs.measurement_pr_number, github.run_id) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code'))) }}-source-shape"
+ cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
test-integration-notion:
if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
runs-on:
@@ -9566,6 +9592,9 @@ jobs:
run: |
echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:"
echo " https://github.com/overengineeringstudio/effect-utils/issues/201"
+ concurrency:
+ group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && inputs.measurement_pr_number != '' && format('measurement-pr-{0}-run-{1}', inputs.measurement_pr_number, github.run_id) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code'))) }}-test-integration-notion"
+ cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
deploy-storybooks:
if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
runs-on:
@@ -10391,6 +10420,9 @@ jobs:
run: |
echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:"
echo " https://github.com/overengineeringstudio/effect-utils/issues/201"
+ concurrency:
+ group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && inputs.measurement_pr_number != '' && format('measurement-pr-{0}-run-{1}', inputs.measurement_pr_number, github.run_id) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code'))) }}-deploy-storybooks"
+ cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
notify-alignment:
runs-on:
[namespace-profile-linux-x86-64, 'namespace-features:github.run-id=${{ github.run_id }}']
@@ -10419,3 +10451,6 @@ jobs:
--header "X-GitHub-Api-Version: 2022-11-28" \
--data "$payload"
shell: bash
+ concurrency:
+ group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && inputs.measurement_pr_number != '' && format('measurement-pr-{0}-run-{1}', inputs.measurement_pr_number, github.run_id) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code'))) }}-notify-alignment"
+ cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
diff --git a/genie/ci-workflow/shared.ts b/genie/ci-workflow/shared.ts
index 5f24a8f59..2ef52ec45 100644
--- a/genie/ci-workflow/shared.ts
+++ b/genie/ci-workflow/shared.ts
@@ -49,16 +49,17 @@ export const standardCIEnv = {
} as const
/**
- * Cancel superseded CI workflow runs for the same event and ref.
+ * Cancel superseded CI jobs for the same event, ref, and job id.
*
- * The group key intentionally does not include the job name so a new push
- * cancels the entire older workflow run rather than letting stale sibling jobs
- * continue consuming runner capacity.
+ * This is intentionally job-level, not workflow-level. GitHub can wedge
+ * workflow_dispatch runs before job creation; when that happens, the run has no
+ * check runs, no logs, and the API may return 500 for cancellation. Keeping
+ * concurrency at job level lets workflow evaluation materialize visible jobs
+ * before any scarce-runner throttling applies.
*
* Code validation is a branch-protection signal for the latest PR head. Keeping
- * older code-triggered pull_request runs alive can wedge the concurrency bucket
- * behind a stale queued self-hosted job and prevent the current head from
- * materializing any jobs.
+ * older code-triggered pull_request jobs alive can consume scarce runners after
+ * a newer head exists, so jobs with the same id still cancel superseded work.
*
* Measurement baseline backfills are keyed by their subject ref and do not
* cancel in-progress runs so several historical refs can be backfilled without
@@ -73,6 +74,23 @@ export const standardCIEnv = {
* allowed to materialize full PR CI. Other label events do not change the
* commit under test and must not cancel an already-running validation run.
*/
+export const ciJobConcurrency = (jobId: string) =>
+ ({
+ group:
+ "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && inputs.measurement_pr_number != '' && format('measurement-pr-{0}-run-{1}', inputs.measurement_pr_number, github.run_id) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code'))) }}" +
+ `-${jobId}`,
+ 'cancel-in-progress':
+ "${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}",
+ }) as const
+
+const withDefaultJobConcurrency = (jobs: GitHubWorkflowArgs['jobs']): GitHubWorkflowArgs['jobs'] =>
+ Object.fromEntries(
+ Object.entries(jobs).map(([jobId, job]) => [
+ jobId,
+ job.concurrency === undefined ? { ...job, concurrency: ciJobConcurrency(jobId) } : job,
+ ]),
+ )
+
export const ciWorkflowConcurrency = {
group:
"${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && inputs.measurement_pr_number != '' && format('measurement-pr-{0}-run-{1}', inputs.measurement_pr_number, github.run_id) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code'))) }}",
@@ -83,16 +101,17 @@ export const ciWorkflowConcurrency = {
/**
* Standard wrapper for composed CI workflows.
*
- * This keeps cancellation policy centralized in `effect-utils` instead of
- * making each consumer remember to wire `concurrency` by hand. Repos can still
- * override the policy by passing an explicit `concurrency` field.
+ * This keeps cancellation policy centralized in `effect-utils`. Jobs without
+ * their own `concurrency` get `ciJobConcurrency(jobId)`. Passing an explicit
+ * workflow-level `concurrency` overrides this and skips the per-job defaults.
*/
export const ciWorkflow = (args: GitHubWorkflowArgs) =>
- (({ concurrency, actionlint, ...rest }) =>
+ (({ concurrency, actionlint, jobs, ...rest }) =>
githubWorkflow({
- concurrency: concurrency ?? ciWorkflowConcurrency,
- actionlint: actionlint ?? defaultActionlintConfig,
...rest,
+ ...(concurrency === undefined ? {} : { concurrency }),
+ actionlint: actionlint ?? defaultActionlintConfig,
+ jobs: concurrency === undefined ? withDefaultJobConcurrency(jobs) : jobs,
}))(args)
export type NixConfigOptions = {
diff --git a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
index 4a46514ad..d95623a55 100644
--- a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
+++ b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
@@ -510,6 +510,9 @@ describe('ci workflow devenv perf helpers', () => {
expect(generatedCiWorkflowYamlSource).toContain(
"github.workflow }}-${{ github.event_name }}-${{ github.ref }}",
)
+ expect(generatedCiWorkflowYamlSource).not.toMatch(/^concurrency:/m)
+ expect(generatedCiWorkflowYamlSource).toContain('concurrency:\n group:')
+ expect(generatedCiWorkflowYamlSource).toContain('}}-typecheck')
expect(generatedCiWorkflowYamlSource).toContain("format('measurement-baseline-{0}'")
expect(generatedCiWorkflowYamlSource).toContain("format('measurement-pr-{0}-run-{1}'")
expect(generatedCiWorkflowYamlSource).toContain("format('manual-run-{0}', github.run_id)")
From ef5c63e2a19c435c706301a15d76da9c430e1668 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Wed, 20 May 2026 09:49:41 +0200
Subject: [PATCH 65/81] Make PR CI authoritative for measurement comments
---
.github/workflows/ci.yml | 109 ++++++++++--------
genie/ci-workflow/measurements.ts | 33 +++---
genie/ci-workflow/shared.ts | 10 +-
.../ci-workflow-helpers.unit.test.ts | 13 ++-
4 files changed, 95 insertions(+), 70 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 2bd0bd7ef..ae1523ef9 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -20,11 +20,6 @@ on:
required: false
default: ''
type: string
- measurement_pr_number:
- description: Optional pull request number to update with CI measurement comments during manual measurement runs.
- required: false
- default: ''
- type: string
debug_force_nix_diagnostics_failure:
description: 'Temporary debug switch (#272): force post-validation failure to verify diagnostics artifact + summary'
required: false
@@ -503,7 +498,7 @@ jobs:
echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:"
echo " https://github.com/overengineeringstudio/effect-utils/issues/201"
concurrency:
- group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && inputs.measurement_pr_number != '' && format('measurement-pr-{0}-run-{1}', inputs.measurement_pr_number, github.run_id) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code'))) }}-typecheck"
+ group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-typecheck"
cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
lint:
if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
@@ -854,7 +849,7 @@ jobs:
echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:"
echo " https://github.com/overengineeringstudio/effect-utils/issues/201"
concurrency:
- group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && inputs.measurement_pr_number != '' && format('measurement-pr-{0}-run-{1}', inputs.measurement_pr_number, github.run_id) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code'))) }}-lint"
+ group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-lint"
cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
test:
if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
@@ -1208,7 +1203,7 @@ jobs:
echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:"
echo " https://github.com/overengineeringstudio/effect-utils/issues/201"
concurrency:
- group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && inputs.measurement_pr_number != '' && format('measurement-pr-{0}-run-{1}', inputs.measurement_pr_number, github.run_id) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code'))) }}-test"
+ group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-test"
cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
nix-check:
if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
@@ -1562,7 +1557,7 @@ jobs:
echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:"
echo " https://github.com/overengineeringstudio/effect-utils/issues/201"
concurrency:
- group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && inputs.measurement_pr_number != '' && format('measurement-pr-{0}-run-{1}', inputs.measurement_pr_number, github.run_id) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code'))) }}-nix-check"
+ group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-nix-check"
cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
nix-fod-check:
if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
@@ -1728,7 +1723,7 @@ jobs:
echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:"
echo " https://github.com/overengineeringstudio/effect-utils/issues/201"
concurrency:
- group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && inputs.measurement_pr_number != '' && format('measurement-pr-{0}-run-{1}', inputs.measurement_pr_number, github.run_id) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code'))) }}-nix-fod-check"
+ group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-nix-fod-check"
cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
pnpm-builder-contract:
if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
@@ -1996,7 +1991,7 @@ jobs:
echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:"
echo " https://github.com/overengineeringstudio/effect-utils/issues/201"
concurrency:
- group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && inputs.measurement_pr_number != '' && format('measurement-pr-{0}-run-{1}', inputs.measurement_pr_number, github.run_id) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code'))) }}-pnpm-builder-contract"
+ group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-pnpm-builder-contract"
cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
pnpm-regression:
if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
@@ -2242,7 +2237,7 @@ jobs:
echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:"
echo " https://github.com/overengineeringstudio/effect-utils/issues/201"
concurrency:
- group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && inputs.measurement_pr_number != '' && format('measurement-pr-{0}-run-{1}', inputs.measurement_pr_number, github.run_id) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code'))) }}-pnpm-regression"
+ group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-pnpm-regression"
cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
devenv-perf:
runs-on:
@@ -3254,7 +3249,6 @@ jobs:
CI_MEASUREMENT_PR_COMMENT_MAX_ROWS: '8'
CI_MEASUREMENT_PR_COMMENT_MAX_HISTORY: '20'
CI_MEASUREMENT_PR_COMMENT_ASSET_BRANCH: ci-measurement-assets
- CI_MEASUREMENT_PR_COMMENT_PR_NUMBER: ${{ inputs.measurement_pr_number }}
GH_TOKEN: ${{ github.token }}
run: |
set -euo pipefail
@@ -3743,7 +3737,12 @@ jobs:
} >>"$GITHUB_STEP_SUMMARY"
fi
- if [ "${CI_MEASUREMENT_PR_COMMENT_ENABLED:-false}" = "true" ] && { [ "${GITHUB_EVENT_NAME:-}" = "pull_request" ] || [ -n "${CI_MEASUREMENT_PR_COMMENT_PR_NUMBER:-}" ]; }; then
+ if [ "${CI_MEASUREMENT_PR_COMMENT_ENABLED:-false}" = "true" ]; then
+ if [ "${GITHUB_EVENT_NAME:-}" != "pull_request" ]; then
+ echo "::notice::CI measurement PR comments are produced only by pull_request workflows; skipping comment for event ${GITHUB_EVENT_NAME:-unknown}"
+ exit 0
+ fi
+
can_render_pr_comment=true
ensure_ci_measurement_tool() {
@@ -3762,36 +3761,40 @@ jobs:
}
if ! ensure_ci_measurement_tool gh gh; then
- echo "::notice::gh is not available; skipping CI measurement PR comment"
+ echo "::error::gh is not available; unable to publish required CI measurement PR comment"
can_render_pr_comment=false
fi
if ! ensure_ci_measurement_tool node nodejs; then
- echo "::notice::node is not available; skipping CI measurement PR comment"
+ echo "::error::node is not available; unable to publish required CI measurement PR comment"
can_render_pr_comment=false
fi
if ! command -v jq >/dev/null 2>&1; then
if ensure_ci_measurement_tool jq jq; then
:
else
- echo "::notice::jq is not available; skipping CI measurement PR comment"
+ echo "::error::jq is not available; unable to publish required CI measurement PR comment"
can_render_pr_comment=false
fi
fi
if [ -z "${GH_TOKEN:-${GITHUB_TOKEN:-}}" ]; then
- echo "::notice::GH_TOKEN/GITHUB_TOKEN is not set; skipping CI measurement PR comment"
+ echo "::error::GH_TOKEN/GITHUB_TOKEN is not set; unable to publish required CI measurement PR comment"
can_render_pr_comment=false
fi
event_path="${GITHUB_EVENT_PATH:-}"
- pr_number="${CI_MEASUREMENT_PR_COMMENT_PR_NUMBER:-}"
+ pr_number=""
if [ "$can_render_pr_comment" = "true" ] && [ -n "$event_path" ] && [ -f "$event_path" ]; then
- pr_number="${pr_number:-$(jq -r '.pull_request.number // empty' "$event_path")}"
+ pr_number="$(jq -r '.pull_request.number // empty' "$event_path")"
fi
if [ "$can_render_pr_comment" = "true" ] && [ -z "$pr_number" ]; then
- echo "::notice::pull request number is unavailable; skipping CI measurement PR comment"
+ echo "::error::pull request number is unavailable; unable to publish required CI measurement PR comment"
can_render_pr_comment=false
fi
+ if [ "$can_render_pr_comment" != "true" ]; then
+ exit 1
+ fi
+
if [ "$can_render_pr_comment" = "true" ]; then
repo="${GITHUB_REPOSITORY:?GITHUB_REPOSITORY not set}"
comment_tmp_dir="$(mktemp -d)"
@@ -4810,7 +4813,7 @@ jobs:
retention-days: 30
timeout-minutes: 30
concurrency:
- group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && inputs.measurement_pr_number != '' && format('measurement-pr-{0}-run-{1}', inputs.measurement_pr_number, github.run_id) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code'))) }}-devenv-perf"
+ group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-devenv-perf"
cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
nix-closure-sizes:
if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
@@ -5644,7 +5647,6 @@ jobs:
CI_MEASUREMENT_PR_COMMENT_MAX_ROWS: '8'
CI_MEASUREMENT_PR_COMMENT_MAX_HISTORY: '20'
CI_MEASUREMENT_PR_COMMENT_ASSET_BRANCH: ci-measurement-assets
- CI_MEASUREMENT_PR_COMMENT_PR_NUMBER: ${{ inputs.measurement_pr_number }}
GH_TOKEN: ${{ github.token }}
run: |
set -euo pipefail
@@ -6133,7 +6135,12 @@ jobs:
} >>"$GITHUB_STEP_SUMMARY"
fi
- if [ "${CI_MEASUREMENT_PR_COMMENT_ENABLED:-false}" = "true" ] && { [ "${GITHUB_EVENT_NAME:-}" = "pull_request" ] || [ -n "${CI_MEASUREMENT_PR_COMMENT_PR_NUMBER:-}" ]; }; then
+ if [ "${CI_MEASUREMENT_PR_COMMENT_ENABLED:-false}" = "true" ]; then
+ if [ "${GITHUB_EVENT_NAME:-}" != "pull_request" ]; then
+ echo "::notice::CI measurement PR comments are produced only by pull_request workflows; skipping comment for event ${GITHUB_EVENT_NAME:-unknown}"
+ exit 0
+ fi
+
can_render_pr_comment=true
ensure_ci_measurement_tool() {
@@ -6152,36 +6159,40 @@ jobs:
}
if ! ensure_ci_measurement_tool gh gh; then
- echo "::notice::gh is not available; skipping CI measurement PR comment"
+ echo "::error::gh is not available; unable to publish required CI measurement PR comment"
can_render_pr_comment=false
fi
if ! ensure_ci_measurement_tool node nodejs; then
- echo "::notice::node is not available; skipping CI measurement PR comment"
+ echo "::error::node is not available; unable to publish required CI measurement PR comment"
can_render_pr_comment=false
fi
if ! command -v jq >/dev/null 2>&1; then
if ensure_ci_measurement_tool jq jq; then
:
else
- echo "::notice::jq is not available; skipping CI measurement PR comment"
+ echo "::error::jq is not available; unable to publish required CI measurement PR comment"
can_render_pr_comment=false
fi
fi
if [ -z "${GH_TOKEN:-${GITHUB_TOKEN:-}}" ]; then
- echo "::notice::GH_TOKEN/GITHUB_TOKEN is not set; skipping CI measurement PR comment"
+ echo "::error::GH_TOKEN/GITHUB_TOKEN is not set; unable to publish required CI measurement PR comment"
can_render_pr_comment=false
fi
event_path="${GITHUB_EVENT_PATH:-}"
- pr_number="${CI_MEASUREMENT_PR_COMMENT_PR_NUMBER:-}"
+ pr_number=""
if [ "$can_render_pr_comment" = "true" ] && [ -n "$event_path" ] && [ -f "$event_path" ]; then
- pr_number="${pr_number:-$(jq -r '.pull_request.number // empty' "$event_path")}"
+ pr_number="$(jq -r '.pull_request.number // empty' "$event_path")"
fi
if [ "$can_render_pr_comment" = "true" ] && [ -z "$pr_number" ]; then
- echo "::notice::pull request number is unavailable; skipping CI measurement PR comment"
+ echo "::error::pull request number is unavailable; unable to publish required CI measurement PR comment"
can_render_pr_comment=false
fi
+ if [ "$can_render_pr_comment" != "true" ]; then
+ exit 1
+ fi
+
if [ "$can_render_pr_comment" = "true" ]; then
repo="${GITHUB_REPOSITORY:?GITHUB_REPOSITORY not set}"
comment_tmp_dir="$(mktemp -d)"
@@ -7257,7 +7268,7 @@ jobs:
echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:"
echo " https://github.com/overengineeringstudio/effect-utils/issues/201"
concurrency:
- group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && inputs.measurement_pr_number != '' && format('measurement-pr-{0}-run-{1}', inputs.measurement_pr_number, github.run_id) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code'))) }}-nix-closure-sizes"
+ group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-nix-closure-sizes"
cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
source-shape:
runs-on:
@@ -7685,7 +7696,6 @@ jobs:
CI_MEASUREMENT_PR_COMMENT_MAX_ROWS: '12'
CI_MEASUREMENT_PR_COMMENT_MAX_HISTORY: '20'
CI_MEASUREMENT_PR_COMMENT_ASSET_BRANCH: ci-measurement-assets
- CI_MEASUREMENT_PR_COMMENT_PR_NUMBER: ${{ inputs.measurement_pr_number }}
GH_TOKEN: ${{ github.token }}
run: |
set -euo pipefail
@@ -8174,7 +8184,12 @@ jobs:
} >>"$GITHUB_STEP_SUMMARY"
fi
- if [ "${CI_MEASUREMENT_PR_COMMENT_ENABLED:-false}" = "true" ] && { [ "${GITHUB_EVENT_NAME:-}" = "pull_request" ] || [ -n "${CI_MEASUREMENT_PR_COMMENT_PR_NUMBER:-}" ]; }; then
+ if [ "${CI_MEASUREMENT_PR_COMMENT_ENABLED:-false}" = "true" ]; then
+ if [ "${GITHUB_EVENT_NAME:-}" != "pull_request" ]; then
+ echo "::notice::CI measurement PR comments are produced only by pull_request workflows; skipping comment for event ${GITHUB_EVENT_NAME:-unknown}"
+ exit 0
+ fi
+
can_render_pr_comment=true
ensure_ci_measurement_tool() {
@@ -8193,36 +8208,40 @@ jobs:
}
if ! ensure_ci_measurement_tool gh gh; then
- echo "::notice::gh is not available; skipping CI measurement PR comment"
+ echo "::error::gh is not available; unable to publish required CI measurement PR comment"
can_render_pr_comment=false
fi
if ! ensure_ci_measurement_tool node nodejs; then
- echo "::notice::node is not available; skipping CI measurement PR comment"
+ echo "::error::node is not available; unable to publish required CI measurement PR comment"
can_render_pr_comment=false
fi
if ! command -v jq >/dev/null 2>&1; then
if ensure_ci_measurement_tool jq jq; then
:
else
- echo "::notice::jq is not available; skipping CI measurement PR comment"
+ echo "::error::jq is not available; unable to publish required CI measurement PR comment"
can_render_pr_comment=false
fi
fi
if [ -z "${GH_TOKEN:-${GITHUB_TOKEN:-}}" ]; then
- echo "::notice::GH_TOKEN/GITHUB_TOKEN is not set; skipping CI measurement PR comment"
+ echo "::error::GH_TOKEN/GITHUB_TOKEN is not set; unable to publish required CI measurement PR comment"
can_render_pr_comment=false
fi
event_path="${GITHUB_EVENT_PATH:-}"
- pr_number="${CI_MEASUREMENT_PR_COMMENT_PR_NUMBER:-}"
+ pr_number=""
if [ "$can_render_pr_comment" = "true" ] && [ -n "$event_path" ] && [ -f "$event_path" ]; then
- pr_number="${pr_number:-$(jq -r '.pull_request.number // empty' "$event_path")}"
+ pr_number="$(jq -r '.pull_request.number // empty' "$event_path")"
fi
if [ "$can_render_pr_comment" = "true" ] && [ -z "$pr_number" ]; then
- echo "::notice::pull request number is unavailable; skipping CI measurement PR comment"
+ echo "::error::pull request number is unavailable; unable to publish required CI measurement PR comment"
can_render_pr_comment=false
fi
+ if [ "$can_render_pr_comment" != "true" ]; then
+ exit 1
+ fi
+
if [ "$can_render_pr_comment" = "true" ]; then
repo="${GITHUB_REPOSITORY:?GITHUB_REPOSITORY not set}"
comment_tmp_dir="$(mktemp -d)"
@@ -9241,7 +9260,7 @@ jobs:
if-no-files-found: error
retention-days: 30
concurrency:
- group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && inputs.measurement_pr_number != '' && format('measurement-pr-{0}-run-{1}', inputs.measurement_pr_number, github.run_id) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code'))) }}-source-shape"
+ group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-source-shape"
cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
test-integration-notion:
if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
@@ -9593,7 +9612,7 @@ jobs:
echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:"
echo " https://github.com/overengineeringstudio/effect-utils/issues/201"
concurrency:
- group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && inputs.measurement_pr_number != '' && format('measurement-pr-{0}-run-{1}', inputs.measurement_pr_number, github.run_id) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code'))) }}-test-integration-notion"
+ group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-test-integration-notion"
cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
deploy-storybooks:
if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
@@ -10421,7 +10440,7 @@ jobs:
echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:"
echo " https://github.com/overengineeringstudio/effect-utils/issues/201"
concurrency:
- group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && inputs.measurement_pr_number != '' && format('measurement-pr-{0}-run-{1}', inputs.measurement_pr_number, github.run_id) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code'))) }}-deploy-storybooks"
+ group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-deploy-storybooks"
cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
notify-alignment:
runs-on:
@@ -10452,5 +10471,5 @@ jobs:
--data "$payload"
shell: bash
concurrency:
- group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && inputs.measurement_pr_number != '' && format('measurement-pr-{0}-run-{1}', inputs.measurement_pr_number, github.run_id) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code'))) }}-notify-alignment"
+ group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-notify-alignment"
cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index 79c74c19c..da7216dec 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -291,13 +291,6 @@ export const ciMeasurementBaselineWorkflowDispatchInputs = {
default: '',
type: 'string',
},
- measurement_pr_number: {
- description:
- 'Optional pull request number to update with CI measurement comments during manual measurement runs.',
- required: false,
- default: '',
- type: 'string',
- },
} as const
export const ciMeasurementBaselineBackfillPredicate =
@@ -1856,7 +1849,6 @@ export const compareCiMeasurementsStep = (opts?: CiMeasurementsComparisonStepOpt
CI_MEASUREMENT_PR_COMMENT_MAX_HISTORY: String(opts?.prComment?.maxHistory ?? 20),
CI_MEASUREMENT_PR_COMMENT_ASSET_BRANCH:
opts?.prComment?.assetBranch ?? 'ci-measurement-assets',
- CI_MEASUREMENT_PR_COMMENT_PR_NUMBER: '${{ inputs.measurement_pr_number }}',
...(opts?.prComment?.publicAssetCommand === undefined
? {}
: { CI_MEASUREMENT_PR_COMMENT_PUBLIC_ASSET_COMMAND: opts.prComment.publicAssetCommand }),
@@ -2351,7 +2343,12 @@ if [ -n "${dollar}{GITHUB_STEP_SUMMARY:-}" ]; then
} >>"$GITHUB_STEP_SUMMARY"
fi
-if [ "${dollar}{CI_MEASUREMENT_PR_COMMENT_ENABLED:-false}" = "true" ] && { [ "${dollar}{GITHUB_EVENT_NAME:-}" = "pull_request" ] || [ -n "${dollar}{CI_MEASUREMENT_PR_COMMENT_PR_NUMBER:-}" ]; }; then
+if [ "${dollar}{CI_MEASUREMENT_PR_COMMENT_ENABLED:-false}" = "true" ]; then
+ if [ "${dollar}{GITHUB_EVENT_NAME:-}" != "pull_request" ]; then
+ echo "::notice::CI measurement PR comments are produced only by pull_request workflows; skipping comment for event ${dollar}{GITHUB_EVENT_NAME:-unknown}"
+ exit 0
+ fi
+
can_render_pr_comment=true
ensure_ci_measurement_tool() {
@@ -2370,36 +2367,40 @@ if [ "${dollar}{CI_MEASUREMENT_PR_COMMENT_ENABLED:-false}" = "true" ] && { [ "${
}
if ! ensure_ci_measurement_tool gh gh; then
- echo "::notice::gh is not available; skipping CI measurement PR comment"
+ echo "::error::gh is not available; unable to publish required CI measurement PR comment"
can_render_pr_comment=false
fi
if ! ensure_ci_measurement_tool node nodejs; then
- echo "::notice::node is not available; skipping CI measurement PR comment"
+ echo "::error::node is not available; unable to publish required CI measurement PR comment"
can_render_pr_comment=false
fi
if ! command -v jq >/dev/null 2>&1; then
if ensure_ci_measurement_tool jq jq; then
:
else
- echo "::notice::jq is not available; skipping CI measurement PR comment"
+ echo "::error::jq is not available; unable to publish required CI measurement PR comment"
can_render_pr_comment=false
fi
fi
if [ -z "${dollar}{GH_TOKEN:-${dollar}{GITHUB_TOKEN:-}}" ]; then
- echo "::notice::GH_TOKEN/GITHUB_TOKEN is not set; skipping CI measurement PR comment"
+ echo "::error::GH_TOKEN/GITHUB_TOKEN is not set; unable to publish required CI measurement PR comment"
can_render_pr_comment=false
fi
event_path="${dollar}{GITHUB_EVENT_PATH:-}"
- pr_number="${dollar}{CI_MEASUREMENT_PR_COMMENT_PR_NUMBER:-}"
+ pr_number=""
if [ "$can_render_pr_comment" = "true" ] && [ -n "$event_path" ] && [ -f "$event_path" ]; then
- pr_number="${dollar}{pr_number:-$(jq -r '.pull_request.number // empty' "$event_path")}"
+ pr_number="$(jq -r '.pull_request.number // empty' "$event_path")"
fi
if [ "$can_render_pr_comment" = "true" ] && [ -z "$pr_number" ]; then
- echo "::notice::pull request number is unavailable; skipping CI measurement PR comment"
+ echo "::error::pull request number is unavailable; unable to publish required CI measurement PR comment"
can_render_pr_comment=false
fi
+ if [ "$can_render_pr_comment" != "true" ]; then
+ exit 1
+ fi
+
if [ "$can_render_pr_comment" = "true" ]; then
repo="${dollar}{GITHUB_REPOSITORY:?GITHUB_REPOSITORY not set}"
comment_tmp_dir="$(mktemp -d)"
diff --git a/genie/ci-workflow/shared.ts b/genie/ci-workflow/shared.ts
index 2ef52ec45..5c8c736cd 100644
--- a/genie/ci-workflow/shared.ts
+++ b/genie/ci-workflow/shared.ts
@@ -65,10 +65,8 @@ export const standardCIEnv = {
* cancel in-progress runs so several historical refs can be backfilled without
* canceling each other.
*
- * Manual PR measurement refreshes are intentionally keyed by run id. They are
- * used as operational probes to update managed PR comments, and stale queued
- * GitHub workflow_dispatch runs can otherwise hold the shared branch bucket
- * indefinitely when GitHub refuses cancellation.
+ * Manual dispatches are intentionally keyed by run id. They are operator probes
+ * and baseline/debug tools, not the authoritative PR-comment path.
*
* Merge-queue label churn is different: only the mq:ci-admitted label event is
* allowed to materialize full PR CI. Other label events do not change the
@@ -77,7 +75,7 @@ export const standardCIEnv = {
export const ciJobConcurrency = (jobId: string) =>
({
group:
- "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && inputs.measurement_pr_number != '' && format('measurement-pr-{0}-run-{1}', inputs.measurement_pr_number, github.run_id) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code'))) }}" +
+ "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}" +
`-${jobId}`,
'cancel-in-progress':
"${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}",
@@ -93,7 +91,7 @@ const withDefaultJobConcurrency = (jobs: GitHubWorkflowArgs['jobs']): GitHubWork
export const ciWorkflowConcurrency = {
group:
- "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && inputs.measurement_pr_number != '' && format('measurement-pr-{0}-run-{1}', inputs.measurement_pr_number, github.run_id) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code'))) }}",
+ "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}",
'cancel-in-progress':
"${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}",
} as const
diff --git a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
index d95623a55..108925900 100644
--- a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
+++ b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
@@ -490,8 +490,14 @@ describe('ci workflow devenv perf helpers', () => {
expect(ciWorkflowSource).toContain('seedRunIds?: readonly string[]')
expect(ciWorkflowSource).toContain('baselineSeedRuns?: readonly CiMeasurementBaselineSeedRun[]')
expect(ciWorkflowSource).toContain('baselineSeedRunIds?: readonly string[]')
- expect(ciWorkflowSource).toContain('measurement_pr_number:')
- expect(ciWorkflowSource).toContain("CI_MEASUREMENT_PR_COMMENT_PR_NUMBER: '${{ inputs.measurement_pr_number }}'")
+ expect(ciWorkflowSource).not.toContain('measurement_pr_number:')
+ expect(ciWorkflowSource).not.toContain('CI_MEASUREMENT_PR_COMMENT_PR_NUMBER')
+ expect(ciWorkflowSource).toContain(
+ 'CI measurement PR comments are produced only by pull_request workflows',
+ )
+ expect(ciWorkflowSource).toContain(
+ 'unable to publish required CI measurement PR comment',
+ )
expect(ciWorkflowSource).toContain('seedRuns: ($seedRuns[0] // [])')
expect(ciWorkflowSource).toContain('baselineProvenance: ($baselineProvenance[0] // null)')
expect(ciWorkflowSource).toContain(
@@ -514,7 +520,8 @@ describe('ci workflow devenv perf helpers', () => {
expect(generatedCiWorkflowYamlSource).toContain('concurrency:\n group:')
expect(generatedCiWorkflowYamlSource).toContain('}}-typecheck')
expect(generatedCiWorkflowYamlSource).toContain("format('measurement-baseline-{0}'")
- expect(generatedCiWorkflowYamlSource).toContain("format('measurement-pr-{0}-run-{1}'")
+ expect(generatedCiWorkflowYamlSource).not.toContain("format('measurement-pr-{0}-run-{1}'")
+ expect(generatedCiWorkflowYamlSource).not.toContain('inputs.measurement_pr_number')
expect(generatedCiWorkflowYamlSource).toContain("format('manual-run-{0}', github.run_id)")
expect(generatedCiWorkflowYamlSource).toContain("format('label-{0}', github.event.label.name)")
expect(generatedCiWorkflowYamlSource).toContain(
From 234bd49c693fd06434b3cf15c0136f202600d331 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Wed, 20 May 2026 10:08:23 +0200
Subject: [PATCH 66/81] Probe PR CI synchronize trigger
From 8b77e4371d534bbf02d4893744f23f560006a51c Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Wed, 20 May 2026 11:15:15 +0200
Subject: [PATCH 67/81] Consolidate CI measurement reporting
---
.github/workflows/ci.yml | 5743 ++++++++++++-----------------
.github/workflows/ci.yml.genie.ts | 101 +-
genie/ci-workflow/measurements.ts | 3 +-
3 files changed, 2511 insertions(+), 3336 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index ae1523ef9..c528a3d2f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -3244,12 +3244,11 @@ jobs:
CI_MEASUREMENT_BASELINE_DIR: tmp/devenv-perf-ci/baseline
CI_MEASUREMENT_COMPARISON_FILE: tmp/devenv-perf-ci/measurement-comparison.json
CI_MEASUREMENT_REGRESSION_MODE: warn
- CI_MEASUREMENT_PR_COMMENT_ENABLED: 'true'
+ CI_MEASUREMENT_PR_COMMENT_ENABLED: 'false'
CI_MEASUREMENT_PR_COMMENT_TITLE: Devenv Performance
CI_MEASUREMENT_PR_COMMENT_MAX_ROWS: '8'
CI_MEASUREMENT_PR_COMMENT_MAX_HISTORY: '20'
CI_MEASUREMENT_PR_COMMENT_ASSET_BRANCH: ci-measurement-assets
- GH_TOKEN: ${{ github.token }}
run: |
set -euo pipefail
@@ -3737,2080 +3736,1021 @@ jobs:
} >>"$GITHUB_STEP_SUMMARY"
fi
- if [ "${CI_MEASUREMENT_PR_COMMENT_ENABLED:-false}" = "true" ]; then
- if [ "${GITHUB_EVENT_NAME:-}" != "pull_request" ]; then
- echo "::notice::CI measurement PR comments are produced only by pull_request workflows; skipping comment for event ${GITHUB_EVENT_NAME:-unknown}"
- exit 0
- fi
-
- can_render_pr_comment=true
- ensure_ci_measurement_tool() {
- tool_name="$1"
- nix_attr="$2"
- if command -v "$tool_name" >/dev/null 2>&1; then
- return 0
- fi
- if ! command -v nix >/dev/null 2>&1; then
- return 1
- fi
- if tool_out="$(nix build --no-link --print-out-paths "nixpkgs#$nix_attr" 2>/dev/null)"; then
- export PATH="$tool_out/bin:$PATH"
- fi
- command -v "$tool_name" >/dev/null 2>&1
- }
- if ! ensure_ci_measurement_tool gh gh; then
- echo "::error::gh is not available; unable to publish required CI measurement PR comment"
- can_render_pr_comment=false
- fi
- if ! ensure_ci_measurement_tool node nodejs; then
- echo "::error::node is not available; unable to publish required CI measurement PR comment"
- can_render_pr_comment=false
- fi
- if ! command -v jq >/dev/null 2>&1; then
- if ensure_ci_measurement_tool jq jq; then
- :
- else
- echo "::error::jq is not available; unable to publish required CI measurement PR comment"
- can_render_pr_comment=false
- fi
- fi
- if [ -z "${GH_TOKEN:-${GITHUB_TOKEN:-}}" ]; then
- echo "::error::GH_TOKEN/GITHUB_TOKEN is not set; unable to publish required CI measurement PR comment"
- can_render_pr_comment=false
- fi
+ if [ "$exit_code" -ne 0 ]; then
+ exit "$exit_code"
+ fi
- event_path="${GITHUB_EVENT_PATH:-}"
- pr_number=""
- if [ "$can_render_pr_comment" = "true" ] && [ -n "$event_path" ] && [ -f "$event_path" ]; then
- pr_number="$(jq -r '.pull_request.number // empty' "$event_path")"
- fi
- if [ "$can_render_pr_comment" = "true" ] && [ -z "$pr_number" ]; then
- echo "::error::pull request number is unavailable; unable to publish required CI measurement PR comment"
- can_render_pr_comment=false
- fi
+ - name: Upload devenv perf artifacts
+ if: always()
+ uses: actions/upload-artifact@v4
+ with:
+ name: devenv-perf
+ path: |
+ tmp/devenv-perf-ci
+ !tmp/devenv-perf-ci/baseline/**
+ if-no-files-found: error
+ retention-days: 30
+ timeout-minutes: 30
+ concurrency:
+ group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-devenv-perf"
+ cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
+ nix-closure-sizes:
+ if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
+ runs-on:
+ [namespace-profile-linux-x86-64, 'namespace-features:github.run-id=${{ github.run_id }}']
+ timeout-minutes: 30
+ defaults:
+ run:
+ shell: bash
+ permissions:
+ actions: read
+ contents: write
+ issues: write
+ pull-requests: write
+ env:
+ CI_MEASUREMENT_SUBJECT_REF: ${{ inputs.measurement_baseline_ref || github.event.pull_request.head.ref || github.ref }}
+ CI_MEASUREMENT_SUBJECT_SHA: ${{ inputs.measurement_baseline_ref || github.event.pull_request.head.sha || github.sha }}
+ CI_MEASUREMENT_SUBJECT_LABEL: ${{ inputs.measurement_baseline_label }}
+ CI_MEASUREMENT_ALLOW_PROBE_FAILURES: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && '1' || '' }}
+ steps:
+ - uses: actions/checkout@v6
+ - name: Checkout CI measurement baseline ref
+ if: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' }}
+ uses: actions/checkout@v6
+ with:
+ ref: ${{ inputs.measurement_baseline_ref }}
+ - name: Install Nix
+ uses: DeterminateSystems/determinate-nix-action@v3
+ with:
+ extra-conf: |
+ experimental-features = nix-command flakes
+ accept-flake-config = true
+ extra-substituters = https://devenv.cachix.org
+ extra-trusted-public-keys = devenv.cachix.org-1:w1cLUi8dv3hnoSPGAuibQv+f9TZLr6cv/Hm9XgU50cw=
+ access-tokens = github.com=${{ github.token }}
+ summarize: true
+ - name: Provide cachix CLI from nixpkgs
+ shell: bash
+ run: |
+ set -euo pipefail
+ out=$(nix build --no-link --print-out-paths nixpkgs#cachix)
+ echo "$out/bin" >> "$GITHUB_PATH"
+ - name: Enable Cachix cache
+ uses: cachix/cachix-action@v17
+ with:
+ name: overeng-effect-utils
+ authToken: ${{ secrets.CACHIX_AUTH_TOKEN }}
+ - name: Use pinned devenv from lock
+ run: |
+ DEVENV_REV=$(jq -r .nodes.devenv.locked.rev devenv.lock)
+ if [ -z "$DEVENV_REV" ] || [ "$DEVENV_REV" = "null" ]; then
+ echo '::error::devenv.lock missing .nodes.devenv.locked.rev'
+ exit 1
+ fi
+ echo "DEVENV_REV=$DEVENV_REV" >> "$GITHUB_ENV"
+ echo "Pinned devenv rev: $DEVENV_REV"
+ shell: bash
+ - name: Isolate pnpm state
+ shell: bash
+ run: |
+ echo "PNPM_STORE_DIR=${{ runner.temp }}/pnpm-store/${{ github.job }}" >> "$GITHUB_ENV"
+ echo "PNPM_HOME=${{ github.workspace }}/.pnpm-home" >> "$GITHUB_ENV"
+ - id: restore-pnpm-state
+ name: Restore pnpm state
+ uses: actions/cache/restore@v4
+ with:
+ path: |
+ ${{ github.workspace }}/.pnpm-home
+ ${{ runner.temp }}/pnpm-store/${{ github.job }}
+ key: "pnpm-state-v1-${{ runner.os }}-${{ runner.arch }}-${{ hashFiles('**/pnpm-lock.yaml') }}"
+ - name: Resolve devenv
+ run: |
+ DEVENV_REV=$(jq -r .nodes.devenv.locked.rev devenv.lock)
+ if [ -z "$DEVENV_REV" ] || [ "$DEVENV_REV" = "null" ]; then
+ echo '::error::devenv.lock missing .nodes.devenv.locked.rev'
+ exit 1
+ fi
- if [ "$can_render_pr_comment" != "true" ]; then
- exit 1
- fi
+ resolve_devenv() {
+ nix build \
+ --accept-flake-config \
+ --option extra-substituters https://devenv.cachix.org \
+ --option extra-trusted-public-keys devenv.cachix.org-1:w1cLUi8dv3hnoSPGAuibQv+f9TZLr6cv/Hm9XgU50cw= \
+ --no-link \
+ --print-out-paths \
+ "github:cachix/devenv/$DEVENV_REV#devenv"
+ }
- if [ "$can_render_pr_comment" = "true" ]; then
- repo="${GITHUB_REPOSITORY:?GITHUB_REPOSITORY not set}"
- comment_tmp_dir="$(mktemp -d)"
- comments_json="$comment_tmp_dir/comments.json"
- comment_body="$comment_tmp_dir/comment.md"
- comment_id_file="$comment_tmp_dir/comment-id.txt"
- chart_file="$comment_tmp_dir/perf-change-vs-baseline.svg"
- chart_dark_file="$comment_tmp_dir/perf-change-vs-baseline-dark.svg"
- chart_png_file="$comment_tmp_dir/perf-change-vs-baseline.png"
- chart_dark_png_file="$comment_tmp_dir/perf-change-vs-baseline-dark.png"
- renderer_script="$comment_tmp_dir/render-ci-measurement-comment.mjs"
+ # Temporary: capture diagnostics dir for #272 root-cause analysis.
+ DIAG_ROOT="${RUNNER_TEMP:-/tmp}/nix-store-diagnostics-${GITHUB_JOB:-job}-${RUNNER_OS:-unknown}-${GITHUB_RUN_ATTEMPT:-0}"
+ mkdir -p "$DIAG_ROOT"
+ echo "NIX_STORE_DIAGNOSTICS_DIR=$DIAG_ROOT" >> "$GITHUB_ENV"
- if ! gh api "repos/$repo/issues/$pr_number/comments" --paginate >"$comments_json"; then
- echo "::notice::unable to list PR comments; skipping CI measurement PR comment"
- can_render_pr_comment=false
- fi
+ {
+ echo "timestamp_utc=$(date -u +%Y-%m-%dT%H:%M:%SZ)"
+ echo "runner_name=${RUNNER_NAME:-unknown}"
+ echo "runner_os=${RUNNER_OS:-unknown}"
+ echo "runner_arch=${RUNNER_ARCH:-unknown}"
+ echo "github_job=${GITHUB_JOB:-unknown}"
+ echo "github_run_id=${GITHUB_RUN_ID:-unknown}"
+ echo "nix_user_conf_files=${NIX_USER_CONF_FILES:-}"
+ nix --version || true
+ } > "$DIAG_ROOT/environment.txt" 2>&1
- if [ "$can_render_pr_comment" = "true" ]; then
- asset_branch="${CI_MEASUREMENT_PR_COMMENT_ASSET_BRANCH:-ci-measurement-assets}"
- asset_title="$(printf '%s' "${CI_MEASUREMENT_PR_COMMENT_TITLE:-ci-measurements}" | tr '[:upper:]' '[:lower:]' | sed -E 's/[^a-z0-9]+/-/g; s/^-+//; s/-+$//')"
- if [ -z "$asset_title" ]; then
- asset_title="ci-measurements"
- fi
- asset_head_sha="${CI_MEASUREMENT_SUBJECT_SHA:-${GITHUB_HEAD_SHA:-${GITHUB_SHA:-unknown}}}"
- asset_run_id="${GITHUB_RUN_ID:-local}"
- asset_run_attempt="${GITHUB_RUN_ATTEMPT:-0}"
- asset_svg_path="ci-measurements/pr-$pr_number/${asset_head_sha}/run-${asset_run_id}-attempt-${asset_run_attempt}/${asset_title}.svg"
- asset_png_path="ci-measurements/pr-$pr_number/${asset_head_sha}/run-${asset_run_id}-attempt-${asset_run_attempt}/${asset_title}.png"
- asset_dark_png_path="ci-measurements/pr-$pr_number/${asset_head_sha}/run-${asset_run_id}-attempt-${asset_run_attempt}/${asset_title}-dark.png"
- public_asset_command="${CI_MEASUREMENT_PR_COMMENT_PUBLIC_ASSET_COMMAND:-}"
- repo_private="$(gh api "repos/$repo" --jq '.private // false' 2>/dev/null || printf 'true')"
- require_public_asset=false
- if [ "$repo_private" = "true" ]; then
- require_public_asset=true
- fi
- if [ "${GITHUB_SERVER_URL:-https://github.com}" = "https://github.com" ]; then
- github_raw_chart_url="https://raw.githubusercontent.com/$repo/$asset_branch/$asset_png_path"
- github_raw_chart_dark_url="https://raw.githubusercontent.com/$repo/$asset_branch/$asset_dark_png_path"
- github_raw_chart_source_url="https://raw.githubusercontent.com/$repo/$asset_branch/$asset_svg_path"
- else
- github_raw_chart_url="${GITHUB_SERVER_URL:-https://github.com}/$repo/raw/$asset_branch/$asset_png_path"
- github_raw_chart_dark_url="${GITHUB_SERVER_URL:-https://github.com}/$repo/raw/$asset_branch/$asset_dark_png_path"
- github_raw_chart_source_url="${GITHUB_SERVER_URL:-https://github.com}/$repo/raw/$asset_branch/$asset_svg_path"
- fi
- if [ "$repo_private" = "true" ]; then
- chart_url=""
- chart_dark_url=""
- chart_source_url=""
- else
- chart_url="$github_raw_chart_url"
- chart_dark_url="$github_raw_chart_dark_url"
- chart_source_url="$github_raw_chart_source_url"
- fi
- export CI_MEASUREMENT_PR_COMMENT_CHART_URL="$chart_url"
- export CI_MEASUREMENT_PR_COMMENT_CHART_DARK_URL="$chart_dark_url"
- export CI_MEASUREMENT_PR_COMMENT_CHART_SOURCE_URL="$chart_source_url"
+ if ! DEVENV_OUT=$(resolve_devenv 2> >(tee "$DIAG_ROOT/resolve-devenv.log" >&2)); then
+ echo "::error::resolve_devenv failed. Last 30 lines of log:"
+ tail -30 "$DIAG_ROOT/resolve-devenv.log" || true
+ exit 1
+ fi
+ DEVENV_BIN="$DEVENV_OUT/bin/devenv"
- cat > "$renderer_script" <<'EOF'
- import { readFileSync, writeFileSync } from 'node:fs'
+ # Fast validity check on the devenv store path (~1-2s vs ~25s for devenv info).
+ if ! nix-store --check-validity "$DEVENV_OUT" 2>/dev/null; then
+ echo "::warning::devenv store path invalid, repairing targeted path..."
+ nix-store --repair-path "$DEVENV_OUT" > "$DIAG_ROOT/nix-store-verify-repair.log" 2>&1 || true
+ rm -rf "${XDG_CACHE_HOME:-$HOME/.cache}"/nix/eval-cache-* ~/.cache/nix/eval-cache-*
+ if ! DEVENV_OUT=$(resolve_devenv 2> >(tee "$DIAG_ROOT/resolve-devenv-post-repair.log" >&2)); then
+ echo "::error::resolve_devenv failed after repair. Last 30 lines of log:"
+ tail -30 "$DIAG_ROOT/resolve-devenv-post-repair.log" || true
+ exit 1
+ fi
+ DEVENV_BIN="$DEVENV_OUT/bin/devenv"
+ fi
- const [comparisonPath, commentsPath, bodyPath, commentIdPath, chartPath, chartDarkPath] = process.argv.slice(2)
- const title = process.env.CI_MEASUREMENT_PR_COMMENT_TITLE || 'CI Measurements'
- const maxRows = Number.parseInt(process.env.CI_MEASUREMENT_PR_COMMENT_MAX_ROWS || '10', 10)
- const maxHistory = Number.parseInt(process.env.CI_MEASUREMENT_PR_COMMENT_MAX_HISTORY || '20', 10)
- const repo = process.env.GITHUB_REPOSITORY || 'unknown'
- const runId = process.env.GITHUB_RUN_ID || ''
- const runAttempt = process.env.GITHUB_RUN_ATTEMPT || ''
- const sha = process.env.GITHUB_SHA || ''
- const headSha = process.env.CI_MEASUREMENT_SUBJECT_SHA || process.env.GITHUB_HEAD_SHA || sha
- const serverUrl = process.env.GITHUB_SERVER_URL || 'https://github.com'
- const workflow = process.env.GITHUB_WORKFLOW || 'CI'
- const job = process.env.GITHUB_JOB || ''
- const chartUrl = process.env.CI_MEASUREMENT_PR_COMMENT_CHART_URL || ''
- const chartDarkUrl = process.env.CI_MEASUREMENT_PR_COMMENT_CHART_DARK_URL || ''
- const chartSourceUrl = process.env.CI_MEASUREMENT_PR_COMMENT_CHART_SOURCE_URL || ''
-
- const markerScope = (process.env.CI_MEASUREMENT_PR_COMMENT_MARKER || title)
- .toLowerCase()
- .replace(/[^a-z0-9]+/g, '-')
- .replace(/^-+|-+$/g, '') || 'default'
- const marker = ''
- const legacyMarker = ''
- const statePrefix = ''
- const stateTag = 'ci-measurement-comment-state'
- const schemaVersion = 1
-
- const comparison = JSON.parse(readFileSync(comparisonPath, 'utf8'))
- const comments = JSON.parse(readFileSync(commentsPath, 'utf8'))
- if (!Array.isArray(comments)) throw new Error('comments response must be an array')
+ echo "DEVENV_BIN=$DEVENV_BIN" >> "$GITHUB_ENV"
+ "$DEVENV_BIN" version | tee "$DIAG_ROOT/devenv-version.txt"
+ shell: bash
+ - name: Evict cached pnpm deps for oxlint-npm
+ shell: bash
+ run: |
+ targetRef='.#oxlint-npm'
+ entriesJson=$(mktemp)
+ if nix eval --json "$targetRef.passthru.depsBuildEntries" >"$entriesJson" 2>/dev/null; then
+ while IFS=$'\t' read -r attrName drv; do
+ [ -n "$drv" ] || continue
+ while IFS= read -r outPath; do
+ [ -n "$outPath" ] || continue
+ if nix path-info "$outPath" >/dev/null 2>&1; then
+ echo "evicting cached: $(basename "$outPath")"
+ if ! nix store delete --ignore-liveness "$outPath" >/dev/null 2>&1; then
+ echo "::error::failed to evict cached pnpm-deps output: $outPath"
+ exit 1
+ fi
+ if nix path-info "$outPath" >/dev/null 2>&1; then
+ echo "::error::cached pnpm-deps output still present after eviction: $outPath"
+ exit 1
+ fi
+ fi
+ done < <(nix-store -q --outputs "$drv" 2>/dev/null || true)
+ done < <(jq -r '.[] | [.attrName, (.drvPath // "")] | @tsv' "$entriesJson")
+ else
+ topDrv=$(nix path-info --derivation "$targetRef" 2>/dev/null || true)
+ if [ -n "$topDrv" ]; then
+ while IFS= read -r drv; do
+ [ -n "$drv" ] || continue
+ attrName=""
+ while IFS= read -r outPath; do
+ [ -n "$outPath" ] || continue
+ if nix path-info "$outPath" >/dev/null 2>&1; then
+ echo "evicting cached: $(basename "$outPath")"
+ if ! nix store delete --ignore-liveness "$outPath" >/dev/null 2>&1; then
+ echo "::error::failed to evict cached pnpm-deps output: $outPath"
+ exit 1
+ fi
+ if nix path-info "$outPath" >/dev/null 2>&1; then
+ echo "::error::cached pnpm-deps output still present after eviction: $outPath"
+ exit 1
+ fi
+ fi
+ done < <(nix-store -q --outputs "$drv" 2>/dev/null || true)
+ done < <(nix-store -qR "$topDrv" 2>/dev/null | grep "pnpm-deps-[a-z0-9-]*-v[0-9].*\.drv$" || true)
+ fi
+ fi
+ rm -f "$entriesJson"
+ - name: Force diagnostics failure (debug)
+ if: ${{ github.event_name == 'workflow_dispatch' && (inputs.debug_force_nix_diagnostics_failure == true || inputs.debug_force_nix_diagnostics_failure == 'true') }}
+ shell: bash
+ run: |
+ diag_dir="${NIX_STORE_DIAGNOSTICS_DIR:-${RUNNER_TEMP:-/tmp}/nix-store-diagnostics-missing}"
+ mkdir -p "$diag_dir"
+ cat > "$diag_dir/synthetic-signature.log" <<'EOF'
+ Failed to convert config.cachix to JSON
+ ... while evaluating the option `cachix.package`
+ error: path '/nix/store/synthetic-invalid-path' is not valid
+ EOF
+ echo "::warning::Intentional failure for diagnostics validation (#272)"
+ exit 1
+ - name: 'Download previous artifact: nix-closure-measurements'
+ shell: bash
+ env:
+ GH_TOKEN: ${{ github.token }}
+ BASELINE_ARTIFACT_NAME: nix-closure-measurements
+ BASELINE_OUTPUT_DIR: tmp/nix-closure-ci/baseline
+ BASELINE_WORKFLOW_NAME: ${{ github.workflow }}
+ BASELINE_BRANCH: ${{ github.base_ref || github.ref_name }}
+ BASELINE_SEED_RUNS_JSON: '[]'
+ BASELINE_MAX_RUNS: '20'
+ BASELINE_MAX_CANDIDATE_RUNS: '60'
+ BASELINE_REQUIRED_OBSERVATIONS_JSON: '[]'
+ run: |
+ set -euo pipefail
- const existing = comments.find((comment) => {
- if (typeof comment?.body !== 'string') return false
- return comment.body.includes(marker) ||
- (comment.body.includes(legacyMarker) && comment.body.includes('## ' + title))
- })
+ mkdir -p "$BASELINE_OUTPUT_DIR"
- const extractState = (body) => {
- if (typeof body !== 'string') return undefined
- const start = body.indexOf(statePrefix)
- if (start === -1) return undefined
- const end = body.indexOf(stateSuffix, start + statePrefix.length)
- if (end === -1) return undefined
- try {
- const parsed = JSON.parse(body.slice(start + statePrefix.length, end))
- if (parsed && parsed._tag === stateTag && Array.isArray(parsed.runs)) return parsed
- } catch {
- return undefined
- }
- return undefined
- }
+ if command -v gh >/dev/null 2>&1; then
+ GH_BIN="$(command -v gh)"
+ else
+ echo "::notice::gh is not on PATH; resolving GitHub CLI through Nix"
+ if ! GH_BIN="$(nix build --no-link --print-out-paths nixpkgs#gh 2>/dev/null)/bin/gh"; then
+ echo "::notice::unable to resolve GitHub CLI through Nix; skipping previous artifact download"
+ exit 0
+ fi
+ fi
+ echo "Using GitHub CLI: $GH_BIN"
- const formatNumber = (value) => {
- if (value === null || value === undefined || Number.isNaN(value)) return 'n/a'
- if (Number.isInteger(value)) return String(value)
- return String(Math.round(value * 1000) / 1000)
- }
+ repo="${GITHUB_REPOSITORY:?GITHUB_REPOSITORY not set}"
+ workflow="${BASELINE_WORKFLOW_NAME:-CI}"
+ branch="${BASELINE_BRANCH:-${GITHUB_BASE_REF:-${GITHUB_REF_NAME:-main}}}"
+ seed_runs_file="$BASELINE_OUTPUT_DIR/baseline-seed-runs.json"
+ required_observations_file="$BASELINE_OUTPUT_DIR/baseline-required-observations.json"
+ printf '%s' "${BASELINE_SEED_RUNS_JSON:-[]}" >"$seed_runs_file"
+ printf '%s' "${BASELINE_REQUIRED_OBSERVATIONS_JSON:-[]}" >"$required_observations_file"
+ if ! jq -e 'if type == "array" then all(.[]; type == "object" and (.runId | type == "string")) else false end' \
+ "$seed_runs_file" >/dev/null; then
+ echo "::error::BASELINE_SEED_RUNS_JSON must be an array of objects with string runId fields"
+ exit 1
+ fi
+ if ! jq -e 'if type == "array" then all(.[]; type == "object" and (.id | type == "string") and (.minSources | type == "number")) else false end' \
+ "$required_observations_file" >/dev/null; then
+ echo "::error::BASELINE_REQUIRED_OBSERVATIONS_JSON must be an array of objects with string id and numeric minSources fields"
+ exit 1
+ fi
+ seed_run_ids="$(jq -r '.[].runId' "$seed_runs_file")"
+ required_observation_count="$(jq 'length' "$required_observations_file")"
+ max_candidate_runs="${BASELINE_MAX_CANDIDATE_RUNS:-${BASELINE_MAX_RUNS:-5}}"
+ if ! [[ "$max_candidate_runs" =~ ^[0-9]+$ ]] || [ "$max_candidate_runs" -lt 1 ]; then
+ max_candidate_runs=1
+ fi
- const formatValue = (value, unit) => {
- if (value === null || value === undefined) return 'n/a'
- if (unit === 'bytes') {
- if (value >= 1073741824) return formatNumber(Math.round((value / 1073741824) * 10) / 10) + ' GiB'
- if (value >= 1048576) return formatNumber(Math.round((value / 1048576) * 10) / 10) + ' MiB'
- if (value >= 1024) return formatNumber(Math.round((value / 1024) * 10) / 10) + ' KiB'
- return formatNumber(value) + ' B'
- }
- if (unit === 'seconds') return formatNumber(value) + ' s'
- return formatNumber(value) + (unit ? ' ' + unit : '')
- }
+ candidate_runs="$(
+ "$GH_BIN" run list \
+ --repo "$repo" \
+ --workflow "$workflow" \
+ --branch "$branch" \
+ --event push \
+ --status success \
+ --json databaseId,headSha \
+ --limit "$max_candidate_runs" \
+ --jq '[.[] | select(.headSha != env.GITHUB_SHA) | .databaseId] | .[]'
+ )"
- const formatDelta = (value, unit) => {
- if (value === null || value === undefined) return 'n/a'
- const sign = value >= 0 ? '+' : '-'
- return sign + formatValue(Math.abs(value), unit)
- }
+ candidate_runs="$seed_run_ids
+ $candidate_runs"
- const formatRatio = (value) => {
- if (value === null || value === undefined) return 'n/a'
- return formatNumber(Math.round((value - 1) * 1000) / 10) + '%'
- }
+ max_runs="${BASELINE_MAX_RUNS:-5}"
+ if ! [[ "$max_runs" =~ ^[0-9]+$ ]] || [ "$max_runs" -lt 1 ]; then
+ max_runs=1
+ fi
- const formatSemanticImpact = (value) => {
- if (value === null || value === undefined || Number.isNaN(value)) return 'n/a'
- if (Math.abs(value) < 0.005) return '0.00x'
- const sign = value > 0 ? '+' : ''
- return sign + formatNumber(Math.round(value * 100) / 100) + 'x'
- }
+ write_baseline_observation_counts() {
+ local measurement_index="$BASELINE_OUTPUT_DIR/baseline-measurement-files.txt"
+ local counts_file="$BASELINE_OUTPUT_DIR/baseline-observation-counts.json"
+ find "$BASELINE_OUTPUT_DIR" \
+ -mindepth 2 \
+ -maxdepth 2 \
+ -name measurements.json \
+ -type f \
+ -print \
+ | sort >"$measurement_index" || true
- const formatRowImpact = (row) => {
- if (row.confidence === 'diagnostic' || row.gateReason === 'disabled' || row.semanticImpactKind === 'diagnostic') {
- return 'diagnostic'
- }
- return formatSemanticImpact(row.semanticImpactScore)
+ if [ -s "$measurement_index" ]; then
+ xargs -r jq -s \
+ --slurpfile required "$required_observations_file" \
+ '
+ ([.[] | (.observations // [])[]? | select(.value | type == "number") | .id] | sort | group_by(.) | map({id: .[0], sources: length})) as $counts
+ | ($required[0] // []) as $requiredRows
+ | {
+ counts: $counts,
+ required: (
+ $requiredRows
+ | map(. as $requiredRow | ($counts | map(select(.id == $requiredRow.id)) | .[0].sources // 0) as $actual | $requiredRow + {sources:$actual, satisfied:($actual >= $requiredRow.minSources)})
+ )
+ }
+ ' <"$measurement_index" >"$counts_file"
+ else
+ jq -n --slurpfile required "$required_observations_file" \
+ '{counts: [], required: (($required[0] // []) | map(. + {sources:0, satisfied:false}))}' >"$counts_file"
+ fi
}
- const formatEvidence = (row) => {
- const unit = row.observation?.unit
- if (row.comparisonMode === 'paired' && typeof row.evidenceDeltaLower === 'number' && typeof row.evidenceDeltaUpper === 'number') {
- const quantile = typeof row.pairedEvidenceQuantile === 'number'
- ? Math.round(row.pairedEvidenceQuantile * 100)
- : 25
- return (row.confidence || 'unknown')
- + '
paired n=' + (row.pairedSamples ?? 0)
- + ', ' + quantile + '-' + (100 - quantile) + '% delta '
- + formatValue(row.evidenceDeltaLower, unit)
- + ' - ' + formatValue(row.evidenceDeltaUpper, unit)
- + ''
- }
- return (row.confidence || 'unknown') + '
baseline n=' + (row.baselineSources ?? 0) + ', current samples=' + (row.currentSamples ?? 1) + ''
+ baseline_requirements_satisfied() {
+ if [ "$required_observation_count" -eq 0 ]; then
+ return 1
+ fi
+ write_baseline_observation_counts
+ jq -e '.required | all(.satisfied == true)' "$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" >/dev/null
}
- const interpretation = (row) => {
- if (row.confidence === 'low_baseline_count') return {
- label: 'Needs more baseline',
- detail: 'Not enough compatible baseline runs to make this gate trustworthy.',
- tone: 'neutral',
- color: '#94a3b8',
- }
- if (row.confidence === 'low_current_sample_count') return {
- label: 'Needs repeat',
- detail: 'Current run has too few successful measured samples.',
- tone: 'neutral',
- color: '#94a3b8',
- }
- if (row.confidence === 'low_paired_sample_count') return {
- label: 'Needs paired evidence',
- detail: 'Wall-clock gates require same-run base/head samples before they can block merges.',
- tone: 'neutral',
- color: '#94a3b8',
- }
- if (row.confidence === 'missing_paired_delta') return {
- label: 'Needs paired delta stats',
- detail: 'Wall-clock gates require per-pair delta statistics, not only paired medians.',
- tone: 'neutral',
- color: '#94a3b8',
- }
- if (row.confidence === 'paired_uncertain') return {
- label: 'Uncertain wall-clock movement',
- detail: 'The paired median moved, but the paired delta band still crosses the configured budget.',
- tone: 'neutral',
- color: '#94a3b8',
- }
- if (row.confidence === 'diagnostic') return {
- label: 'Diagnostic only',
- detail: 'Shown for investigation, but intentionally excluded from gating.',
- tone: 'diagnostic',
- color: '#a78bfa',
- }
- if (row.status === 'fail') return {
- label: 'Regression - blocks merge',
- detail: 'Worse than the configured fail threshold with enough samples.',
- tone: 'bad',
- color: '#ef4444',
- }
- if (row.status === 'warn') return {
- label: 'Regression - review',
- detail: 'Worse than the configured warning threshold.',
- tone: 'warn',
- color: '#f59e0b',
- }
- if (row.status === 'missing_baseline') return {
- label: 'No baseline yet',
- detail: 'Current value is measured, but no comparable baseline exists.',
- tone: 'neutral',
- color: '#94a3b8',
- }
- if (row.confidence === 'noise_floor') return {
- label: 'Too small to matter',
- detail: 'The absolute change is below the noise floor for this metric.',
- tone: 'neutral',
- color: '#94a3b8',
- }
- if (row.confidence === 'within_baseline_range') return {
- label: 'Historical range only',
- detail: 'Inside the full historical min/max range, but this range is not used to pass a gate.',
- tone: 'neutral',
- color: '#94a3b8',
- }
- if (row.confidence === 'within_robust_band' || row.confidence === 'within_baseline_distribution') return {
- label: 'Within noise band',
- detail: 'Current and baseline robust noise bands overlap.',
- tone: 'neutral',
- color: '#94a3b8',
- }
- if (row.direction === 'improved' && typeof row.semanticImpactScore === 'number' && row.semanticImpactScore <= -1) return {
- label: 'Meaningfully lower',
- detail: 'Lower than baseline by enough to cross the configured review threshold.',
- tone: 'good',
- color: '#10b981',
- }
- if (row.direction === 'improved') return {
- label: 'Slightly lower, ok',
- detail: 'Lower than baseline, but still inside the configured review budget.',
- tone: 'neutral',
- color: '#94a3b8',
- }
- if (row.direction === 'regressed') return {
- label: 'Slightly higher, ok',
- detail: 'Higher than baseline but still inside the configured budget.',
- tone: 'neutral',
- color: '#94a3b8',
- }
- return {
- label: 'Unchanged',
- detail: 'No meaningful movement from baseline.',
- tone: 'neutral',
- color: '#94a3b8',
- }
- }
+ run_id=""
+ artifact_name=""
+ artifact_id=""
+ downloaded_runs_file="$BASELINE_OUTPUT_DIR/baseline-runs.jsonl"
+ seen_runs_file="$BASELINE_OUTPUT_DIR/baseline-seen-runs.txt"
+ : >"$downloaded_runs_file"
+ : >"$seen_runs_file"
+ for candidate_run in $candidate_runs; do
+ if [ -z "$candidate_run" ]; then
+ continue
+ fi
+ if grep -qxF "$candidate_run" "$seen_runs_file"; then
+ continue
+ fi
+ downloaded_count="$(wc -l <"$downloaded_runs_file" | tr -d ' ')"
+ if [ "$downloaded_count" -ge "$max_runs" ]; then
+ if baseline_requirements_satisfied; then
+ break
+ fi
+ echo "::notice::downloaded $downloaded_count baseline artifact(s), but required observation counts are not satisfied yet; continuing through bounded candidate history"
+ fi
+ if [ "$(wc -l <"$seen_runs_file" | tr -d ' ')" -ge "$max_candidate_runs" ]; then
+ break
+ fi
+ printf '%s\n' "$candidate_run" >>"$seen_runs_file"
- const formatGate = (row) => {
- if (row.gateable) return 'yes'
- const reason = row.gateReason || row.confidence || 'unknown'
- return 'no
' + reason + ''
- }
+ artifact_json="$(
+ "$GH_BIN" api "repos/$repo/actions/runs/$candidate_run/artifacts" \
+ | jq --arg artifactName "$BASELINE_ARTIFACT_NAME" '.artifacts
+ | map(select(.expired == false))
+ | map(select(.name == $artifactName or (.name | startswith($artifactName + "-"))))
+ | sort_by(.created_at // "")
+ | reverse
+ | .[0] // empty'
+ )"
- const escapeCell = (value) => String(value ?? '-').replaceAll('|', '\\|').replaceAll('\n', '
')
- const escapeXml = (value) => String(value)
- .replaceAll('&', '&')
- .replaceAll('<', '<')
- .replaceAll('>', '>')
- .replaceAll('"', '"')
+ if [ -n "$artifact_json" ]; then
+ current_artifact_name="$(printf '%s' "$artifact_json" | jq -r '.name')"
+ current_artifact_id="$(printf '%s' "$artifact_json" | jq -r '.id')"
+ current_output_dir="$BASELINE_OUTPUT_DIR/run-$candidate_run"
+ mkdir -p "$current_output_dir"
+ if "$GH_BIN" run download "$candidate_run" \
+ --repo "$repo" \
+ --name "$current_artifact_name" \
+ --dir "$current_output_dir"; then
+ if [ -z "$run_id" ]; then
+ run_id="$candidate_run"
+ artifact_name="$current_artifact_name"
+ artifact_id="$current_artifact_id"
+ fi
+ jq -cn \
+ --arg runId "$candidate_run" \
+ --arg artifactName "$current_artifact_name" \
+ --arg artifactId "$current_artifact_id" \
+ --arg path "run-$candidate_run" \
+ '{runId:$runId, artifactName:$artifactName, artifactId:$artifactId, path:$path}' \
+ >>"$downloaded_runs_file"
+ else
+ echo "::notice::failed to download baseline artifact $current_artifact_name from run $candidate_run"
+ fi
+ fi
+ done
- const humanProbe = (row) => {
- if (row.observation?.label) return row.observation.label
- const probe = row.observation?.dimensions?.probe
- const name = row.observation?.name || 'unknown'
- const labels = {
- shell_eval_traced: 'Shell eval with OTEL trace',
- shell_eval_warm: 'Warm shell eval',
- tasks_list: 'devenv tasks list',
- processes_help: 'devenv processes --help',
- task_pnpm_install: 'pnpm:install',
- task_genie_run: 'genie:run',
- task_check_quick: 'check:quick',
- task_check_quick_warm: 'Warm cached check:quick',
- task_check_quick_forced: 'Forced check:quick',
- }
- if (probe && labels[probe]) return labels[probe]
- if (name.startsWith('devenv.') && name.endsWith('.duration')) {
- return name.slice('devenv.'.length, -'.duration'.length).replaceAll('_', ' ')
- }
- return name
- }
+ write_baseline_observation_counts
- const semanticPath = (row) => {
- const parts = [
- ...(Array.isArray(row.target?.path) ? row.target.path : []),
- row.target?.group,
- ...(Array.isArray(row.observation?.path) ? row.observation.path : []),
- row.observation?.group,
- ].filter((value) => typeof value === 'string' && value.length > 0)
- const seen = new Set()
- const unique = parts.filter((part) => {
- if (seen.has(part)) return false
- seen.add(part)
- return true
- })
- return unique.length > 0 ? unique.join(' / ') : '-'
- }
+ if [ -z "$run_id" ] || [ -z "$artifact_name" ]; then
+ echo "::notice::no successful baseline run found for $repo workflow=$workflow branch=$branch"
+ exit 0
+ fi
- const chartProbe = (row) => {
- if (row.observation?.label) return row.observation.label
- const probe = row.observation?.dimensions?.probe
- const labels = {
- shell_eval_traced: 'Shell eval with OTEL trace',
- shell_eval_warm: 'Warm shell eval',
- tasks_list: 'devenv tasks list',
- processes_help: 'processes --help',
- task_pnpm_install: 'pnpm:install',
- task_genie_run: 'genie:run',
- task_check_quick: 'check:quick',
- task_check_quick_warm: 'Warm cached check:quick',
- task_check_quick_forced: 'Forced check:quick',
- }
- if (probe && labels[probe]) return labels[probe]
- return humanProbe(row)
- }
+ jq -n \
+ --slurpfile runs "$downloaded_runs_file" \
+ --slurpfile seedRuns "$seed_runs_file" \
+ --slurpfile observationCounts "$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" \
+ --argjson schemaVersion 1 \
+ --arg repository "$repo" \
+ --arg workflow "$workflow" \
+ --arg branch "$branch" \
+ --arg runId "$run_id" \
+ --arg artifactName "$artifact_name" \
+ --arg artifactId "$artifact_id" \
+ '{
+ schemaVersion: $schemaVersion,
+ source: "github-actions-artifact",
+ repository: $repository,
+ workflow: $workflow,
+ branch: $branch,
+ runId: $runId,
+ artifactName: $artifactName,
+ artifactId: $artifactId,
+ seedRuns: ($seedRuns[0] // []),
+ runs: $runs,
+ observationCounts: ($observationCounts[0] // null)
+ }' >"$BASELINE_OUTPUT_DIR/baseline-provenance.json"
- const dimensions = (row) => {
- const entries = Object.entries(row.observation?.dimensions || {})
- if (entries.length === 0) return '-'
- return entries
- .sort(([left], [right]) => left.localeCompare(right))
- .map(([key, value]) => key + '=' + String(value))
- .join('
')
- }
+ echo "Downloaded $(wc -l <"$downloaded_runs_file" | tr -d ' ') baseline artifact(s), latest $artifact_name from run $run_id into $BASELINE_OUTPUT_DIR"
- const rank = (row) => {
- if (row.status === 'fail') return 0
- if (row.status === 'warn') return 1
- if (row.status === 'missing_baseline') return 3
- return 2
- }
-
- const allRows = Object.values(comparison.comparisons || {}).sort((left, right) => {
- const byRank = rank(left) - rank(right)
- if (byRank !== 0) return byRank
- const leftImpact = typeof left.semanticImpactScore === 'number' ? Math.abs(left.semanticImpactScore) : 0
- const rightImpact = typeof right.semanticImpactScore === 'number' ? Math.abs(right.semanticImpactScore) : 0
- if (rightImpact !== leftImpact) return rightImpact - leftImpact
- const leftDelta = typeof left.delta === 'number' ? Math.abs(left.delta) : 0
- const rightDelta = typeof right.delta === 'number' ? Math.abs(right.delta) : 0
- if (rightDelta !== leftDelta) return rightDelta - leftDelta
- return humanProbe(left).localeCompare(humanProbe(right))
- })
- const protocolLabel = (() => {
- const protocols = new Set(
- allRows
- .map((row) => row.observation?.dimensions?.measurementProtocol)
- .filter((value) => typeof value === 'string' && value.length > 0),
- )
- return protocols.size > 0 ? Array.from(protocols).join(', ') : 'legacy'
- })()
- const visibleLimit = Number.isFinite(maxRows) && maxRows > 0 ? maxRows : 10
- const comparableRows = allRows.filter((row) => typeof row.baseline === 'number')
- const hasComparableBaseline = comparableRows.length > 0
- const isDiagnosticRow = (row) =>
- row.status === 'missing_baseline' ||
- row.confidence === 'diagnostic' ||
- row.gateReason === 'disabled' ||
- row.semanticImpactKind === 'diagnostic' ||
- (!row.gateable && typeof row.baseline !== 'number')
- const isZeroImpactRow = (row) =>
- typeof row.semanticImpactScore === 'number' &&
- !Number.isNaN(row.semanticImpactScore) &&
- Math.abs(row.semanticImpactScore) < 0.005
- const actionableComparableRows = comparableRows.filter((row) => !isDiagnosticRow(row))
- const visibleRows = (hasComparableBaseline
- ? actionableComparableRows
- : allRows.filter((row) => !isDiagnosticRow(row)).sort((left, right) => (right.current || 0) - (left.current || 0))
- ).slice(0, visibleLimit)
- const nonZeroImpactRows = actionableComparableRows.filter((row) => !isZeroImpactRow(row))
- const zeroImpactRows = actionableComparableRows.filter(isZeroImpactRow)
- const visibleNonZeroImpactRows = nonZeroImpactRows.slice(0, visibleLimit)
- const diagnosticRows = allRows.filter(isDiagnosticRow)
+ - name: 'Measure Nix closure: genie'
+ shell: bash
+ env:
+ ARTIFACT_DIR: tmp/nix-closure-ci/current/genie_package
+ RUNNER_CLASS: '${{ runner.os }}-${{ runner.arch }}'
+ run: |
+ set -euo pipefail
- const baselineToCurrent = (row) => {
- const unit = row.observation?.unit
- return formatValue(row.baseline, unit) + ' -> ' + formatValue(row.current, unit)
- }
+ mkdir -p "$ARTIFACT_DIR"
+ installable='.#genie'
+ target_id='genie_package'
+ target_name='genie'
+ target_label='Genie package'
+ target_group='packages'
+ target_description='the packaged Genie CLI closure'
+ artifact_file="$ARTIFACT_DIR/measurements.json"
+ target_system='x86_64-linux'
- const rawChange = (row) => {
- const unit = row.observation?.unit
- return formatDelta(row.delta, unit) + ' / ' + formatRatio(row.ratio)
- }
+ out_path="$(nix build --no-update-lock-file --no-link --print-out-paths "$installable")"
+ path_info="$ARTIFACT_DIR/nix-closure-path-info.json"
+ paths_file="$ARTIFACT_DIR/nix-closure-paths.json"
- const confidenceSummary = (row) => {
- const unit = row.observation?.unit
- if (row.comparisonMode === 'paired' && typeof row.evidenceDeltaLower === 'number' && typeof row.evidenceDeltaUpper === 'number') {
- const quantile = typeof row.pairedEvidenceQuantile === 'number'
- ? Math.round(row.pairedEvidenceQuantile * 100)
- : 25
- return 'paired n=' + (row.pairedSamples ?? 0)
- + ', ' + quantile + '-' + (100 - quantile) + '% delta '
- + formatValue(row.evidenceDeltaLower, unit)
- + '..' + formatValue(row.evidenceDeltaUpper, unit)
- }
- return (row.confidence || 'unknown') + ', baseline n=' + (row.baselineSources ?? 0) + ', current n=' + (row.currentSamples ?? 1)
- }
+ nix path-info --recursive --json "$out_path" >"$path_info"
+ jq 'to_entries | map({ path: .key, narSize: (.value.narSize // 0) })' "$path_info" >"$paths_file"
- const scanDecision = (row) => {
- if (row.status === 'fail') return 'regression blocks'
- if (row.status === 'warn') return 'regression review'
- if (row.status === 'missing_baseline') return 'needs baseline'
- if (row.direction === 'improved') return 'faster'
- if (row.direction === 'regressed') return 'no material impact'
- return 'unchanged'
- }
+ jq -n \
+ --slurpfile paths "$paths_file" \
+ --argjson schemaVersion 1 \
+ --arg generatedAt "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
+ --arg repository "${GITHUB_REPOSITORY:-unknown}" \
+ --arg branchKind "${GITHUB_EVENT_NAME:-unknown}" \
+ --arg ref "${CI_MEASUREMENT_SUBJECT_REF:-${GITHUB_REF:-unknown}}" \
+ --arg headSha "${CI_MEASUREMENT_SUBJECT_SHA:-${GITHUB_SHA:-unknown}}" \
+ --arg baseSha "${GITHUB_BASE_SHA:-}" \
+ --arg runnerName "${RUNNER_NAME:-unknown}" \
+ --arg runnerOs "${RUNNER_OS:-unknown}" \
+ --arg runnerArch "${RUNNER_ARCH:-unknown}" \
+ --arg runnerClass "${RUNNER_CLASS:-unknown}" \
+ --arg githubRunId "${GITHUB_RUN_ID:-unknown}" \
+ --arg githubRunAttempt "${GITHUB_RUN_ATTEMPT:-unknown}" \
+ --arg githubJob "${GITHUB_JOB:-unknown}" \
+ --arg taskId "${CROSSTASK_TASK_ID:-}" \
+ --arg taskAttemptId "${CROSSTASK_ATTEMPT_ID:-}" \
+ --arg traceId "${TRACE_ID:-}" \
+ --arg targetName "$target_name" \
+ --arg targetId "$target_id" \
+ --arg targetLabel "$target_label" \
+ --arg targetGroup "$target_group" \
+ --arg targetDescription "$target_description" \
+ --arg targetSystem "$target_system" \
+ --arg outPath "$out_path" \
+ --argjson buckets '[{"name":"node","label":"Node / pnpm","pathRegex":"node_modules|npm-deps|pnpm"},{"name":"nix-sources","label":"Nix sources","pathRegex":"-source$"},{"name":"rust","label":"Rust","pathRegex":"cargo|rust|rustc"}]' \
+ --argjson targetPath '["nix","closures","packages","genie"]' \
+ --argjson gatePolicy '{}' \
+ '
+ ($paths[0] // []) as $closurePaths
+ | ($closurePaths | map(.narSize) | add // 0) as $totalNarSize
+ | ($closurePaths | length) as $pathCount
+ | ($buckets | map(
+ . as $bucket
+ | {
+ name: "nix.closure.bucket.nar_size",
+ id: "nix.closure.bucket.nar_size",
+ label: (($bucket.label // $bucket.name) + " closure size"),
+ group: "nix closure buckets",
+ path: ($targetPath + ["buckets", $bucket.name]),
+ description: ("NAR size contributed by closure paths matching " + $bucket.pathRegex),
+ measurementKind: "deterministic",
+ unit: "bytes",
+ value: (
+ $closurePaths
+ | map(select(.path | test($bucket.pathRegex)) | .narSize)
+ | add // 0
+ ),
+ policy: $gatePolicy,
+ dimensions: { bucket: $bucket.name }
+ }
+ )) as $bucketObservations
+ | {
+ schemaVersion: $schemaVersion,
+ generatedAt: $generatedAt,
+ producer: { name: "effect-utils-ci-measurement", version: 1 },
+ subject: {
+ repo: $repository,
+ branchKind: (if $branchKind == "" then "unknown" else $branchKind end),
+ ref: $ref,
+ headSha: $headSha,
+ baseSha: $baseSha
+ },
+ execution: {
+ provider: (if ($githubRunId != "" and $githubRunId != "unknown") then "github-actions" else "local" end),
+ workflow: "CI",
+ job: $githubJob,
+ runId: $githubRunId,
+ runAttempt: $githubRunAttempt,
+ taskId: $taskId,
+ attemptId: $taskAttemptId,
+ traceId: $traceId,
+ runner: { name: $runnerName, os: $runnerOs, arch: $runnerArch, class: $runnerClass }
+ },
+ target: { kind: "nix-closure", id: $targetId, name: $targetName, label: $targetLabel, group: $targetGroup, path: $targetPath, system: $targetSystem },
+ observations: ([
+ {
+ id: "nix.closure.nar_size",
+ label: "Total closure size",
+ group: "nix closure",
+ path: ($targetPath + ["total", "nar-size"]),
+ description: ("Total NAR size for all paths in " + $targetDescription),
+ name: "nix.closure.nar_size",
+ measurementKind: "deterministic",
+ unit: "bytes",
+ value: $totalNarSize,
+ policy: $gatePolicy,
+ dimensions: { bucket: "total" }
+ },
+ {
+ id: "nix.closure.path_count",
+ label: "Total closure path count",
+ group: "nix closure",
+ path: ($targetPath + ["total", "path-count"]),
+ description: ("Number of store paths in " + $targetDescription),
+ name: "nix.closure.path_count",
+ measurementKind: "deterministic",
+ unit: "count",
+ value: $pathCount,
+ policy: $gatePolicy,
+ dimensions: { bucket: "total" }
+ }
+ ] + $bucketObservations),
+ artifacts: [
+ { name: "nix-closure-path-info", path: "nix-closure-path-info.json", contentType: "application/json" },
+ { name: "nix-closure-paths", path: "nix-closure-paths.json", contentType: "application/json" }
+ ],
+ details: {
+ outPath: $outPath,
+ topPaths: ($closurePaths | sort_by(.narSize) | reverse | .[:30])
+ }
+ }
+ ' >"$artifact_file"
- const scanTable = (rows) => {
- if (rows.length === 0) return 'No non-zero actionable measurement impact detected.'
- return [
- '| What changed? | Probe | Baseline -> current | Raw change | Impact | Confidence |',
- '| --- | --- | --- | ---: | ---: | --- |',
- ...rows.map((row) => {
- return '| ' + [
- scanDecision(row),
- humanProbe(row),
- baselineToCurrent(row),
- rawChange(row),
- formatRowImpact(row),
- confidenceSummary(row),
- ].map(escapeCell).join(' | ') + ' |'
- }),
- ].join('\n')
- }
+ cat "$artifact_file"
- const zeroImpactTable = (rows) => {
- if (rows.length === 0) return 'No zero-impact measurements.'
- return [
- '| Probe | Baseline -> current | Raw change | Impact | Gate | Evidence | Why hidden |',
- '| --- | --- | ---: | ---: | --- | --- | --- |',
- ...rows.map((row) => {
- const meaning = interpretation(row)
- return '| ' + [
- humanProbe(row),
- baselineToCurrent(row),
- rawChange(row),
- formatRowImpact(row),
- row.gateable ? 'yes' : (row.gateReason || 'no'),
- confidenceSummary(row),
- meaning.label,
- ].map(escapeCell).join(' | ') + ' |'
- }),
- ].join('\n')
- }
+ - name: 'Measure Nix closure: megarepo'
+ shell: bash
+ env:
+ ARTIFACT_DIR: tmp/nix-closure-ci/current/megarepo_package
+ RUNNER_CLASS: '${{ runner.os }}-${{ runner.arch }}'
+ run: |
+ set -euo pipefail
- const diagnosticTable = (rows) => {
- if (rows.length === 0) return 'No diagnostic or ungated measurements.'
- return [
- '| Probe | Current | Baseline | Impact | Gate | Reason | Evidence |',
- '| --- | ---: | ---: | ---: | --- | --- | --- |',
- ...rows.map((row) => {
- return '| ' + [
- humanProbe(row),
- formatValue(row.current, row.observation?.unit),
- formatValue(row.baseline, row.observation?.unit),
- formatRowImpact(row),
- row.gateable ? 'yes' : (row.gateReason || row.status || 'no'),
- interpretation(row).label,
- confidenceSummary(row),
- ].map(escapeCell).join(' | ') + ' |'
- }),
- ].join('\n')
- }
-
- const comparisonTable = (rows) => {
- if (rows.length === 0) return 'No measurement regressions detected.'
- return [
- '| Group | Measurement | Baseline | Current | Raw change | Impact | Meaning | Gate | Evidence |',
- '| --- | --- | ---: | ---: | ---: | ---: | --- | --- | --- |',
- ...rows.map((row) => {
- const unit = row.observation?.unit
- const baselineRange = typeof row.baselineRobustLower === 'number' && typeof row.baselineRobustUpper === 'number' && row.baselineRobustLower !== row.baselineRobustUpper
- ? '
noise band ' + formatValue(row.baselineRobustLower, unit) + ' - ' + formatValue(row.baselineRobustUpper, unit) + ''
- : typeof row.baselineMin === 'number' && typeof row.baselineMax === 'number' && row.baselineMin !== row.baselineMax
- ? '
range ' + formatValue(row.baselineMin, unit) + ' - ' + formatValue(row.baselineMax, unit) + ''
- : ''
- const meaning = interpretation(row)
- return '| ' + [
- semanticPath(row),
- humanProbe(row),
- formatValue(row.baseline, unit) + baselineRange,
- formatValue(row.current, unit),
- formatDelta(row.delta, unit) + ' / ' + formatRatio(row.ratio),
- formatRowImpact(row),
- meaning.label + '
' + meaning.detail + '',
- formatGate(row),
- formatEvidence(row),
- ].map(escapeCell).join(' | ') + ' |'
- }),
- ].join('\n')
- }
+ mkdir -p "$ARTIFACT_DIR"
+ installable='.#megarepo'
+ target_id='megarepo_package'
+ target_name='megarepo'
+ target_label='Megarepo package'
+ target_group='packages'
+ target_description='the packaged megarepo CLI closure'
+ artifact_file="$ARTIFACT_DIR/measurements.json"
+ target_system='x86_64-linux'
- const currentOnlyTable = (rows) => {
- if (rows.length === 0) return 'No current measurements found.'
- return [
- '| Group | Measurement | Current |',
- '| --- | --- | ---: |',
- ...rows.map((row) => {
- return '| ' + [semanticPath(row), humanProbe(row), formatValue(row.current, row.observation?.unit)].map(escapeCell).join(' | ') + ' |'
- }),
- ].join('\n')
- }
+ out_path="$(nix build --no-update-lock-file --no-link --print-out-paths "$installable")"
+ path_info="$ARTIFACT_DIR/nix-closure-path-info.json"
+ paths_file="$ARTIFACT_DIR/nix-closure-paths.json"
- const allMeasurementsTable = (rows) => {
- if (rows.length === 0) return 'No measurement regressions detected.'
- return [
- '| Status | Gate | Target | Observation | Dimensions | Baseline | Current | Delta | Ratio | Impact |',
- '| --- | --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: |',
- ...rows.map((row) => {
- const unit = row.observation?.unit
- return '| ' + [
- row.status,
- row.gateable ? 'yes' : (row.gateReason || 'no'),
- row.target?.label || row.target?.name || 'unknown',
- row.observation?.label || row.observation?.name || 'unknown',
- dimensions(row),
- formatValue(row.baseline, unit),
- formatValue(row.current, unit),
- formatDelta(row.delta, unit),
- formatRatio(row.ratio),
- formatRowImpact(row),
- ].map(escapeCell).join(' | ') + ' |'
- }),
- ].join('\n')
- }
+ nix path-info --recursive --json "$out_path" >"$path_info"
+ jq 'to_entries | map({ path: .key, narSize: (.value.narSize // 0) })' "$path_info" >"$paths_file"
- const sourceMeasurement = (row) => ({
- id: row.observation?.dimensions?.probe || row.observation?.name || humanProbe(row),
- label: humanProbe(row),
- group: semanticPath(row),
- status: row.status,
- direction: row.direction,
- gateable: row.gateable,
- gateReason: row.gateReason,
- confidence: row.confidence,
- comparisonMode: row.comparisonMode,
- unit: row.observation?.unit,
- baseline: row.baseline ?? null,
- current: row.current ?? null,
- delta: row.delta ?? null,
- ratio: row.ratio ?? null,
- semanticImpactScore: row.semanticImpactScore ?? null,
- semanticImpactKind: row.semanticImpactKind ?? null,
- baselineSources: row.baselineSources ?? null,
- currentSamples: row.currentSamples ?? null,
- pairedSamples: row.pairedSamples ?? null,
- evidenceDeltaLower: row.evidenceDeltaLower ?? null,
- evidenceDeltaUpper: row.evidenceDeltaUpper ?? null,
- pairedEvidenceQuantile: row.pairedEvidenceQuantile ?? null,
- dimensions: row.observation?.dimensions || {},
- })
+ jq -n \
+ --slurpfile paths "$paths_file" \
+ --argjson schemaVersion 1 \
+ --arg generatedAt "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
+ --arg repository "${GITHUB_REPOSITORY:-unknown}" \
+ --arg branchKind "${GITHUB_EVENT_NAME:-unknown}" \
+ --arg ref "${CI_MEASUREMENT_SUBJECT_REF:-${GITHUB_REF:-unknown}}" \
+ --arg headSha "${CI_MEASUREMENT_SUBJECT_SHA:-${GITHUB_SHA:-unknown}}" \
+ --arg baseSha "${GITHUB_BASE_SHA:-}" \
+ --arg runnerName "${RUNNER_NAME:-unknown}" \
+ --arg runnerOs "${RUNNER_OS:-unknown}" \
+ --arg runnerArch "${RUNNER_ARCH:-unknown}" \
+ --arg runnerClass "${RUNNER_CLASS:-unknown}" \
+ --arg githubRunId "${GITHUB_RUN_ID:-unknown}" \
+ --arg githubRunAttempt "${GITHUB_RUN_ATTEMPT:-unknown}" \
+ --arg githubJob "${GITHUB_JOB:-unknown}" \
+ --arg taskId "${CROSSTASK_TASK_ID:-}" \
+ --arg taskAttemptId "${CROSSTASK_ATTEMPT_ID:-}" \
+ --arg traceId "${TRACE_ID:-}" \
+ --arg targetName "$target_name" \
+ --arg targetId "$target_id" \
+ --arg targetLabel "$target_label" \
+ --arg targetGroup "$target_group" \
+ --arg targetDescription "$target_description" \
+ --arg targetSystem "$target_system" \
+ --arg outPath "$out_path" \
+ --argjson buckets '[{"name":"node","label":"Node / pnpm","pathRegex":"node_modules|npm-deps|pnpm"},{"name":"nix-sources","label":"Nix sources","pathRegex":"-source$"},{"name":"rust","label":"Rust","pathRegex":"cargo|rust|rustc"}]' \
+ --argjson targetPath '["nix","closures","packages","megarepo"]' \
+ --argjson gatePolicy '{}' \
+ '
+ ($paths[0] // []) as $closurePaths
+ | ($closurePaths | map(.narSize) | add // 0) as $totalNarSize
+ | ($closurePaths | length) as $pathCount
+ | ($buckets | map(
+ . as $bucket
+ | {
+ name: "nix.closure.bucket.nar_size",
+ id: "nix.closure.bucket.nar_size",
+ label: (($bucket.label // $bucket.name) + " closure size"),
+ group: "nix closure buckets",
+ path: ($targetPath + ["buckets", $bucket.name]),
+ description: ("NAR size contributed by closure paths matching " + $bucket.pathRegex),
+ measurementKind: "deterministic",
+ unit: "bytes",
+ value: (
+ $closurePaths
+ | map(select(.path | test($bucket.pathRegex)) | .narSize)
+ | add // 0
+ ),
+ policy: $gatePolicy,
+ dimensions: { bucket: $bucket.name }
+ }
+ )) as $bucketObservations
+ | {
+ schemaVersion: $schemaVersion,
+ generatedAt: $generatedAt,
+ producer: { name: "effect-utils-ci-measurement", version: 1 },
+ subject: {
+ repo: $repository,
+ branchKind: (if $branchKind == "" then "unknown" else $branchKind end),
+ ref: $ref,
+ headSha: $headSha,
+ baseSha: $baseSha
+ },
+ execution: {
+ provider: (if ($githubRunId != "" and $githubRunId != "unknown") then "github-actions" else "local" end),
+ workflow: "CI",
+ job: $githubJob,
+ runId: $githubRunId,
+ runAttempt: $githubRunAttempt,
+ taskId: $taskId,
+ attemptId: $taskAttemptId,
+ traceId: $traceId,
+ runner: { name: $runnerName, os: $runnerOs, arch: $runnerArch, class: $runnerClass }
+ },
+ target: { kind: "nix-closure", id: $targetId, name: $targetName, label: $targetLabel, group: $targetGroup, path: $targetPath, system: $targetSystem },
+ observations: ([
+ {
+ id: "nix.closure.nar_size",
+ label: "Total closure size",
+ group: "nix closure",
+ path: ($targetPath + ["total", "nar-size"]),
+ description: ("Total NAR size for all paths in " + $targetDescription),
+ name: "nix.closure.nar_size",
+ measurementKind: "deterministic",
+ unit: "bytes",
+ value: $totalNarSize,
+ policy: $gatePolicy,
+ dimensions: { bucket: "total" }
+ },
+ {
+ id: "nix.closure.path_count",
+ label: "Total closure path count",
+ group: "nix closure",
+ path: ($targetPath + ["total", "path-count"]),
+ description: ("Number of store paths in " + $targetDescription),
+ name: "nix.closure.path_count",
+ measurementKind: "deterministic",
+ unit: "count",
+ value: $pathCount,
+ policy: $gatePolicy,
+ dimensions: { bucket: "total" }
+ }
+ ] + $bucketObservations),
+ artifacts: [
+ { name: "nix-closure-path-info", path: "nix-closure-path-info.json", contentType: "application/json" },
+ { name: "nix-closure-paths", path: "nix-closure-paths.json", contentType: "application/json" }
+ ],
+ details: {
+ outPath: $outPath,
+ topPaths: ($closurePaths | sort_by(.narSize) | reverse | .[:30])
+ }
+ }
+ ' >"$artifact_file"
- const truncate = (value, maxLength) => {
- const text = String(value)
- if (text.length <= maxLength) return text
- if (maxLength <= 1) return text.slice(0, maxLength)
- return text.slice(0, Math.max(0, maxLength - 3)) + '...'
- }
+ cat "$artifact_file"
- const renderPerfChangeSvg = (rows, theme = 'adaptive') => {
- const chartRows = rows
- .filter((row) => typeof row.current === 'number' && typeof row.baseline === 'number')
- .filter((row) => row.gateable === true)
- .filter((row) => typeof row.semanticImpactScore === 'number')
- .sort((left, right) => (left.semanticImpactScore || 0) - (right.semanticImpactScore || 0))
- .slice(0, visibleLimit)
- if (chartRows.length === 0) return ''
+ - name: 'Measure Nix closure: oxlint-npm'
+ shell: bash
+ env:
+ ARTIFACT_DIR: tmp/nix-closure-ci/current/oxlint_npm_package
+ RUNNER_CLASS: '${{ runner.os }}-${{ runner.arch }}'
+ run: |
+ set -euo pipefail
- const impactScores = chartRows.map((row) => row.semanticImpactScore || 0)
- const minImpact = Math.min(-1, ...impactScores)
- const maxImpact = Math.max(1, ...impactScores)
- const lower = Math.floor(minImpact)
- const upper = Math.ceil(maxImpact)
- const span = upper - lower || 1
- const width = 1040
- const rowHeight = 46
- const height = 112 + chartRows.length * rowHeight + 34
- const labelX = 230
- const plotX = 252
- const plotWidth = 320
- const impactX = 596
- const nominalX = 672
- const meaningX = 804
- const topY = 92
- const barHeight = 18
- const zeroX = plotX + ((0 - lower) / span) * plotWidth
- const themeCss = theme === 'dark'
- ? [
- ' .chart-bg { fill: #0d1117; }',
- ' .chart-border { fill: none; stroke: #30363d; }',
- ' .chart-title { fill: #f0f6fc; }',
- ' .chart-muted { fill: #8b949e; }',
- ' .chart-axis { stroke: #8b949e; }',
- ' .chart-label { fill: #c9d1d9; }',
- ' .chart-value { fill: #8b949e; }',
- ' .chart-track { fill: #21262d; }',
- ]
- : [
- ' .chart-bg { fill: #ffffff; }',
- ' .chart-border { fill: none; stroke: #d0d7de; }',
- ' .chart-title { fill: #24292f; }',
- ' .chart-muted { fill: #57606a; }',
- ' .chart-axis { stroke: #8c959f; }',
- ' .chart-label { fill: #24292f; }',
- ' .chart-value { fill: #57606a; }',
- ' .chart-track { fill: #f6f8fa; }',
- ...(theme === 'adaptive'
- ? [
- ' @media (prefers-color-scheme: dark) {',
- ' .chart-bg { fill: #0d1117; }',
- ' .chart-border { stroke: #30363d; }',
- ' .chart-title { fill: #f0f6fc; }',
- ' .chart-muted { fill: #8b949e; }',
- ' .chart-axis { stroke: #8b949e; }',
- ' .chart-label { fill: #c9d1d9; }',
- ' .chart-value { fill: #8b949e; }',
- ' .chart-track { fill: #21262d; }',
- ' }',
- ]
- : []),
- ]
+ mkdir -p "$ARTIFACT_DIR"
+ installable='.#oxlint-npm'
+ target_id='oxlint_npm_package'
+ target_name='oxlint-npm'
+ target_label='oxlint npm package'
+ target_group='packages'
+ target_description='the packaged oxlint npm compatibility wrapper closure'
+ artifact_file="$ARTIFACT_DIR/measurements.json"
+ target_system='x86_64-linux'
- const svg = [
- '',
- '',
- '',
- '',
- '',
- 'Actionable measurement impact',
- '0 means no actionable PR impact; 1x reaches the warning budget.',
- 'improved',
- 'regressed',
- 'impact',
- 'baseline -> current',
- 'meaning',
- '',
- ]
+ out_path="$(nix build --no-update-lock-file --no-link --print-out-paths "$installable")"
+ path_info="$ARTIFACT_DIR/nix-closure-path-info.json"
+ paths_file="$ARTIFACT_DIR/nix-closure-paths.json"
- for (const [index, row] of chartRows.entries()) {
- const impact = row.semanticImpactScore || 0
- const y = topY + index * rowHeight
- const valueWidth = Math.max(2, Math.abs(impact) / span * plotWidth)
- const x = impact < 0 ? zeroX - valueWidth : zeroX
- const meaning = interpretation(row)
- const color = meaning.color
- const formattedImpact = formatSemanticImpact(impact)
- const label = chartProbe(row)
- const nominal = formatValue(row.baseline, row.observation?.unit).replaceAll(' ', '') + ' -> ' + formatValue(row.current, row.observation?.unit).replaceAll(' ', '')
- const barOpacity = meaning.tone === 'neutral' ? '0.65' : '1'
- const dash = meaning.tone === 'diagnostic' ? ' stroke-dasharray="3 3"' : ''
- svg.push(
- '' + escapeXml(label) + '' + escapeXml(truncate(label, 28)) + '',
- '',
- '',
- '' + escapeXml(formattedImpact) + '',
- '' + escapeXml(nominal) + '' + escapeXml(truncate(nominal, 21)) + '',
- '' + escapeXml(meaning.detail) + '' + escapeXml(truncate(meaning.label, 30)) + '',
- )
- }
+ nix path-info --recursive --json "$out_path" >"$path_info"
+ jq 'to_entries | map({ path: .key, narSize: (.value.narSize // 0) })' "$path_info" >"$paths_file"
- svg.push(
- '0',
- '',
- )
- return svg.join('\n')
- }
+ jq -n \
+ --slurpfile paths "$paths_file" \
+ --argjson schemaVersion 1 \
+ --arg generatedAt "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
+ --arg repository "${GITHUB_REPOSITORY:-unknown}" \
+ --arg branchKind "${GITHUB_EVENT_NAME:-unknown}" \
+ --arg ref "${CI_MEASUREMENT_SUBJECT_REF:-${GITHUB_REF:-unknown}}" \
+ --arg headSha "${CI_MEASUREMENT_SUBJECT_SHA:-${GITHUB_SHA:-unknown}}" \
+ --arg baseSha "${GITHUB_BASE_SHA:-}" \
+ --arg runnerName "${RUNNER_NAME:-unknown}" \
+ --arg runnerOs "${RUNNER_OS:-unknown}" \
+ --arg runnerArch "${RUNNER_ARCH:-unknown}" \
+ --arg runnerClass "${RUNNER_CLASS:-unknown}" \
+ --arg githubRunId "${GITHUB_RUN_ID:-unknown}" \
+ --arg githubRunAttempt "${GITHUB_RUN_ATTEMPT:-unknown}" \
+ --arg githubJob "${GITHUB_JOB:-unknown}" \
+ --arg taskId "${CROSSTASK_TASK_ID:-}" \
+ --arg taskAttemptId "${CROSSTASK_ATTEMPT_ID:-}" \
+ --arg traceId "${TRACE_ID:-}" \
+ --arg targetName "$target_name" \
+ --arg targetId "$target_id" \
+ --arg targetLabel "$target_label" \
+ --arg targetGroup "$target_group" \
+ --arg targetDescription "$target_description" \
+ --arg targetSystem "$target_system" \
+ --arg outPath "$out_path" \
+ --argjson buckets '[{"name":"node","label":"Node / pnpm","pathRegex":"node_modules|npm-deps|pnpm"},{"name":"nix-sources","label":"Nix sources","pathRegex":"-source$"},{"name":"rust","label":"Rust","pathRegex":"cargo|rust|rustc"}]' \
+ --argjson targetPath '["nix","closures","packages","oxlint-npm"]' \
+ --argjson gatePolicy '{}' \
+ '
+ ($paths[0] // []) as $closurePaths
+ | ($closurePaths | map(.narSize) | add // 0) as $totalNarSize
+ | ($closurePaths | length) as $pathCount
+ | ($buckets | map(
+ . as $bucket
+ | {
+ name: "nix.closure.bucket.nar_size",
+ id: "nix.closure.bucket.nar_size",
+ label: (($bucket.label // $bucket.name) + " closure size"),
+ group: "nix closure buckets",
+ path: ($targetPath + ["buckets", $bucket.name]),
+ description: ("NAR size contributed by closure paths matching " + $bucket.pathRegex),
+ measurementKind: "deterministic",
+ unit: "bytes",
+ value: (
+ $closurePaths
+ | map(select(.path | test($bucket.pathRegex)) | .narSize)
+ | add // 0
+ ),
+ policy: $gatePolicy,
+ dimensions: { bucket: $bucket.name }
+ }
+ )) as $bucketObservations
+ | {
+ schemaVersion: $schemaVersion,
+ generatedAt: $generatedAt,
+ producer: { name: "effect-utils-ci-measurement", version: 1 },
+ subject: {
+ repo: $repository,
+ branchKind: (if $branchKind == "" then "unknown" else $branchKind end),
+ ref: $ref,
+ headSha: $headSha,
+ baseSha: $baseSha
+ },
+ execution: {
+ provider: (if ($githubRunId != "" and $githubRunId != "unknown") then "github-actions" else "local" end),
+ workflow: "CI",
+ job: $githubJob,
+ runId: $githubRunId,
+ runAttempt: $githubRunAttempt,
+ taskId: $taskId,
+ attemptId: $taskAttemptId,
+ traceId: $traceId,
+ runner: { name: $runnerName, os: $runnerOs, arch: $runnerArch, class: $runnerClass }
+ },
+ target: { kind: "nix-closure", id: $targetId, name: $targetName, label: $targetLabel, group: $targetGroup, path: $targetPath, system: $targetSystem },
+ observations: ([
+ {
+ id: "nix.closure.nar_size",
+ label: "Total closure size",
+ group: "nix closure",
+ path: ($targetPath + ["total", "nar-size"]),
+ description: ("Total NAR size for all paths in " + $targetDescription),
+ name: "nix.closure.nar_size",
+ measurementKind: "deterministic",
+ unit: "bytes",
+ value: $totalNarSize,
+ policy: $gatePolicy,
+ dimensions: { bucket: "total" }
+ },
+ {
+ id: "nix.closure.path_count",
+ label: "Total closure path count",
+ group: "nix closure",
+ path: ($targetPath + ["total", "path-count"]),
+ description: ("Number of store paths in " + $targetDescription),
+ name: "nix.closure.path_count",
+ measurementKind: "deterministic",
+ unit: "count",
+ value: $pathCount,
+ policy: $gatePolicy,
+ dimensions: { bucket: "total" }
+ }
+ ] + $bucketObservations),
+ artifacts: [
+ { name: "nix-closure-path-info", path: "nix-closure-path-info.json", contentType: "application/json" },
+ { name: "nix-closure-paths", path: "nix-closure-paths.json", contentType: "application/json" }
+ ],
+ details: {
+ outPath: $outPath,
+ topPaths: ($closurePaths | sort_by(.narSize) | reverse | .[:30])
+ }
+ }
+ ' >"$artifact_file"
- const statusWord = comparison.status || 'unknown'
- const readiness = comparison.readiness || {}
- const readinessLabel = readiness.enforceable
- ? 'enforceable'
- : 'partial (' + (readiness.gateableCount ?? 0) + '/' + (readiness.enabledCount ?? 0) + ' enabled observations gateable)'
- const runUrl = runId ? serverUrl + '/' + repo + '/actions/runs/' + runId : undefined
- const shortSha = (headSha || sha || 'unknown').slice(0, 7)
- const existingState = extractState(existing?.body)
- const currentRun = {
- commitSha: headSha || sha || 'unknown',
- shortSha,
- generatedAt: new Date().toISOString(),
- status: statusWord,
- mode: comparison.mode || 'unknown',
- runUrl,
- runAttempt,
- workflow,
- job,
- visibleRows: visibleRows.map((row) => ({
- status: row.status,
- target: row.target?.label || row.target?.name || 'unknown',
- observation: row.observation?.label || row.observation?.name || 'unknown',
- meaning: interpretation(row).label,
- dimensions: dimensions(row).replaceAll('
', ', '),
- baseline: formatValue(row.baseline, row.observation?.unit),
- current: formatValue(row.current, row.observation?.unit),
- delta: formatDelta(row.delta, row.observation?.unit),
- ratio: formatRatio(row.ratio),
- impact: formatSemanticImpact(row.semanticImpactScore),
- })),
- }
- const hasComparableHistory = (run) => Array.isArray(run.visibleRows) && run.visibleRows.some((row) =>
- row.status !== 'missing_baseline' &&
- row.baseline !== 'n/a' &&
- row.ratio !== 'n/a'
- )
- const previousRuns = (existingState?.runs || []).filter((run) => run.commitSha !== currentRun.commitSha && hasComparableHistory(run))
- const historyLimit = Number.isFinite(maxHistory) && maxHistory > 0 ? maxHistory : 20
- const state = { _tag: stateTag, schemaVersion, title, runs: [currentRun, ...previousRuns].slice(0, historyLimit) }
- const gateModeLabel = (mode) => {
- if (mode === 'fail') return 'enforced'
- if (mode === 'warn') return 'advisory'
- if (mode === 'off') return 'off'
- return mode || 'unknown'
- }
- const historyRows = state.runs.slice(1).map((run) => {
- const link = run.runUrl ? '[' + run.shortSha + '](' + run.runUrl + ')' : run.shortSha
- const top = Array.isArray(run.visibleRows) && run.visibleRows.length > 0
- ? run.visibleRows.slice(0, 3).map((row) => (row.meaning || row.status) + ' ' + row.target + ' ' + row.observation + ' ' + row.delta + ' / ' + row.ratio).join('
')
- : 'No regressions'
- return '| ' + [link, run.status, gateModeLabel(run.mode), top].map(escapeCell).join(' | ') + ' |'
- })
+ cat "$artifact_file"
- const runLink = runUrl ? '[workflow run](' + runUrl + ')' : 'workflow run unavailable'
- const baselineProvenance = comparison.baselineProvenance
- const baselineLabel = baselineProvenance?.runId
- ? '[main run ' + baselineProvenance.runId + '](' + serverUrl + '/' + repo + '/actions/runs/' + baselineProvenance.runId + ')' +
- (Array.isArray(baselineProvenance.runs) && baselineProvenance.runs.length > 1 ? ' + ' + (baselineProvenance.runs.length - 1) + ' older baseline runs' : '')
- : 'not available'
- const sourceOfTruth = {
- schemaVersion,
- title,
- status: statusWord,
- gate: gateModeLabel(comparison.mode),
- readiness: readinessLabel,
- commit: {
- shortSha,
- sha: headSha || sha || 'unknown',
- },
- run: {
- id: runId || null,
- attempt: runAttempt || null,
- url: runUrl || null,
- },
- baseline: baselineProvenance || null,
- protocol: protocolLabel,
- chart: {
- meaning: 'semantic-impact',
- zeroImpactMeaning: 'no actionable PR impact after budgets, noise floor, and robust evidence checks',
- svg: chartSourceUrl || null,
- lightPng: chartUrl || null,
- darkPng: chartDarkUrl || null,
- },
- measurements: allRows.map(sourceMeasurement),
- }
- const chartSvg = hasComparableBaseline && visibleRows.length > 0 ? renderPerfChangeSvg(visibleRows) : ''
- const chartDarkSvg = hasComparableBaseline && visibleRows.length > 0 ? renderPerfChangeSvg(visibleRows, 'dark') : ''
- if (chartPath && chartSvg) writeFileSync(chartPath, chartSvg)
- if (chartDarkPath && chartDarkSvg) writeFileSync(chartDarkPath, chartDarkSvg)
- const chartImageMarkdown = chartUrl && chartSvg
- ? (chartDarkUrl
- ? '\n' +
- ' \n' +
- ' \n' +
- '
\n' +
- ''
- : '')
- : ''
- const chartMarkdown = chartImageMarkdown
- ? chartImageMarkdown +
- (chartSourceUrl ? '\n\n[SVG source](' + chartSourceUrl + ')' : '')
- : ''
+ - name: Compare CI measurements with baseline
+ shell: bash
+ env:
+ CI_MEASUREMENT_CURRENT_DIR: tmp/nix-closure-ci/current
+ CI_MEASUREMENT_BASELINE_DIR: tmp/nix-closure-ci/baseline
+ CI_MEASUREMENT_COMPARISON_FILE: tmp/nix-closure-ci/measurement-comparison.json
+ CI_MEASUREMENT_REGRESSION_MODE: warn
+ CI_MEASUREMENT_PR_COMMENT_ENABLED: 'false'
+ CI_MEASUREMENT_PR_COMMENT_TITLE: Nix Closure Measurements
+ CI_MEASUREMENT_PR_COMMENT_MAX_ROWS: '8'
+ CI_MEASUREMENT_PR_COMMENT_MAX_HISTORY: '20'
+ CI_MEASUREMENT_PR_COMMENT_ASSET_BRANCH: ci-measurement-assets
+ run: |
+ set -euo pipefail
- const regressionCount = allRows.filter((row) => row.status === 'fail' || row.status === 'warn').length
- const improvementCount = comparableRows.filter((row) => row.direction === 'improved' && !isZeroImpactRow(row)).length
- const neutralCount = zeroImpactRows.length + diagnosticRows.length
- const humanSummary = hasComparableBaseline
- ? regressionCount > 0
- ? String(regressionCount) + ' regression' + (regressionCount === 1 ? '' : 's') + ' need review.'
- : improvementCount > 0
- ? 'No regressions. ' + String(improvementCount) + ' probe' + (improvementCount === 1 ? '' : 's') + ' got faster; ' + String(neutralCount) + ' neutral or ungated row' + (neutralCount === 1 ? '' : 's') + ' are collapsed below.'
- : 'No regressions. Comparable movement is below the semantic impact threshold; neutral rows are collapsed below.'
- : 'No compatible baseline was available, so this run shows current measurements only.'
+ export PATH="/run/current-system/sw/bin:/usr/bin:/bin:$PATH"
- const summaryLines = [
- '## ' + title,
- '',
- '**' + statusWord + '** - ' + gateModeLabel(comparison.mode) + ' gate - readiness ' + readinessLabel + ' - commit ' + shortSha + ' - protocol ' + protocolLabel + '',
- '',
- '> ' + humanSummary,
- '',
- chartMarkdown,
- '',
- hasComparableBaseline
- ? scanTable(visibleNonZeroImpactRows)
- : currentOnlyTable(visibleRows),
- ]
+ current_dir="${CI_MEASUREMENT_CURRENT_DIR:?CI_MEASUREMENT_CURRENT_DIR not set}"
+ baseline_dir="${CI_MEASUREMENT_BASELINE_DIR:?CI_MEASUREMENT_BASELINE_DIR not set}"
+ comparison_file="${CI_MEASUREMENT_COMPARISON_FILE:?CI_MEASUREMENT_COMPARISON_FILE not set}"
+ mode="${CI_MEASUREMENT_REGRESSION_MODE:-warn}"
+ mkdir -p "$(dirname "$comparison_file")"
- if (hasComparableBaseline && zeroImpactRows.length > 0) {
- summaryLines.push(
- '',
- '',
- 'Unchanged / 0-impact measurements (' + zeroImpactRows.length + ')
',
- '',
- 'These rows had compatible baseline data, but their semantic impact rounded to 0.00x because the movement was below the configured budget, below the noise floor, or inside the robust noise band.',
- '',
- zeroImpactTable(zeroImpactRows),
- '',
- ' ',
- )
- }
+ if [ "$mode" = "off" ]; then
+ jq -n --argjson schemaVersion 1 --arg status skipped --arg mode "$mode" \
+ '{schemaVersion:$schemaVersion,status:$status,mode:$mode,comparisons:{}}' \
+ >"$comparison_file"
+ exit 0
+ fi
- if (diagnosticRows.length > 0) {
- summaryLines.push(
- '',
- '',
- 'Diagnostic / ungated measurements (' + diagnosticRows.length + ')
',
- '',
- diagnosticTable(diagnosticRows),
- '',
- ' ',
- )
- }
+ current_index="$(mktemp)"
+ baseline_index="$(mktemp)"
+ find "$current_dir" -name baseline -type d -prune -o -name measurements.json -type f -print | sort >"$current_index" || true
+ {
+ find "$baseline_dir" -name baseline -type d ! -path "$baseline_dir" -prune -o -name measurements.json -type f -print
+ } | sort -u >"$baseline_index" || true
- summaryLines.push(
- '',
- '',
- 'All measurements
',
- '',
- allMeasurementsTable(allRows),
- '',
- ' ',
- )
+ if [ ! -s "$current_index" ]; then
+ echo "::error::no current measurements.json files found under $current_dir"
+ exit 1
+ fi
- if (historyRows.length > 0) {
- summaryLines.push(
- '',
- '',
- 'Previous runs
',
- '',
- '| Commit | Status | Gate | Top changes |',
- '| --- | --- | --- | --- |',
- ...historyRows,
- '',
- ' ',
- )
- }
+ current_json="$comparison_file.current.json"
+ baseline_json="$comparison_file.baseline.json"
+ xargs -r jq -s '.' <"$current_index" >"$current_json"
+ if [ -s "$baseline_index" ]; then
+ xargs -r jq -s '.' <"$baseline_index" >"$baseline_json"
+ else
+ printf '[]\n' >"$baseline_json"
+ fi
- summaryLines.push(
- '',
- '',
- 'Source-of-truth JSON
',
- '',
- '~~~json',
- JSON.stringify(sourceOfTruth, null, 2),
- '~~~',
- '',
- ' ',
- )
+ jq -n \
+ --slurpfile current "$current_json" \
+ --slurpfile baseline "$baseline_json" \
+ --argjson schemaVersion 1 \
+ --arg mode "$mode" \
+ --arg currentDir "$current_dir" \
+ --arg baselineDir "$baseline_dir" \
+ '
+ def identity_dimensions:  # Renders the input .dimensions object as one "key=value,key=value" string.
+ (.dimensions // {})  # a null or missing .dimensions is treated as an empty object
+ | to_entries
+ | map(select(.key as $key | ["devenvRev", "otelServiceName", "status", "probeLabel", "sampleCount", "measuredSampleCount"] | index($key) | not))  # entries whose key appears in this list are dropped
+ | sort_by(.key)  # order the remaining entries by key
+ | map("\(.key)=\(.value|tostring)")  # every value is stringified before formatting
+ | join(",");
- summaryLines.push('', marker, statePrefix + JSON.stringify(state, null, 2) + stateSuffix)
- writeFileSync(bodyPath, summaryLines.join('\n') + '\n')
- writeFileSync(commentIdPath, existing?.id ? String(existing.id) : '')
- EOF
+ def observation_key($doc):  # Builds a string key for the input observation, using target fields from $doc.
+ [
+ ($doc.target.kind // "unknown"),  # each field falls back to "unknown" when null or missing
+ ($doc.target.id // $doc.target.name // "unknown"),  # target id is preferred over target name
+ ($doc.target.system // "unknown"),
+ (.id // .name // "unknown"),  # observation id is preferred over observation name
+ (.unit // "unknown"),
+ identity_dimensions
+ ] | join("|");  # the six parts are joined with "|"
- node "$renderer_script" "$comparison_file" "$comments_json" "$comment_body" "$comment_id_file" "$chart_file" "$chart_dark_file"
+ def median:  # Median of the input array; yields null for an empty array.
+ sort as $sorted
+ | ($sorted | length) as $count
+ | if $count == 0 then null
+ elif ($count % 2) == 1 then $sorted[($count / 2 | floor)]  # odd length: the middle element
+ else (($sorted[($count / 2 - 1)] + $sorted[($count / 2)]) / 2)  # even length: mean of the two middle elements
+ end;
- if [ -s "$chart_file" ]; then
- if [ "$require_public_asset" = "true" ] && [ -z "$public_asset_command" ]; then
- echo "::error::CI measurement chart was rendered for a private repository, but CI_MEASUREMENT_PR_COMMENT_PUBLIC_ASSET_COMMAND is not configured. Private raw GitHub URLs cannot be embedded in PR comments."
- exit 1
- fi
+ def percentile($p):  # Picks the element of the sorted input array at the position given by fraction $p; null when empty.
+ sort as $sorted
+ | ($sorted | length) as $count
+ | if $count == 0 then null
+ else $sorted[(($p * ($count - 1)) | floor)]  # index is floor($p * (count - 1)); no interpolation between neighbours
+ end;
- if ensure_ci_measurement_tool resvg resvg; then
- resvg_font_args=()
- if command -v nix >/dev/null 2>&1; then
- if font_out="$(nix build --no-link --print-out-paths nixpkgs#dejavu_fonts 2>/dev/null)"; then
- resvg_font_args+=(--use-fonts-dir "$font_out/share/fonts/truetype")
- fi
- fi
- if ! resvg --background '#ffffff' "${resvg_font_args[@]}" "$chart_file" "$chart_png_file"; then
- echo "::notice::unable to render CI measurement chart PNG"
- rm -f "$chart_png_file"
- fi
- if [ -s "$chart_dark_file" ] && ! resvg --background '#0d1117' "${resvg_font_args[@]}" "$chart_dark_file" "$chart_dark_png_file"; then
- echo "::notice::unable to render dark CI measurement chart PNG"
- rm -f "$chart_dark_png_file"
- fi
- else
- echo "::notice::resvg is not available; skipping embedded CI measurement chart PNG"
- fi
+ def abs_value: if . < 0 then -. else . end;  # Absolute value: negates the input when it is below zero.
- if ! gh api "repos/$repo/git/ref/heads/$asset_branch" >/dev/null 2>&1; then
- default_branch_sha="$(gh api "repos/$repo/git/ref/heads/${GITHUB_BASE_REF:-main}" --jq '.object.sha' 2>/dev/null || true)"
- if [ -z "$default_branch_sha" ]; then
- default_branch_sha="${GITHUB_SHA:-}"
- fi
- if [ -n "$default_branch_sha" ]; then
- gh api "repos/$repo/git/refs" --method POST --field ref="refs/heads/$asset_branch" --field sha="$default_branch_sha" >/dev/null || true
- fi
- fi
- chart_content="$(base64 <"$chart_file" | tr -d '\n')"
- if ! gh api "repos/$repo/contents/$asset_svg_path" --method PUT --field message="Update CI measurement chart SVG for PR #$pr_number" --field content="$chart_content" --field branch="$asset_branch" >/dev/null; then
- echo "::notice::unable to upload CI measurement chart SVG asset"
- if [ -z "$public_asset_command" ]; then
- sed -i.bak '/\[SVG source\]/d' "$comment_body"
- fi
- fi
- if [ -s "$chart_png_file" ]; then
- chart_png_content="$(base64 <"$chart_png_file" | tr -d '\n')"
- if ! gh api "repos/$repo/contents/$asset_png_path" --method PUT --field message="Update CI measurement chart PNG for PR #$pr_number" --field content="$chart_png_content" --field branch="$asset_branch" >/dev/null; then
- echo "::notice::unable to upload CI measurement chart PNG asset"
- if [ -z "$public_asset_command" ]; then
- sed -i.bak '/!\[Measurement change vs baseline chart\]/d; /!\[Perf change vs baseline chart\]/d; //,/<\\/picture>/d' "$comment_body"
- fi
- fi
- else
- sed -i.bak '/!\[Measurement change vs baseline chart\]/d; /!\[Perf change vs baseline chart\]/d; //,/<\\/picture>/d' "$comment_body"
- fi
- if [ -s "$chart_dark_png_file" ]; then
- chart_dark_png_content="$(base64 <"$chart_dark_png_file" | tr -d '\n')"
- if ! gh api "repos/$repo/contents/$asset_dark_png_path" --method PUT --field message="Update dark CI measurement chart PNG for PR #$pr_number" --field content="$chart_dark_png_content" --field branch="$asset_branch" >/dev/null; then
- echo "::notice::unable to upload dark CI measurement chart PNG asset"
- if [ -z "$public_asset_command" ]; then
- export CI_MEASUREMENT_PR_COMMENT_CHART_DARK_URL=""
- node "$renderer_script" "$comparison_file" "$comments_json" "$comment_body" "$comment_id_file" "$chart_file" "$chart_dark_file"
- fi
- fi
- fi
+ def observations_by_key($docs):  # Groups the observations of all $docs into an object keyed by observation_key.
+ reduce $docs[]? as $doc
+ ({};
+ reduce (($doc.observations // [])[]? | select(.value | type == "number")) as $obs  # only observations with a numeric .value are kept
+ (.;
+ ($obs | observation_key($doc)) as $key
+ | .[$key] = ((.[$key] // []) + [{  # append to the list stored under this key, starting a new list if needed
+ target: $doc.target,
+ observation: $obs,
+ generatedAt: $doc.generatedAt
+ }])
+ )
+ );
- if [ -n "$public_asset_command" ] && [ -s "$chart_png_file" ]; then
- if public_chart_url="$(bash -c "$public_asset_command" _ "$chart_png_file" png)" && [ -n "$public_chart_url" ]; then
- chart_url="$public_chart_url"
- export CI_MEASUREMENT_PR_COMMENT_CHART_URL="$chart_url"
- else
- echo "::notice::unable to publish CI measurement chart PNG to public asset host"
- export CI_MEASUREMENT_PR_COMMENT_CHART_URL=""
- fi
- if [ -s "$chart_dark_png_file" ] && public_chart_dark_url="$(bash -c "$public_asset_command" _ "$chart_dark_png_file" png)" && [ -n "$public_chart_dark_url" ]; then
- chart_dark_url="$public_chart_dark_url"
- export CI_MEASUREMENT_PR_COMMENT_CHART_DARK_URL="$chart_dark_url"
- else
- echo "::notice::unable to publish dark CI measurement chart PNG to public asset host"
- export CI_MEASUREMENT_PR_COMMENT_CHART_DARK_URL=""
- fi
- if public_chart_source_url="$(bash -c "$public_asset_command" _ "$chart_file" svg)" && [ -n "$public_chart_source_url" ]; then
- chart_source_url="$public_chart_source_url"
- export CI_MEASUREMENT_PR_COMMENT_CHART_SOURCE_URL="$chart_source_url"
- else
- echo "::notice::unable to publish CI measurement chart SVG to public asset host"
- export CI_MEASUREMENT_PR_COMMENT_CHART_SOURCE_URL=""
- fi
- if [ "$require_public_asset" = "true" ] && [ -z "$chart_url" ]; then
- echo "::error::unable to publish CI measurement chart PNG to a public asset host for private repository $repo"
- exit 1
- fi
- if [ "$require_public_asset" = "true" ] && [ -s "$chart_dark_png_file" ] && [ -z "$chart_dark_url" ]; then
- echo "::error::unable to publish dark CI measurement chart PNG to a public asset host for private repository $repo"
- exit 1
- fi
- node "$renderer_script" "$comparison_file" "$comments_json" "$comment_body" "$comment_id_file" "$chart_file" "$chart_dark_file"
- fi
- fi
+ def observation_stats($items):  # Collapses the $items list (entries with target, observation, generatedAt) into one summary object.
+ ($items | map(.observation.value)) as $values  # the .observation.value of every item
+ | ($items | map(.observation.comparison.baseline // empty)) as $pairedBaselineValues  # "// empty" skips items where the field is null or missing
+ | ($items | map(.observation.statistics.pairedDeltaMedian // empty)) as $pairedDeltaMedianValues
+ | ($items | map(.observation.statistics.pairedDeltaP25 // empty)) as $pairedDeltaP25Values
+ | ($items | map(.observation.statistics.pairedDeltaP75 // empty)) as $pairedDeltaP75Values
+ | ($items | map(.observation.statistics.pairedDeltaMad // empty)) as $pairedDeltaMadValues
+ | ($items | map(.observation.statistics.pairedDeltaSamples // []) | add // []) as $pairedDeltaSampleValues  # concatenation of the per-item sample arrays
+ | ($items | map(.observation.statistics.measuredSampleCount // .observation.statistics.sampleCount // 1) | add // ($items | length)) as $sampleCount  # sum of per-item counts; an item without either count contributes 1
+ | ($values | median) as $median
+ | {
+ target: ($items[0].target // {}),  # target is taken from the first item
+ observation: ($items[-1].observation // {}),  # observation is taken from the last item
+ measurementKind: ($items[-1].observation.measurementKind // null),
+ value: $median,  # the reported value is the median across items
+ min: ($values | min),
+ max: ($values | max),
+ p25: ($values | percentile(0.25)),
+ p75: ($values | percentile(0.75)),
+ p95: ($values | percentile(0.95)),
+ mad: ($values | map(. - $median | if . < 0 then -. else . end) | median),  # median of the absolute deviations from the median
+ sourceCount: ($items | length),  # number of items that were merged
+ sampleCount: $sampleCount,
+ pairedSampleCount: ($items | map(.observation.statistics.pairedSampleCount // .observation.comparison.pairedSampleCount // 0) | add // 0),
+ pairedBaselineValue: (if ($pairedBaselineValues | length) == 0 then null else ($pairedBaselineValues | median) end),  # each paired* field is null when no item supplied it, otherwise the median
+ pairedDeltaMedianValue: (if ($pairedDeltaMedianValues | length) == 0 then null else ($pairedDeltaMedianValues | median) end),
+ pairedDeltaP25Value: (if ($pairedDeltaP25Values | length) == 0 then null else ($pairedDeltaP25Values | median) end),
+ pairedDeltaP75Value: (if ($pairedDeltaP75Values | length) == 0 then null else ($pairedDeltaP75Values | median) end),
+ pairedDeltaMadValue: (if ($pairedDeltaMadValues | length) == 0 then null else ($pairedDeltaMadValues | median) end),
+ pairedDeltaSampleValues: $pairedDeltaSampleValues,
+ generatedAt: ($items[-1].generatedAt // null)  # timestamp of the last item
+ };
- comment_id="$(cat "$comment_id_file")"
- comment_payload_file="$comment_body.payload.json"
- node -e "const fs=require('node:fs'); fs.writeFileSync(process.argv[2], JSON.stringify({ body: fs.readFileSync(process.argv[1], 'utf8') }))" "$comment_body" "$comment_payload_file"
- if [ -n "$comment_id" ]; then
- if ! gh api "repos/$repo/issues/comments/$comment_id" --method PATCH --input "$comment_payload_file" >/dev/null; then
- echo "::notice::unable to update CI measurement PR comment"
- fi
+ def budget($metric; $unit):  # Returns the threshold object for $metric, falling back on $unit and then on a generic default.
+ if $metric == "nix.closure.nar_size" then
+ {warnRatio:1.05, failRatio:1.10, warnAbs:52428800, failAbs:209715200, statisticalToleranceRatio:0.02, statisticalToleranceAbs:10485760}  # 52428800 = 50 * 1024 * 1024, 209715200 = 200 * 1024 * 1024, 10485760 = 10 * 1024 * 1024 (presumably bytes; confirm)
+ elif $metric == "nix.closure.bucket.nar_size" then
+ {warnRatio:1.10, failRatio:1.20, warnAbs:52428800, failAbs:209715200, statisticalToleranceRatio:0.05, statisticalToleranceAbs:10485760}  # same absolute values as nix.closure.nar_size, with larger ratios
+ elif $metric == "nix.closure.path_count" then
+ {warnRatio:1.05, failRatio:1.10, warnAbs:100, failAbs:500, statisticalToleranceRatio:0.02, statisticalToleranceAbs:10}
+ elif $unit == "seconds" then  # metric ids are matched first; the unit is only consulted afterwards
+ {warnRatio:1.10, failRatio:1.20, warnAbs:0.25, failAbs:1, statisticalToleranceRatio:0.10, statisticalToleranceAbs:0.25}
else
- if ! gh api "repos/$repo/issues/$pr_number/comments" --method POST --input "$comment_payload_file" >/dev/null; then
- echo "::notice::unable to create CI measurement PR comment"
- fi
- fi
- fi
- fi
- fi
-
- if [ "$exit_code" -ne 0 ]; then
- exit "$exit_code"
- fi
+ {warnRatio:1.25, failRatio:1.50, warnAbs:1, failAbs:3, statisticalToleranceRatio:0.10, statisticalToleranceAbs:1}  # default for any other metric and unit
+ end;
- - name: Upload devenv perf artifacts
- if: always()
- uses: actions/upload-artifact@v4
- with:
- name: devenv-perf
- path: |
- tmp/devenv-perf-ci
- !tmp/devenv-perf-ci/baseline/**
- if-no-files-found: error
- retention-days: 30
- timeout-minutes: 30
- concurrency:
- group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-devenv-perf"
- cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
- nix-closure-sizes:
- if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
- runs-on:
- [namespace-profile-linux-x86-64, 'namespace-features:github.run-id=${{ github.run_id }}']
- timeout-minutes: 30
- defaults:
- run:
- shell: bash
- permissions:
- actions: read
- contents: write
- issues: write
- pull-requests: write
- env:
- CI_MEASUREMENT_SUBJECT_REF: ${{ inputs.measurement_baseline_ref || github.event.pull_request.head.ref || github.ref }}
- CI_MEASUREMENT_SUBJECT_SHA: ${{ inputs.measurement_baseline_ref || github.event.pull_request.head.sha || github.sha }}
- CI_MEASUREMENT_SUBJECT_LABEL: ${{ inputs.measurement_baseline_label }}
- CI_MEASUREMENT_ALLOW_PROBE_FAILURES: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && '1' || '' }}
- steps:
- - uses: actions/checkout@v6
- - name: Checkout CI measurement baseline ref
- if: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' }}
- uses: actions/checkout@v6
- with:
- ref: ${{ inputs.measurement_baseline_ref }}
- - name: Install Nix
- uses: DeterminateSystems/determinate-nix-action@v3
- with:
- extra-conf: |
- experimental-features = nix-command flakes
- accept-flake-config = true
- extra-substituters = https://devenv.cachix.org
- extra-trusted-public-keys = devenv.cachix.org-1:w1cLUi8dv3hnoSPGAuibQv+f9TZLr6cv/Hm9XgU50cw=
- access-tokens = github.com=${{ github.token }}
- summarize: true
- - name: Provide cachix CLI from nixpkgs
- shell: bash
- run: |
- set -euo pipefail
- out=$(nix build --no-link --print-out-paths nixpkgs#cachix)
- echo "$out/bin" >> "$GITHUB_PATH"
- - name: Enable Cachix cache
- uses: cachix/cachix-action@v17
- with:
- name: overeng-effect-utils
- authToken: ${{ secrets.CACHIX_AUTH_TOKEN }}
- - name: Use pinned devenv from lock
- run: |
- DEVENV_REV=$(jq -r .nodes.devenv.locked.rev devenv.lock)
- if [ -z "$DEVENV_REV" ] || [ "$DEVENV_REV" = "null" ]; then
- echo '::error::devenv.lock missing .nodes.devenv.locked.rev'
- exit 1
- fi
- echo "DEVENV_REV=$DEVENV_REV" >> "$GITHUB_ENV"
- echo "Pinned devenv rev: $DEVENV_REV"
- shell: bash
- - name: Isolate pnpm state
- shell: bash
- run: |
- echo "PNPM_STORE_DIR=${{ runner.temp }}/pnpm-store/${{ github.job }}" >> "$GITHUB_ENV"
- echo "PNPM_HOME=${{ github.workspace }}/.pnpm-home" >> "$GITHUB_ENV"
- - id: restore-pnpm-state
- name: Restore pnpm state
- uses: actions/cache/restore@v4
- with:
- path: |
- ${{ github.workspace }}/.pnpm-home
- ${{ runner.temp }}/pnpm-store/${{ github.job }}
- key: "pnpm-state-v1-${{ runner.os }}-${{ runner.arch }}-${{ hashFiles('**/pnpm-lock.yaml') }}"
- - name: Resolve devenv
- run: |
- DEVENV_REV=$(jq -r .nodes.devenv.locked.rev devenv.lock)
- if [ -z "$DEVENV_REV" ] || [ "$DEVENV_REV" = "null" ]; then
- echo '::error::devenv.lock missing .nodes.devenv.locked.rev'
- exit 1
- fi
-
- resolve_devenv() {
- nix build \
- --accept-flake-config \
- --option extra-substituters https://devenv.cachix.org \
- --option extra-trusted-public-keys devenv.cachix.org-1:w1cLUi8dv3hnoSPGAuibQv+f9TZLr6cv/Hm9XgU50cw= \
- --no-link \
- --print-out-paths \
- "github:cachix/devenv/$DEVENV_REV#devenv"
- }
-
- # Temporary: capture diagnostics dir for #272 root-cause analysis.
- DIAG_ROOT="${RUNNER_TEMP:-/tmp}/nix-store-diagnostics-${GITHUB_JOB:-job}-${RUNNER_OS:-unknown}-${GITHUB_RUN_ATTEMPT:-0}"
- mkdir -p "$DIAG_ROOT"
- echo "NIX_STORE_DIAGNOSTICS_DIR=$DIAG_ROOT" >> "$GITHUB_ENV"
-
- {
- echo "timestamp_utc=$(date -u +%Y-%m-%dT%H:%M:%SZ)"
- echo "runner_name=${RUNNER_NAME:-unknown}"
- echo "runner_os=${RUNNER_OS:-unknown}"
- echo "runner_arch=${RUNNER_ARCH:-unknown}"
- echo "github_job=${GITHUB_JOB:-unknown}"
- echo "github_run_id=${GITHUB_RUN_ID:-unknown}"
- echo "nix_user_conf_files=${NIX_USER_CONF_FILES:-}"
- nix --version || true
- } > "$DIAG_ROOT/environment.txt" 2>&1
-
- if ! DEVENV_OUT=$(resolve_devenv 2> >(tee "$DIAG_ROOT/resolve-devenv.log" >&2)); then
- echo "::error::resolve_devenv failed. Last 30 lines of log:"
- tail -30 "$DIAG_ROOT/resolve-devenv.log" || true
- exit 1
- fi
- DEVENV_BIN="$DEVENV_OUT/bin/devenv"
-
- # Fast validity check on the devenv store path (~1-2s vs ~25s for devenv info).
- if ! nix-store --check-validity "$DEVENV_OUT" 2>/dev/null; then
- echo "::warning::devenv store path invalid, repairing targeted path..."
- nix-store --repair-path "$DEVENV_OUT" > "$DIAG_ROOT/nix-store-verify-repair.log" 2>&1 || true
- rm -rf "${XDG_CACHE_HOME:-$HOME/.cache}"/nix/eval-cache-* ~/.cache/nix/eval-cache-*
- if ! DEVENV_OUT=$(resolve_devenv 2> >(tee "$DIAG_ROOT/resolve-devenv-post-repair.log" >&2)); then
- echo "::error::resolve_devenv failed after repair. Last 30 lines of log:"
- tail -30 "$DIAG_ROOT/resolve-devenv-post-repair.log" || true
- exit 1
- fi
- DEVENV_BIN="$DEVENV_OUT/bin/devenv"
- fi
-
- echo "DEVENV_BIN=$DEVENV_BIN" >> "$GITHUB_ENV"
- "$DEVENV_BIN" version | tee "$DIAG_ROOT/devenv-version.txt"
- shell: bash
- - name: Evict cached pnpm deps for oxlint-npm
- shell: bash
- run: |
- targetRef='.#oxlint-npm'
- entriesJson=$(mktemp)
- if nix eval --json "$targetRef.passthru.depsBuildEntries" >"$entriesJson" 2>/dev/null; then
- while IFS=$'\t' read -r attrName drv; do
- [ -n "$drv" ] || continue
- while IFS= read -r outPath; do
- [ -n "$outPath" ] || continue
- if nix path-info "$outPath" >/dev/null 2>&1; then
- echo "evicting cached: $(basename "$outPath")"
- if ! nix store delete --ignore-liveness "$outPath" >/dev/null 2>&1; then
- echo "::error::failed to evict cached pnpm-deps output: $outPath"
- exit 1
- fi
- if nix path-info "$outPath" >/dev/null 2>&1; then
- echo "::error::cached pnpm-deps output still present after eviction: $outPath"
- exit 1
- fi
- fi
- done < <(nix-store -q --outputs "$drv" 2>/dev/null || true)
- done < <(jq -r '.[] | [.attrName, (.drvPath // "")] | @tsv' "$entriesJson")
- else
- topDrv=$(nix path-info --derivation "$targetRef" 2>/dev/null || true)
- if [ -n "$topDrv" ]; then
- while IFS= read -r drv; do
- [ -n "$drv" ] || continue
- attrName=""
- while IFS= read -r outPath; do
- [ -n "$outPath" ] || continue
- if nix path-info "$outPath" >/dev/null 2>&1; then
- echo "evicting cached: $(basename "$outPath")"
- if ! nix store delete --ignore-liveness "$outPath" >/dev/null 2>&1; then
- echo "::error::failed to evict cached pnpm-deps output: $outPath"
- exit 1
- fi
- if nix path-info "$outPath" >/dev/null 2>&1; then
- echo "::error::cached pnpm-deps output still present after eviction: $outPath"
- exit 1
- fi
- fi
- done < <(nix-store -q --outputs "$drv" 2>/dev/null || true)
- done < <(nix-store -qR "$topDrv" 2>/dev/null | grep "pnpm-deps-[a-z0-9-]*-v[0-9].*\.drv$" || true)
- fi
- fi
- rm -f "$entriesJson"
- - name: Force diagnostics failure (debug)
- if: ${{ github.event_name == 'workflow_dispatch' && (inputs.debug_force_nix_diagnostics_failure == true || inputs.debug_force_nix_diagnostics_failure == 'true') }}
- shell: bash
- run: |
- diag_dir="${NIX_STORE_DIAGNOSTICS_DIR:-${RUNNER_TEMP:-/tmp}/nix-store-diagnostics-missing}"
- mkdir -p "$diag_dir"
- cat > "$diag_dir/synthetic-signature.log" <<'EOF'
- Failed to convert config.cachix to JSON
- ... while evaluating the option `cachix.package`
- error: path '/nix/store/synthetic-invalid-path' is not valid
- EOF
- echo "::warning::Intentional failure for diagnostics validation (#272)"
- exit 1
- - name: 'Download previous artifact: nix-closure-measurements'
- shell: bash
- env:
- GH_TOKEN: ${{ github.token }}
- BASELINE_ARTIFACT_NAME: nix-closure-measurements
- BASELINE_OUTPUT_DIR: tmp/nix-closure-ci/baseline
- BASELINE_WORKFLOW_NAME: ${{ github.workflow }}
- BASELINE_BRANCH: ${{ github.base_ref || github.ref_name }}
- BASELINE_SEED_RUNS_JSON: '[]'
- BASELINE_MAX_RUNS: '20'
- BASELINE_MAX_CANDIDATE_RUNS: '60'
- BASELINE_REQUIRED_OBSERVATIONS_JSON: '[]'
- run: |
- set -euo pipefail
-
- mkdir -p "$BASELINE_OUTPUT_DIR"
-
- if command -v gh >/dev/null 2>&1; then
- GH_BIN="$(command -v gh)"
- else
- echo "::notice::gh is not on PATH; resolving GitHub CLI through Nix"
- if ! GH_BIN="$(nix build --no-link --print-out-paths nixpkgs#gh 2>/dev/null)/bin/gh"; then
- echo "::notice::unable to resolve GitHub CLI through Nix; skipping previous artifact download"
- exit 0
- fi
- fi
- echo "Using GitHub CLI: $GH_BIN"
-
- repo="${GITHUB_REPOSITORY:?GITHUB_REPOSITORY not set}"
- workflow="${BASELINE_WORKFLOW_NAME:-CI}"
- branch="${BASELINE_BRANCH:-${GITHUB_BASE_REF:-${GITHUB_REF_NAME:-main}}}"
- seed_runs_file="$BASELINE_OUTPUT_DIR/baseline-seed-runs.json"
- required_observations_file="$BASELINE_OUTPUT_DIR/baseline-required-observations.json"
- printf '%s' "${BASELINE_SEED_RUNS_JSON:-[]}" >"$seed_runs_file"
- printf '%s' "${BASELINE_REQUIRED_OBSERVATIONS_JSON:-[]}" >"$required_observations_file"
- if ! jq -e 'if type == "array" then all(.[]; type == "object" and (.runId | type == "string")) else false end' \
- "$seed_runs_file" >/dev/null; then
- echo "::error::BASELINE_SEED_RUNS_JSON must be an array of objects with string runId fields"
- exit 1
- fi
- if ! jq -e 'if type == "array" then all(.[]; type == "object" and (.id | type == "string") and (.minSources | type == "number")) else false end' \
- "$required_observations_file" >/dev/null; then
- echo "::error::BASELINE_REQUIRED_OBSERVATIONS_JSON must be an array of objects with string id and numeric minSources fields"
- exit 1
- fi
- seed_run_ids="$(jq -r '.[].runId' "$seed_runs_file")"
- required_observation_count="$(jq 'length' "$required_observations_file")"
- max_candidate_runs="${BASELINE_MAX_CANDIDATE_RUNS:-${BASELINE_MAX_RUNS:-5}}"
- if ! [[ "$max_candidate_runs" =~ ^[0-9]+$ ]] || [ "$max_candidate_runs" -lt 1 ]; then
- max_candidate_runs=1
- fi
-
- candidate_runs="$(
- "$GH_BIN" run list \
- --repo "$repo" \
- --workflow "$workflow" \
- --branch "$branch" \
- --event push \
- --status success \
- --json databaseId,headSha \
- --limit "$max_candidate_runs" \
- --jq '[.[] | select(.headSha != env.GITHUB_SHA) | .databaseId] | .[]'
- )"
-
- candidate_runs="$seed_run_ids
- $candidate_runs"
-
- max_runs="${BASELINE_MAX_RUNS:-5}"
- if ! [[ "$max_runs" =~ ^[0-9]+$ ]] || [ "$max_runs" -lt 1 ]; then
- max_runs=1
- fi
-
- write_baseline_observation_counts() {
- local measurement_index="$BASELINE_OUTPUT_DIR/baseline-measurement-files.txt"
- local counts_file="$BASELINE_OUTPUT_DIR/baseline-observation-counts.json"
- find "$BASELINE_OUTPUT_DIR" \
- -mindepth 2 \
- -maxdepth 2 \
- -name measurements.json \
- -type f \
- -print \
- | sort >"$measurement_index" || true
-
- if [ -s "$measurement_index" ]; then
- xargs -r jq -s \
- --slurpfile required "$required_observations_file" \
- '
- ([.[] | (.observations // [])[]? | select(.value | type == "number") | .id] | sort | group_by(.) | map({id: .[0], sources: length})) as $counts
- | ($required[0] // []) as $requiredRows
- | {
- counts: $counts,
- required: (
- $requiredRows
- | map(. as $requiredRow | ($counts | map(select(.id == $requiredRow.id)) | .[0].sources // 0) as $actual | $requiredRow + {sources:$actual, satisfied:($actual >= $requiredRow.minSources)})
- )
- }
- ' <"$measurement_index" >"$counts_file"
- else
- jq -n --slurpfile required "$required_observations_file" \
- '{counts: [], required: (($required[0] // []) | map(. + {sources:0, satisfied:false}))}' >"$counts_file"
- fi
- }
-
- baseline_requirements_satisfied() {
- if [ "$required_observation_count" -eq 0 ]; then
- return 1
- fi
- write_baseline_observation_counts
- jq -e '.required | all(.satisfied == true)' "$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" >/dev/null
- }
-
- run_id=""
- artifact_name=""
- artifact_id=""
- downloaded_runs_file="$BASELINE_OUTPUT_DIR/baseline-runs.jsonl"
- seen_runs_file="$BASELINE_OUTPUT_DIR/baseline-seen-runs.txt"
- : >"$downloaded_runs_file"
- : >"$seen_runs_file"
- for candidate_run in $candidate_runs; do
- if [ -z "$candidate_run" ]; then
- continue
- fi
- if grep -qxF "$candidate_run" "$seen_runs_file"; then
- continue
- fi
- downloaded_count="$(wc -l <"$downloaded_runs_file" | tr -d ' ')"
- if [ "$downloaded_count" -ge "$max_runs" ]; then
- if baseline_requirements_satisfied; then
- break
- fi
- echo "::notice::downloaded $downloaded_count baseline artifact(s), but required observation counts are not satisfied yet; continuing through bounded candidate history"
- fi
- if [ "$(wc -l <"$seen_runs_file" | tr -d ' ')" -ge "$max_candidate_runs" ]; then
- break
- fi
- printf '%s\n' "$candidate_run" >>"$seen_runs_file"
-
- artifact_json="$(
- "$GH_BIN" api "repos/$repo/actions/runs/$candidate_run/artifacts" \
- | jq --arg artifactName "$BASELINE_ARTIFACT_NAME" '.artifacts
- | map(select(.expired == false))
- | map(select(.name == $artifactName or (.name | startswith($artifactName + "-"))))
- | sort_by(.created_at // "")
- | reverse
- | .[0] // empty'
- )"
-
- if [ -n "$artifact_json" ]; then
- current_artifact_name="$(printf '%s' "$artifact_json" | jq -r '.name')"
- current_artifact_id="$(printf '%s' "$artifact_json" | jq -r '.id')"
- current_output_dir="$BASELINE_OUTPUT_DIR/run-$candidate_run"
- mkdir -p "$current_output_dir"
- if "$GH_BIN" run download "$candidate_run" \
- --repo "$repo" \
- --name "$current_artifact_name" \
- --dir "$current_output_dir"; then
- if [ -z "$run_id" ]; then
- run_id="$candidate_run"
- artifact_name="$current_artifact_name"
- artifact_id="$current_artifact_id"
- fi
- jq -cn \
- --arg runId "$candidate_run" \
- --arg artifactName "$current_artifact_name" \
- --arg artifactId "$current_artifact_id" \
- --arg path "run-$candidate_run" \
- '{runId:$runId, artifactName:$artifactName, artifactId:$artifactId, path:$path}' \
- >>"$downloaded_runs_file"
- else
- echo "::notice::failed to download baseline artifact $current_artifact_name from run $candidate_run"
- fi
- fi
- done
-
- write_baseline_observation_counts
-
- if [ -z "$run_id" ] || [ -z "$artifact_name" ]; then
- echo "::notice::no successful baseline run found for $repo workflow=$workflow branch=$branch"
- exit 0
- fi
-
- jq -n \
- --slurpfile runs "$downloaded_runs_file" \
- --slurpfile seedRuns "$seed_runs_file" \
- --slurpfile observationCounts "$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" \
- --argjson schemaVersion 1 \
- --arg repository "$repo" \
- --arg workflow "$workflow" \
- --arg branch "$branch" \
- --arg runId "$run_id" \
- --arg artifactName "$artifact_name" \
- --arg artifactId "$artifact_id" \
- '{
- schemaVersion: $schemaVersion,
- source: "github-actions-artifact",
- repository: $repository,
- workflow: $workflow,
- branch: $branch,
- runId: $runId,
- artifactName: $artifactName,
- artifactId: $artifactId,
- seedRuns: ($seedRuns[0] // []),
- runs: $runs,
- observationCounts: ($observationCounts[0] // null)
- }' >"$BASELINE_OUTPUT_DIR/baseline-provenance.json"
-
- echo "Downloaded $(wc -l <"$downloaded_runs_file" | tr -d ' ') baseline artifact(s), latest $artifact_name from run $run_id into $BASELINE_OUTPUT_DIR"
-
- - name: 'Measure Nix closure: genie'
- shell: bash
- env:
- ARTIFACT_DIR: tmp/nix-closure-ci/current/genie_package
- RUNNER_CLASS: '${{ runner.os }}-${{ runner.arch }}'
- run: |
- set -euo pipefail
-
- mkdir -p "$ARTIFACT_DIR"
- installable='.#genie'
- target_id='genie_package'
- target_name='genie'
- target_label='Genie package'
- target_group='packages'
- target_description='the packaged Genie CLI closure'
- artifact_file="$ARTIFACT_DIR/measurements.json"
- target_system='x86_64-linux'
-
- out_path="$(nix build --no-update-lock-file --no-link --print-out-paths "$installable")"
- path_info="$ARTIFACT_DIR/nix-closure-path-info.json"
- paths_file="$ARTIFACT_DIR/nix-closure-paths.json"
-
- nix path-info --recursive --json "$out_path" >"$path_info"
- jq 'to_entries | map({ path: .key, narSize: (.value.narSize // 0) })' "$path_info" >"$paths_file"
-
- jq -n \
- --slurpfile paths "$paths_file" \
- --argjson schemaVersion 1 \
- --arg generatedAt "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
- --arg repository "${GITHUB_REPOSITORY:-unknown}" \
- --arg branchKind "${GITHUB_EVENT_NAME:-unknown}" \
- --arg ref "${CI_MEASUREMENT_SUBJECT_REF:-${GITHUB_REF:-unknown}}" \
- --arg headSha "${CI_MEASUREMENT_SUBJECT_SHA:-${GITHUB_SHA:-unknown}}" \
- --arg baseSha "${GITHUB_BASE_SHA:-}" \
- --arg runnerName "${RUNNER_NAME:-unknown}" \
- --arg runnerOs "${RUNNER_OS:-unknown}" \
- --arg runnerArch "${RUNNER_ARCH:-unknown}" \
- --arg runnerClass "${RUNNER_CLASS:-unknown}" \
- --arg githubRunId "${GITHUB_RUN_ID:-unknown}" \
- --arg githubRunAttempt "${GITHUB_RUN_ATTEMPT:-unknown}" \
- --arg githubJob "${GITHUB_JOB:-unknown}" \
- --arg taskId "${CROSSTASK_TASK_ID:-}" \
- --arg taskAttemptId "${CROSSTASK_ATTEMPT_ID:-}" \
- --arg traceId "${TRACE_ID:-}" \
- --arg targetName "$target_name" \
- --arg targetId "$target_id" \
- --arg targetLabel "$target_label" \
- --arg targetGroup "$target_group" \
- --arg targetDescription "$target_description" \
- --arg targetSystem "$target_system" \
- --arg outPath "$out_path" \
- --argjson buckets '[{"name":"node","label":"Node / pnpm","pathRegex":"node_modules|npm-deps|pnpm"},{"name":"nix-sources","label":"Nix sources","pathRegex":"-source$"},{"name":"rust","label":"Rust","pathRegex":"cargo|rust|rustc"}]' \
- --argjson targetPath '["nix","closures","packages","genie"]' \
- --argjson gatePolicy '{}' \
- '
- ($paths[0] // []) as $closurePaths
- | ($closurePaths | map(.narSize) | add // 0) as $totalNarSize
- | ($closurePaths | length) as $pathCount
- | ($buckets | map(
- . as $bucket
- | {
- name: "nix.closure.bucket.nar_size",
- id: "nix.closure.bucket.nar_size",
- label: (($bucket.label // $bucket.name) + " closure size"),
- group: "nix closure buckets",
- path: ($targetPath + ["buckets", $bucket.name]),
- description: ("NAR size contributed by closure paths matching " + $bucket.pathRegex),
- measurementKind: "deterministic",
- unit: "bytes",
- value: (
- $closurePaths
- | map(select(.path | test($bucket.pathRegex)) | .narSize)
- | add // 0
- ),
- policy: $gatePolicy,
- dimensions: { bucket: $bucket.name }
- }
- )) as $bucketObservations
- | {
- schemaVersion: $schemaVersion,
- generatedAt: $generatedAt,
- producer: { name: "effect-utils-ci-measurement", version: 1 },
- subject: {
- repo: $repository,
- branchKind: (if $branchKind == "" then "unknown" else $branchKind end),
- ref: $ref,
- headSha: $headSha,
- baseSha: $baseSha
- },
- execution: {
- provider: (if ($githubRunId != "" and $githubRunId != "unknown") then "github-actions" else "local" end),
- workflow: "CI",
- job: $githubJob,
- runId: $githubRunId,
- runAttempt: $githubRunAttempt,
- taskId: $taskId,
- attemptId: $taskAttemptId,
- traceId: $traceId,
- runner: { name: $runnerName, os: $runnerOs, arch: $runnerArch, class: $runnerClass }
- },
- target: { kind: "nix-closure", id: $targetId, name: $targetName, label: $targetLabel, group: $targetGroup, path: $targetPath, system: $targetSystem },
- observations: ([
- {
- id: "nix.closure.nar_size",
- label: "Total closure size",
- group: "nix closure",
- path: ($targetPath + ["total", "nar-size"]),
- description: ("Total NAR size for all paths in " + $targetDescription),
- name: "nix.closure.nar_size",
- measurementKind: "deterministic",
- unit: "bytes",
- value: $totalNarSize,
- policy: $gatePolicy,
- dimensions: { bucket: "total" }
- },
- {
- id: "nix.closure.path_count",
- label: "Total closure path count",
- group: "nix closure",
- path: ($targetPath + ["total", "path-count"]),
- description: ("Number of store paths in " + $targetDescription),
- name: "nix.closure.path_count",
- measurementKind: "deterministic",
- unit: "count",
- value: $pathCount,
- policy: $gatePolicy,
- dimensions: { bucket: "total" }
- }
- ] + $bucketObservations),
- artifacts: [
- { name: "nix-closure-path-info", path: "nix-closure-path-info.json", contentType: "application/json" },
- { name: "nix-closure-paths", path: "nix-closure-paths.json", contentType: "application/json" }
- ],
- details: {
- outPath: $outPath,
- topPaths: ($closurePaths | sort_by(.narSize) | reverse | .[:30])
- }
- }
- ' >"$artifact_file"
-
- cat "$artifact_file"
-
- - name: 'Measure Nix closure: megarepo'
- shell: bash
- env:
- ARTIFACT_DIR: tmp/nix-closure-ci/current/megarepo_package
- RUNNER_CLASS: '${{ runner.os }}-${{ runner.arch }}'
- run: |
- set -euo pipefail
-
- mkdir -p "$ARTIFACT_DIR"
- installable='.#megarepo'
- target_id='megarepo_package'
- target_name='megarepo'
- target_label='Megarepo package'
- target_group='packages'
- target_description='the packaged megarepo CLI closure'
- artifact_file="$ARTIFACT_DIR/measurements.json"
- target_system='x86_64-linux'
-
- out_path="$(nix build --no-update-lock-file --no-link --print-out-paths "$installable")"
- path_info="$ARTIFACT_DIR/nix-closure-path-info.json"
- paths_file="$ARTIFACT_DIR/nix-closure-paths.json"
-
- nix path-info --recursive --json "$out_path" >"$path_info"
- jq 'to_entries | map({ path: .key, narSize: (.value.narSize // 0) })' "$path_info" >"$paths_file"
-
- jq -n \
- --slurpfile paths "$paths_file" \
- --argjson schemaVersion 1 \
- --arg generatedAt "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
- --arg repository "${GITHUB_REPOSITORY:-unknown}" \
- --arg branchKind "${GITHUB_EVENT_NAME:-unknown}" \
- --arg ref "${CI_MEASUREMENT_SUBJECT_REF:-${GITHUB_REF:-unknown}}" \
- --arg headSha "${CI_MEASUREMENT_SUBJECT_SHA:-${GITHUB_SHA:-unknown}}" \
- --arg baseSha "${GITHUB_BASE_SHA:-}" \
- --arg runnerName "${RUNNER_NAME:-unknown}" \
- --arg runnerOs "${RUNNER_OS:-unknown}" \
- --arg runnerArch "${RUNNER_ARCH:-unknown}" \
- --arg runnerClass "${RUNNER_CLASS:-unknown}" \
- --arg githubRunId "${GITHUB_RUN_ID:-unknown}" \
- --arg githubRunAttempt "${GITHUB_RUN_ATTEMPT:-unknown}" \
- --arg githubJob "${GITHUB_JOB:-unknown}" \
- --arg taskId "${CROSSTASK_TASK_ID:-}" \
- --arg taskAttemptId "${CROSSTASK_ATTEMPT_ID:-}" \
- --arg traceId "${TRACE_ID:-}" \
- --arg targetName "$target_name" \
- --arg targetId "$target_id" \
- --arg targetLabel "$target_label" \
- --arg targetGroup "$target_group" \
- --arg targetDescription "$target_description" \
- --arg targetSystem "$target_system" \
- --arg outPath "$out_path" \
- --argjson buckets '[{"name":"node","label":"Node / pnpm","pathRegex":"node_modules|npm-deps|pnpm"},{"name":"nix-sources","label":"Nix sources","pathRegex":"-source$"},{"name":"rust","label":"Rust","pathRegex":"cargo|rust|rustc"}]' \
- --argjson targetPath '["nix","closures","packages","megarepo"]' \
- --argjson gatePolicy '{}' \
- '
- ($paths[0] // []) as $closurePaths
- | ($closurePaths | map(.narSize) | add // 0) as $totalNarSize
- | ($closurePaths | length) as $pathCount
- | ($buckets | map(
- . as $bucket
- | {
- name: "nix.closure.bucket.nar_size",
- id: "nix.closure.bucket.nar_size",
- label: (($bucket.label // $bucket.name) + " closure size"),
- group: "nix closure buckets",
- path: ($targetPath + ["buckets", $bucket.name]),
- description: ("NAR size contributed by closure paths matching " + $bucket.pathRegex),
- measurementKind: "deterministic",
- unit: "bytes",
- value: (
- $closurePaths
- | map(select(.path | test($bucket.pathRegex)) | .narSize)
- | add // 0
- ),
- policy: $gatePolicy,
- dimensions: { bucket: $bucket.name }
- }
- )) as $bucketObservations
- | {
- schemaVersion: $schemaVersion,
- generatedAt: $generatedAt,
- producer: { name: "effect-utils-ci-measurement", version: 1 },
- subject: {
- repo: $repository,
- branchKind: (if $branchKind == "" then "unknown" else $branchKind end),
- ref: $ref,
- headSha: $headSha,
- baseSha: $baseSha
- },
- execution: {
- provider: (if ($githubRunId != "" and $githubRunId != "unknown") then "github-actions" else "local" end),
- workflow: "CI",
- job: $githubJob,
- runId: $githubRunId,
- runAttempt: $githubRunAttempt,
- taskId: $taskId,
- attemptId: $taskAttemptId,
- traceId: $traceId,
- runner: { name: $runnerName, os: $runnerOs, arch: $runnerArch, class: $runnerClass }
- },
- target: { kind: "nix-closure", id: $targetId, name: $targetName, label: $targetLabel, group: $targetGroup, path: $targetPath, system: $targetSystem },
- observations: ([
- {
- id: "nix.closure.nar_size",
- label: "Total closure size",
- group: "nix closure",
- path: ($targetPath + ["total", "nar-size"]),
- description: ("Total NAR size for all paths in " + $targetDescription),
- name: "nix.closure.nar_size",
- measurementKind: "deterministic",
- unit: "bytes",
- value: $totalNarSize,
- policy: $gatePolicy,
- dimensions: { bucket: "total" }
- },
- {
- id: "nix.closure.path_count",
- label: "Total closure path count",
- group: "nix closure",
- path: ($targetPath + ["total", "path-count"]),
- description: ("Number of store paths in " + $targetDescription),
- name: "nix.closure.path_count",
- measurementKind: "deterministic",
- unit: "count",
- value: $pathCount,
- policy: $gatePolicy,
- dimensions: { bucket: "total" }
- }
- ] + $bucketObservations),
- artifacts: [
- { name: "nix-closure-path-info", path: "nix-closure-path-info.json", contentType: "application/json" },
- { name: "nix-closure-paths", path: "nix-closure-paths.json", contentType: "application/json" }
- ],
- details: {
- outPath: $outPath,
- topPaths: ($closurePaths | sort_by(.narSize) | reverse | .[:30])
- }
- }
- ' >"$artifact_file"
-
- cat "$artifact_file"
-
- - name: 'Measure Nix closure: oxlint-npm'
- shell: bash
- env:
- ARTIFACT_DIR: tmp/nix-closure-ci/current/oxlint_npm_package
- RUNNER_CLASS: '${{ runner.os }}-${{ runner.arch }}'
- run: |
- set -euo pipefail
-
- mkdir -p "$ARTIFACT_DIR"
- installable='.#oxlint-npm'
- target_id='oxlint_npm_package'
- target_name='oxlint-npm'
- target_label='oxlint npm package'
- target_group='packages'
- target_description='the packaged oxlint npm compatibility wrapper closure'
- artifact_file="$ARTIFACT_DIR/measurements.json"
- target_system='x86_64-linux'
-
- out_path="$(nix build --no-update-lock-file --no-link --print-out-paths "$installable")"
- path_info="$ARTIFACT_DIR/nix-closure-path-info.json"
- paths_file="$ARTIFACT_DIR/nix-closure-paths.json"
-
- nix path-info --recursive --json "$out_path" >"$path_info"
- jq 'to_entries | map({ path: .key, narSize: (.value.narSize // 0) })' "$path_info" >"$paths_file"
-
- jq -n \
- --slurpfile paths "$paths_file" \
- --argjson schemaVersion 1 \
- --arg generatedAt "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
- --arg repository "${GITHUB_REPOSITORY:-unknown}" \
- --arg branchKind "${GITHUB_EVENT_NAME:-unknown}" \
- --arg ref "${CI_MEASUREMENT_SUBJECT_REF:-${GITHUB_REF:-unknown}}" \
- --arg headSha "${CI_MEASUREMENT_SUBJECT_SHA:-${GITHUB_SHA:-unknown}}" \
- --arg baseSha "${GITHUB_BASE_SHA:-}" \
- --arg runnerName "${RUNNER_NAME:-unknown}" \
- --arg runnerOs "${RUNNER_OS:-unknown}" \
- --arg runnerArch "${RUNNER_ARCH:-unknown}" \
- --arg runnerClass "${RUNNER_CLASS:-unknown}" \
- --arg githubRunId "${GITHUB_RUN_ID:-unknown}" \
- --arg githubRunAttempt "${GITHUB_RUN_ATTEMPT:-unknown}" \
- --arg githubJob "${GITHUB_JOB:-unknown}" \
- --arg taskId "${CROSSTASK_TASK_ID:-}" \
- --arg taskAttemptId "${CROSSTASK_ATTEMPT_ID:-}" \
- --arg traceId "${TRACE_ID:-}" \
- --arg targetName "$target_name" \
- --arg targetId "$target_id" \
- --arg targetLabel "$target_label" \
- --arg targetGroup "$target_group" \
- --arg targetDescription "$target_description" \
- --arg targetSystem "$target_system" \
- --arg outPath "$out_path" \
- --argjson buckets '[{"name":"node","label":"Node / pnpm","pathRegex":"node_modules|npm-deps|pnpm"},{"name":"nix-sources","label":"Nix sources","pathRegex":"-source$"},{"name":"rust","label":"Rust","pathRegex":"cargo|rust|rustc"}]' \
- --argjson targetPath '["nix","closures","packages","oxlint-npm"]' \
- --argjson gatePolicy '{}' \
- '
- ($paths[0] // []) as $closurePaths
- | ($closurePaths | map(.narSize) | add // 0) as $totalNarSize
- | ($closurePaths | length) as $pathCount
- | ($buckets | map(
- . as $bucket
- | {
- name: "nix.closure.bucket.nar_size",
- id: "nix.closure.bucket.nar_size",
- label: (($bucket.label // $bucket.name) + " closure size"),
- group: "nix closure buckets",
- path: ($targetPath + ["buckets", $bucket.name]),
- description: ("NAR size contributed by closure paths matching " + $bucket.pathRegex),
- measurementKind: "deterministic",
- unit: "bytes",
- value: (
- $closurePaths
- | map(select(.path | test($bucket.pathRegex)) | .narSize)
- | add // 0
- ),
- policy: $gatePolicy,
- dimensions: { bucket: $bucket.name }
- }
- )) as $bucketObservations
- | {
- schemaVersion: $schemaVersion,
- generatedAt: $generatedAt,
- producer: { name: "effect-utils-ci-measurement", version: 1 },
- subject: {
- repo: $repository,
- branchKind: (if $branchKind == "" then "unknown" else $branchKind end),
- ref: $ref,
- headSha: $headSha,
- baseSha: $baseSha
- },
- execution: {
- provider: (if ($githubRunId != "" and $githubRunId != "unknown") then "github-actions" else "local" end),
- workflow: "CI",
- job: $githubJob,
- runId: $githubRunId,
- runAttempt: $githubRunAttempt,
- taskId: $taskId,
- attemptId: $taskAttemptId,
- traceId: $traceId,
- runner: { name: $runnerName, os: $runnerOs, arch: $runnerArch, class: $runnerClass }
- },
- target: { kind: "nix-closure", id: $targetId, name: $targetName, label: $targetLabel, group: $targetGroup, path: $targetPath, system: $targetSystem },
- observations: ([
- {
- id: "nix.closure.nar_size",
- label: "Total closure size",
- group: "nix closure",
- path: ($targetPath + ["total", "nar-size"]),
- description: ("Total NAR size for all paths in " + $targetDescription),
- name: "nix.closure.nar_size",
- measurementKind: "deterministic",
- unit: "bytes",
- value: $totalNarSize,
- policy: $gatePolicy,
- dimensions: { bucket: "total" }
- },
- {
- id: "nix.closure.path_count",
- label: "Total closure path count",
- group: "nix closure",
- path: ($targetPath + ["total", "path-count"]),
- description: ("Number of store paths in " + $targetDescription),
- name: "nix.closure.path_count",
- measurementKind: "deterministic",
- unit: "count",
- value: $pathCount,
- policy: $gatePolicy,
- dimensions: { bucket: "total" }
- }
- ] + $bucketObservations),
- artifacts: [
- { name: "nix-closure-path-info", path: "nix-closure-path-info.json", contentType: "application/json" },
- { name: "nix-closure-paths", path: "nix-closure-paths.json", contentType: "application/json" }
- ],
- details: {
- outPath: $outPath,
- topPaths: ($closurePaths | sort_by(.narSize) | reverse | .[:30])
- }
- }
- ' >"$artifact_file"
-
- cat "$artifact_file"
-
- - name: Compare CI measurements with baseline
- shell: bash
- env:
- CI_MEASUREMENT_CURRENT_DIR: tmp/nix-closure-ci/current
- CI_MEASUREMENT_BASELINE_DIR: tmp/nix-closure-ci/baseline
- CI_MEASUREMENT_COMPARISON_FILE: tmp/nix-closure-ci/measurement-comparison.json
- CI_MEASUREMENT_REGRESSION_MODE: warn
- CI_MEASUREMENT_PR_COMMENT_ENABLED: 'true'
- CI_MEASUREMENT_PR_COMMENT_TITLE: Nix Closure Measurements
- CI_MEASUREMENT_PR_COMMENT_MAX_ROWS: '8'
- CI_MEASUREMENT_PR_COMMENT_MAX_HISTORY: '20'
- CI_MEASUREMENT_PR_COMMENT_ASSET_BRANCH: ci-measurement-assets
- GH_TOKEN: ${{ github.token }}
- run: |
- set -euo pipefail
-
- export PATH="/run/current-system/sw/bin:/usr/bin:/bin:$PATH"
-
- current_dir="${CI_MEASUREMENT_CURRENT_DIR:?CI_MEASUREMENT_CURRENT_DIR not set}"
- baseline_dir="${CI_MEASUREMENT_BASELINE_DIR:?CI_MEASUREMENT_BASELINE_DIR not set}"
- comparison_file="${CI_MEASUREMENT_COMPARISON_FILE:?CI_MEASUREMENT_COMPARISON_FILE not set}"
- mode="${CI_MEASUREMENT_REGRESSION_MODE:-warn}"
- mkdir -p "$(dirname "$comparison_file")"
-
- if [ "$mode" = "off" ]; then
- jq -n --argjson schemaVersion 1 --arg status skipped --arg mode "$mode" \
- '{schemaVersion:$schemaVersion,status:$status,mode:$mode,comparisons:{}}' \
- >"$comparison_file"
- exit 0
- fi
-
- current_index="$(mktemp)"
- baseline_index="$(mktemp)"
- find "$current_dir" -name baseline -type d -prune -o -name measurements.json -type f -print | sort >"$current_index" || true
- {
- find "$baseline_dir" -name baseline -type d ! -path "$baseline_dir" -prune -o -name measurements.json -type f -print
- } | sort -u >"$baseline_index" || true
-
- if [ ! -s "$current_index" ]; then
- echo "::error::no current measurements.json files found under $current_dir"
- exit 1
- fi
-
- current_json="$comparison_file.current.json"
- baseline_json="$comparison_file.baseline.json"
- xargs -r jq -s '.' <"$current_index" >"$current_json"
- if [ -s "$baseline_index" ]; then
- xargs -r jq -s '.' <"$baseline_index" >"$baseline_json"
- else
- printf '[]\n' >"$baseline_json"
- fi
-
- jq -n \
- --slurpfile current "$current_json" \
- --slurpfile baseline "$baseline_json" \
- --argjson schemaVersion 1 \
- --arg mode "$mode" \
- --arg currentDir "$current_dir" \
- --arg baselineDir "$baseline_dir" \
- '
- def identity_dimensions:
- (.dimensions // {})
- | to_entries
- | map(select(.key as $key | ["devenvRev", "otelServiceName", "status", "probeLabel", "sampleCount", "measuredSampleCount"] | index($key) | not))
- | sort_by(.key)
- | map("\(.key)=\(.value|tostring)")
- | join(",");
-
- def observation_key($doc):
- [
- ($doc.target.kind // "unknown"),
- ($doc.target.id // $doc.target.name // "unknown"),
- ($doc.target.system // "unknown"),
- (.id // .name // "unknown"),
- (.unit // "unknown"),
- identity_dimensions
- ] | join("|");
-
- def median:
- sort as $sorted
- | ($sorted | length) as $count
- | if $count == 0 then null
- elif ($count % 2) == 1 then $sorted[($count / 2 | floor)]
- else (($sorted[($count / 2 - 1)] + $sorted[($count / 2)]) / 2)
- end;
-
- def percentile($p):
- sort as $sorted
- | ($sorted | length) as $count
- | if $count == 0 then null
- else $sorted[(($p * ($count - 1)) | floor)]
- end;
-
- def abs_value: if . < 0 then -. else . end;
-
- def observations_by_key($docs):
- reduce $docs[]? as $doc
- ({};
- reduce (($doc.observations // [])[]? | select(.value | type == "number")) as $obs
- (.;
- ($obs | observation_key($doc)) as $key
- | .[$key] = ((.[$key] // []) + [{
- target: $doc.target,
- observation: $obs,
- generatedAt: $doc.generatedAt
- }])
- )
- );
-
- def observation_stats($items):
- ($items | map(.observation.value)) as $values
- | ($items | map(.observation.comparison.baseline // empty)) as $pairedBaselineValues
- | ($items | map(.observation.statistics.pairedDeltaMedian // empty)) as $pairedDeltaMedianValues
- | ($items | map(.observation.statistics.pairedDeltaP25 // empty)) as $pairedDeltaP25Values
- | ($items | map(.observation.statistics.pairedDeltaP75 // empty)) as $pairedDeltaP75Values
- | ($items | map(.observation.statistics.pairedDeltaMad // empty)) as $pairedDeltaMadValues
- | ($items | map(.observation.statistics.pairedDeltaSamples // []) | add // []) as $pairedDeltaSampleValues
- | ($items | map(.observation.statistics.measuredSampleCount // .observation.statistics.sampleCount // 1) | add // ($items | length)) as $sampleCount
- | ($values | median) as $median
- | {
- target: ($items[0].target // {}),
- observation: ($items[-1].observation // {}),
- measurementKind: ($items[-1].observation.measurementKind // null),
- value: $median,
- min: ($values | min),
- max: ($values | max),
- p25: ($values | percentile(0.25)),
- p75: ($values | percentile(0.75)),
- p95: ($values | percentile(0.95)),
- mad: ($values | map(. - $median | if . < 0 then -. else . end) | median),
- sourceCount: ($items | length),
- sampleCount: $sampleCount,
- pairedSampleCount: ($items | map(.observation.statistics.pairedSampleCount // .observation.comparison.pairedSampleCount // 0) | add // 0),
- pairedBaselineValue: (if ($pairedBaselineValues | length) == 0 then null else ($pairedBaselineValues | median) end),
- pairedDeltaMedianValue: (if ($pairedDeltaMedianValues | length) == 0 then null else ($pairedDeltaMedianValues | median) end),
- pairedDeltaP25Value: (if ($pairedDeltaP25Values | length) == 0 then null else ($pairedDeltaP25Values | median) end),
- pairedDeltaP75Value: (if ($pairedDeltaP75Values | length) == 0 then null else ($pairedDeltaP75Values | median) end),
- pairedDeltaMadValue: (if ($pairedDeltaMadValues | length) == 0 then null else ($pairedDeltaMadValues | median) end),
- pairedDeltaSampleValues: $pairedDeltaSampleValues,
- generatedAt: ($items[-1].generatedAt // null)
- };
-
- def budget($metric; $unit):
- if $metric == "nix.closure.nar_size" then
- {warnRatio:1.05, failRatio:1.10, warnAbs:52428800, failAbs:209715200, statisticalToleranceRatio:0.02, statisticalToleranceAbs:10485760}
- elif $metric == "nix.closure.bucket.nar_size" then
- {warnRatio:1.10, failRatio:1.20, warnAbs:52428800, failAbs:209715200, statisticalToleranceRatio:0.05, statisticalToleranceAbs:10485760}
- elif $metric == "nix.closure.path_count" then
- {warnRatio:1.05, failRatio:1.10, warnAbs:100, failAbs:500, statisticalToleranceRatio:0.02, statisticalToleranceAbs:10}
- elif $unit == "seconds" then
- {warnRatio:1.10, failRatio:1.20, warnAbs:0.25, failAbs:1, statisticalToleranceRatio:0.10, statisticalToleranceAbs:0.25}
- else
- {warnRatio:1.25, failRatio:1.50, warnAbs:1, failAbs:3, statisticalToleranceRatio:0.10, statisticalToleranceAbs:1}
- end;
-
- def noise_floor($metric; $unit):
- if $metric == "nix.closure.nar_size" or $metric == "nix.closure.bucket.nar_size" then 10485760
- elif $metric == "nix.closure.path_count" then 10
- elif $unit == "seconds" then 0.1
- else 0
- end;
- def default_policy($metric; $unit):
- budget($metric; $unit) as $b
- | noise_floor($metric; $unit) as $noise
- | $b + {
- enabled:true,
- comparisonMode:(if $metric == "nix.closure.nar_size" or $metric == "nix.closure.bucket.nar_size" or $metric == "nix.closure.path_count" or $unit != "seconds" then "budget" else "historical" end),
- minBaselineSources:(if $metric == "nix.closure.nar_size" or $metric == "nix.closure.bucket.nar_size" or $metric == "nix.closure.path_count" or $unit != "seconds" then 1 else 10 end),
- minCurrentSamples:(if $unit == "seconds" then 3 else 1 end),
- minPairedSamples:(if $unit == "seconds" then 5 else 0 end),
- noiseFloor:$noise
- };
- def observation_policy($obs):
- default_policy($obs.name // "unknown"; $obs.unit // "unknown") + ($obs.policy // {});
- def policy_enabled($policy):
- if ($policy | has("enabled")) then $policy.enabled else true end;
+          def noise_floor($metric; $unit):  # noise-floor constant for a metric name / unit pair
+            if $metric == "nix.closure.nar_size" or $metric == "nix.closure.bucket.nar_size" then 10485760  # 10 MiB (10 * 1024 * 1024 bytes)
+            elif $metric == "nix.closure.path_count" then 10  # path-count metric
+            elif $unit == "seconds" then 0.1  # any other metric reported in seconds
+            else 0  # everything else gets no noise floor
+            end;
+          def default_policy($metric; $unit):  # default gate policy object for a metric name / unit pair
+            budget($metric; $unit) as $b
+            | noise_floor($metric; $unit) as $noise
+            | $b + {  # object addition: the keys below are layered on top of the budget object
+              enabled:true,
+              comparisonMode:(if $metric == "nix.closure.nar_size" or $metric == "nix.closure.bucket.nar_size" or $metric == "nix.closure.path_count" or $unit != "seconds" then "budget" else "historical" end),  # "historical" only for non-closure metrics measured in seconds
+              minBaselineSources:(if $metric == "nix.closure.nar_size" or $metric == "nix.closure.bucket.nar_size" or $metric == "nix.closure.path_count" or $unit != "seconds" then 1 else 10 end),  # same condition as comparisonMode: 1 for budget metrics, 10 otherwise
+              minCurrentSamples:(if $unit == "seconds" then 3 else 1 end),
+              minPairedSamples:(if $unit == "seconds" then 5 else 0 end),
+              noiseFloor:$noise
+            };
+          def observation_policy($obs):  # effective policy for one observation
+            default_policy($obs.name // "unknown"; $obs.unit // "unknown") + ($obs.policy // {});  # keys in the observation policy object override the defaults
+          def policy_enabled($policy):  # value of the "enabled" key when present, otherwise true
+            if ($policy | has("enabled")) then $policy.enabled else true end;  # has() also matches an explicit null or false, which is then returned as-is
def classify($metric; $unit; $measurementKind; $policy; $current; $currentP25; $currentP75; $currentMad; $baseline; $baselineMin; $baselineMax; $baselineP25; $baselineP75; $baselineP95; $baselineMad; $currentSamples; $baselineSources; $pairedSamples; $pairedDeltaMedian; $pairedDeltaP25; $pairedDeltaP75; $pairedDeltaMad; $pairedDeltaValues):
$policy as $b
@@ -6084,1222 +5024,1543 @@ jobs:
mv "$comparison_with_provenance" "$comparison_file"
fi
- status="$(jq -r '.status' "$comparison_file")"
- exit_code=0
- case "$status:$mode" in
- fail:fail)
- echo "::error::CI measurement regression detected"
- exit_code=1
- ;;
- fail:*|warn:*)
- echo "::warning::CI measurement regression threshold exceeded"
- ;;
- partial:*)
- echo "::notice::CI measurement comparison is partial because one or more enabled observations are not gateable"
- ;;
- esac
+          status="$(jq -r '.status' "$comparison_file")"  # overall status recorded in the comparison file
+          exit_code=0  # stays 0 unless the status is fail while the mode is fail
+          case "$status:$mode" in  # first matching pattern wins, so fail:fail is tested before fail:*
+            fail:fail)
+              echo "::error::CI measurement regression detected"
+              exit_code=1  # recorded here; the script exits with it later, after the summary is written
+              ;;
+            fail:*|warn:*)  # a fail status in any non-fail mode, or any warn status: annotate only
+              echo "::warning::CI measurement regression threshold exceeded"
+              ;;
+            partial:*)
+              echo "::notice::CI measurement comparison is partial because one or more enabled observations are not gateable"
+              ;;
+          esac  # any other status prints no annotation
+
+          if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then  # write the summary only when a step-summary file is configured
+            {
+              echo "### ${CI_MEASUREMENT_PR_COMMENT_TITLE:-CI Measurements}"
+              echo ""
+              jq -r '"- Status: " + .status + "\n- Gate: " + (if .mode == "fail" then "enforced" elif .mode == "warn" then "advisory" elif .mode == "off" then "off" else (.mode // "unknown") end) + "\n- Baseline: " + .baselineDir' "$comparison_file"
+              echo ""
+              echo "| Status | Gate | Target | Observation | Current | Baseline | Delta | Ratio |"
+              echo "| --- | --- | --- | --- | ---: | ---: | ---: | ---: |"
+              jq -r '
+                .comparisons
+                | to_entries
+                | sort_by(  # row order: fail, warn, missing_baseline, then every other status
+                    if .value.status == "fail" then 0
+                    elif .value.status == "warn" then 1
+                    elif .value.status == "missing_baseline" then 2
+                    else 3
+                    end
+                  )
+                | .[:20]  # keep at most 20 rows
+                | .[]
+                | .value as $v
+                | [  # one string per table column, in header order
+                    $v.status,
+                    (if ($v.gateable // false) then "yes" else ($v.gateReason // "no") end),
+                    (($v.target.kind // "unknown") + "/" + ($v.target.name // "unknown") + "/" + ($v.target.system // "unknown")),
+                    ($v.observation.name // "unknown"),
+                    (($v.current // $v.observation.value // 0) | tostring),
+                    (($v.baseline // "") | tostring),
+                    (($v.delta // "") | tostring),
+                    (if $v.ratio == null or $v.ratio == "" then "" else (($v.ratio * 100 | round / 100) | tostring) end)
+                  ]
+                | "| " + (map(gsub("\\|"; "\\|")) | join(" | ")) + " |"  # escape each cell pipe with ONE backslash; two backslashes read as an escaped backslash plus a live column delimiter
+              ' "$comparison_file"
+            } >>"$GITHUB_STEP_SUMMARY"
+          fi
+
+
+
+          if [ "$exit_code" -ne 0 ]; then  # exit_code was set by the status case statement earlier in this script
+            exit "$exit_code"  # fail the step only now, after the summary has been written
+          fi
+
+      - name: 'Upload CI measurements: nix-closure-measurements'
+        if: always()  # not gated on the outcome of earlier steps
+        uses: actions/upload-artifact@v4
+        with:  # uploads tmp/nix-closure-ci except everything under its baseline/ directory
+          name: nix-closure-measurements
+          path: |
+            tmp/nix-closure-ci
+            !tmp/nix-closure-ci/baseline/**
+          if-no-files-found: error  # an empty upload fails the step
+          retention-days: 30
+      - name: Save pnpm state  # caches the pnpm home and the per-job pnpm store
+        if: ${{ success() && steps.restore-pnpm-state.outputs.cache-hit != 'true' }}  # only after success, and only when the restore-pnpm-state step did not report cache-hit == 'true'
+        uses: actions/cache/save@v4
+        with:
+          path: |
+            ${{ github.workspace }}/.pnpm-home
+            ${{ runner.temp }}/pnpm-store/${{ github.job }}
+          key: "pnpm-state-v1-${{ runner.os }}-${{ runner.arch }}-${{ hashFiles('**/pnpm-lock.yaml') }}"
+      - name: Nix diagnostics summary  # appends a Nix store diagnostics section to the step summary
+        if: failure()  # runs only after an earlier step has failed
+        shell: bash
+        run: |
+          diag_dir="${NIX_STORE_DIAGNOSTICS_DIR:-}"  # empty when the variable is unset
+          if [ -z "$diag_dir" ] || [ ! -d "$diag_dir" ]; then  # nothing captured: write a short note and stop
+            echo "## Nix Store Diagnostics" >> "$GITHUB_STEP_SUMMARY"
+            echo "" >> "$GITHUB_STEP_SUMMARY"
+            echo "No diagnostics directory found (validation may have failed before capture)." >> "$GITHUB_STEP_SUMMARY"
+            exit 0
+          fi
+
+          {
+            echo "## Nix Store Diagnostics"
+            echo ""
+            echo "Temporary instrumentation for #272; remove after root cause is confirmed and CI is stable."
+            echo ""
+            echo "- Diagnostics directory: \`$diag_dir\`"
+            echo "- Tracking issue: https://github.com/overengineeringstudio/effect-utils/issues/272"
+          } >> "$GITHUB_STEP_SUMMARY"
+
+          markers_file="${RUNNER_TEMP:-/tmp}/nix-store-signature-markers.txt"  # falls back to /tmp when RUNNER_TEMP is unset
+          grep -R -n -E "config\\.cachix|cachix\\.package|error: path '/nix/store/.+ is not valid" --exclude="$(basename "$markers_file")" "$diag_dir" > "$markers_file" || true  # a grep run with no match exits non-zero; || true keeps the step going
+
+          if [ -s "$markers_file" ]; then  # at least one marker line was found
+            {
+              echo ""
+              echo "### Signature markers"
+              echo '```text'
+              head -n 120 "$markers_file"  # cap the excerpt at 120 lines
+              echo '```'
+            } >> "$GITHUB_STEP_SUMMARY"
+          else
+            echo "" >> "$GITHUB_STEP_SUMMARY"
+            echo "- No signature markers found in captured diagnostics." >> "$GITHUB_STEP_SUMMARY"
+          fi
+      - name: Upload Nix diagnostics artifact
+        if: failure() && env.NIX_STORE_DIAGNOSTICS_DIR != ''  # only after a failure and only when a diagnostics directory is configured
+        uses: actions/upload-artifact@v4
+        with:
+          name: 'nix-store-diagnostics-${{ github.job }}-${{ runner.os }}-run-${{ github.run_id }}-attempt-${{ github.run_attempt }}'
+          path: ${{ env.NIX_STORE_DIAGNOSTICS_DIR }}
+          if-no-files-found: ignore  # an empty directory is not an error here
+          retention-days: 14
+      - name: Failure note  # prints a pointer to the tracking issue into the job log
+        if: failure()  # runs only after an earlier step has failed
+        shell: bash
+        run: |
+          echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:"
+          echo "  https://github.com/overengineeringstudio/effect-utils/issues/201"
+    concurrency:  # group key = workflow, event name, ref, one discriminator (baseline ref, manual run id, label name or 'code'), then a fixed suffix
+      group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-nix-closure-sizes"
+      cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}  # false for baseline-ref dispatch runs and for pull_request labeled/unlabeled events
+ source-shape:
+ runs-on:
+ [namespace-profile-linux-x86-64, 'namespace-features:github.run-id=${{ github.run_id }}']
+ timeout-minutes: 30
+ defaults:
+ run:
+ shell: bash
+ permissions:
+ actions: read
+ contents: write
+ issues: write
+ pull-requests: write
+ env:
+ CI_MEASUREMENT_SUBJECT_REF: ${{ inputs.measurement_baseline_ref || github.event.pull_request.head.ref || github.ref }}
+ CI_MEASUREMENT_SUBJECT_SHA: ${{ inputs.measurement_baseline_ref || github.event.pull_request.head.sha || github.sha }}
+ CI_MEASUREMENT_SUBJECT_LABEL: ${{ inputs.measurement_baseline_label }}
+ CI_MEASUREMENT_ALLOW_PROBE_FAILURES: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && '1' || '' }}
+ steps:
+ - uses: actions/checkout@v6
+ - name: Checkout CI measurement baseline ref
+ if: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' }}
+ uses: actions/checkout@v6
+ with:
+ ref: ${{ inputs.measurement_baseline_ref }}
+ - name: 'Download previous artifact: source-shape'
+ shell: bash
+ env:
+ GH_TOKEN: ${{ github.token }}
+ BASELINE_ARTIFACT_NAME: source-shape
+ BASELINE_OUTPUT_DIR: tmp/source-shape-ci/baseline
+ BASELINE_WORKFLOW_NAME: ${{ github.workflow }}
+ BASELINE_BRANCH: ${{ github.base_ref || github.ref_name }}
+ BASELINE_SEED_RUNS_JSON: '[{"runId":"26085158592","label":"main baseline","sha":"ce7cf8f8ebfaa1da6c7e9122cd195a5f95ce2fca","source":"manual-backfill","artifacts":["source-shape"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."}]'
+ BASELINE_MAX_RUNS: '20'
+ BASELINE_MAX_CANDIDATE_RUNS: '60'
+ BASELINE_REQUIRED_OBSERVATIONS_JSON: '[]'
+ run: |
+ set -euo pipefail
+
+ mkdir -p "$BASELINE_OUTPUT_DIR"
+
+ if command -v gh >/dev/null 2>&1; then
+ GH_BIN="$(command -v gh)"
+ else
+ echo "::notice::gh is not on PATH; resolving GitHub CLI through Nix"
+ if ! GH_BIN="$(nix build --no-link --print-out-paths nixpkgs#gh 2>/dev/null)/bin/gh"; then
+ echo "::notice::unable to resolve GitHub CLI through Nix; skipping previous artifact download"
+ exit 0
+ fi
+ fi
+ echo "Using GitHub CLI: $GH_BIN"
+
+ repo="${GITHUB_REPOSITORY:?GITHUB_REPOSITORY not set}"
+ workflow="${BASELINE_WORKFLOW_NAME:-CI}"
+ branch="${BASELINE_BRANCH:-${GITHUB_BASE_REF:-${GITHUB_REF_NAME:-main}}}"
+ seed_runs_file="$BASELINE_OUTPUT_DIR/baseline-seed-runs.json"
+ required_observations_file="$BASELINE_OUTPUT_DIR/baseline-required-observations.json"
+ printf '%s' "${BASELINE_SEED_RUNS_JSON:-[]}" >"$seed_runs_file"
+ printf '%s' "${BASELINE_REQUIRED_OBSERVATIONS_JSON:-[]}" >"$required_observations_file"
+ if ! jq -e 'if type == "array" then all(.[]; type == "object" and (.runId | type == "string")) else false end' \
+ "$seed_runs_file" >/dev/null; then
+ echo "::error::BASELINE_SEED_RUNS_JSON must be an array of objects with string runId fields"
+ exit 1
+ fi
+ if ! jq -e 'if type == "array" then all(.[]; type == "object" and (.id | type == "string") and (.minSources | type == "number")) else false end' \
+ "$required_observations_file" >/dev/null; then
+ echo "::error::BASELINE_REQUIRED_OBSERVATIONS_JSON must be an array of objects with string id and numeric minSources fields"
+ exit 1
+ fi
+ seed_run_ids="$(jq -r '.[].runId' "$seed_runs_file")"
+ required_observation_count="$(jq 'length' "$required_observations_file")"
+ max_candidate_runs="${BASELINE_MAX_CANDIDATE_RUNS:-${BASELINE_MAX_RUNS:-5}}"
+ if ! [[ "$max_candidate_runs" =~ ^[0-9]+$ ]] || [ "$max_candidate_runs" -lt 1 ]; then
+ max_candidate_runs=1
+ fi
+
+ candidate_runs="$(
+ "$GH_BIN" run list \
+ --repo "$repo" \
+ --workflow "$workflow" \
+ --branch "$branch" \
+ --event push \
+ --status success \
+ --json databaseId,headSha \
+ --limit "$max_candidate_runs" \
+ --jq '[.[] | select(.headSha != env.GITHUB_SHA) | .databaseId] | .[]'
+ )"
- if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then
- {
- echo "### ${CI_MEASUREMENT_PR_COMMENT_TITLE:-CI Measurements}"
- echo ""
- jq -r '"- Status: " + .status + "\n- Gate: " + (if .mode == "fail" then "enforced" elif .mode == "warn" then "advisory" elif .mode == "off" then "off" else (.mode // "unknown") end) + "\n- Baseline: " + .baselineDir' "$comparison_file"
- echo ""
- echo "| Status | Gate | Target | Observation | Current | Baseline | Delta | Ratio |"
- echo "| --- | --- | --- | --- | ---: | ---: | ---: | ---: |"
- jq -r '
- .comparisons
- | to_entries
- | sort_by(
- if .value.status == "fail" then 0
- elif .value.status == "warn" then 1
- elif .value.status == "missing_baseline" then 2
- else 3
- end
- )
- | .[:20]
- | .[]
- | .value as $v
- | [
- $v.status,
- (if ($v.gateable // false) then "yes" else ($v.gateReason // "no") end),
- (($v.target.kind // "unknown") + "/" + ($v.target.name // "unknown") + "/" + ($v.target.system // "unknown")),
- ($v.observation.name // "unknown"),
- (($v.current // $v.observation.value // 0) | tostring),
- (($v.baseline // "") | tostring),
- (($v.delta // "") | tostring),
- (if $v.ratio == null or $v.ratio == "" then "" else (($v.ratio * 100 | round / 100) | tostring) end)
- ]
- | "| " + (map(gsub("\\|"; "\\\\|")) | join(" | ")) + " |"
- ' "$comparison_file"
- } >>"$GITHUB_STEP_SUMMARY"
+ candidate_runs="$seed_run_ids
+ $candidate_runs"
+
+ max_runs="${BASELINE_MAX_RUNS:-5}"
+ if ! [[ "$max_runs" =~ ^[0-9]+$ ]] || [ "$max_runs" -lt 1 ]; then
+ max_runs=1
fi
- if [ "${CI_MEASUREMENT_PR_COMMENT_ENABLED:-false}" = "true" ]; then
- if [ "${GITHUB_EVENT_NAME:-}" != "pull_request" ]; then
- echo "::notice::CI measurement PR comments are produced only by pull_request workflows; skipping comment for event ${GITHUB_EVENT_NAME:-unknown}"
- exit 0
- fi
+ write_baseline_observation_counts() { # Rebuild baseline-observation-counts.json from the measurements.json files under $BASELINE_OUTPUT_DIR.
+ local measurement_index="$BASELINE_OUTPUT_DIR/baseline-measurement-files.txt" # sorted list of measurements.json paths, written by the find below
+ local counts_file="$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" # output: {counts, required}
+ find "$BASELINE_OUTPUT_DIR" \
+ -mindepth 2 \
+ -maxdepth 2 \
+ -name measurements.json \
+ -type f \
+ -print \
+ | sort >"$measurement_index" || true # `|| true`: a failing find/sort pipeline does not abort the script
- can_render_pr_comment=true
+ if [ -s "$measurement_index" ]; then # index is non-empty; note `sources` counts observation entries, so an id repeated inside one file counts more than once
+ xargs -r jq -s \
+ --slurpfile required "$required_observations_file" \
+ '
+ ([.[] | (.observations // [])[]? | select(.value | type == "number") | .id] | sort | group_by(.) | map({id: .[0], sources: length})) as $counts
+ | ($required[0] // []) as $requiredRows
+ | {
+ counts: $counts,
+ required: (
+ $requiredRows
+ | map(. as $requiredRow | ($counts | map(select(.id == $requiredRow.id)) | .[0].sources // 0) as $actual | $requiredRow + {sources:$actual, satisfied:($actual >= $requiredRow.minSources)})
+ )
+ }
+ ' <"$measurement_index" >"$counts_file" # xargs takes the file arguments for jq from the index
+ else # no measurement files: every required row is written with sources:0, satisfied:false
+ jq -n --slurpfile required "$required_observations_file" \
+ '{counts: [], required: (($required[0] // []) | map(. + {sources:0, satisfied:false}))}' >"$counts_file"
+ fi
+ }
- ensure_ci_measurement_tool() {
- tool_name="$1"
- nix_attr="$2"
- if command -v "$tool_name" >/dev/null 2>&1; then
- return 0
- fi
- if ! command -v nix >/dev/null 2>&1; then
- return 1
- fi
- if tool_out="$(nix build --no-link --print-out-paths "nixpkgs#$nix_attr" 2>/dev/null)"; then
- export PATH="$tool_out/bin:$PATH"
- fi
- command -v "$tool_name" >/dev/null 2>&1
- }
+ baseline_requirements_satisfied() { # Succeeds only when every row in .required of the counts file has satisfied == true.
+ if [ "$required_observation_count" -eq 0 ]; then # no required observations configured
+ return 1 # NOTE(review): reports "not satisfied" when nothing is required, so the caller's max_runs break is never taken in that case - confirm this is intended
+ fi
+ write_baseline_observation_counts # refresh the counts file before reading it
+ jq -e '.required | all(.satisfied == true)' "$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" >/dev/null # function status is the jq -e exit status
+ }
- if ! ensure_ci_measurement_tool gh gh; then
- echo "::error::gh is not available; unable to publish required CI measurement PR comment"
- can_render_pr_comment=false
+ run_id=""
+ artifact_name=""
+ artifact_id=""
+ downloaded_runs_file="$BASELINE_OUTPUT_DIR/baseline-runs.jsonl"
+ seen_runs_file="$BASELINE_OUTPUT_DIR/baseline-seen-runs.txt"
+ : >"$downloaded_runs_file"
+ : >"$seen_runs_file"
+ for candidate_run in $candidate_runs; do
+ if [ -z "$candidate_run" ]; then
+ continue
fi
- if ! ensure_ci_measurement_tool node nodejs; then
- echo "::error::node is not available; unable to publish required CI measurement PR comment"
- can_render_pr_comment=false
+ if grep -qxF "$candidate_run" "$seen_runs_file"; then
+ continue
fi
- if ! command -v jq >/dev/null 2>&1; then
- if ensure_ci_measurement_tool jq jq; then
- :
- else
- echo "::error::jq is not available; unable to publish required CI measurement PR comment"
- can_render_pr_comment=false
+ downloaded_count="$(wc -l <"$downloaded_runs_file" | tr -d ' ')"
+ if [ "$downloaded_count" -ge "$max_runs" ]; then
+ if baseline_requirements_satisfied; then
+ break
fi
+ echo "::notice::downloaded $downloaded_count baseline artifact(s), but required observation counts are not satisfied yet; continuing through bounded candidate history"
fi
- if [ -z "${GH_TOKEN:-${GITHUB_TOKEN:-}}" ]; then
- echo "::error::GH_TOKEN/GITHUB_TOKEN is not set; unable to publish required CI measurement PR comment"
- can_render_pr_comment=false
+ if [ "$(wc -l <"$seen_runs_file" | tr -d ' ')" -ge "$max_candidate_runs" ]; then
+ break
fi
+ printf '%s\n' "$candidate_run" >>"$seen_runs_file"
- event_path="${GITHUB_EVENT_PATH:-}"
- pr_number=""
- if [ "$can_render_pr_comment" = "true" ] && [ -n "$event_path" ] && [ -f "$event_path" ]; then
- pr_number="$(jq -r '.pull_request.number // empty' "$event_path")"
- fi
- if [ "$can_render_pr_comment" = "true" ] && [ -z "$pr_number" ]; then
- echo "::error::pull request number is unavailable; unable to publish required CI measurement PR comment"
- can_render_pr_comment=false
+ artifact_json="$(
+ "$GH_BIN" api "repos/$repo/actions/runs/$candidate_run/artifacts" \
+ | jq --arg artifactName "$BASELINE_ARTIFACT_NAME" '.artifacts
+ | map(select(.expired == false))
+ | map(select(.name == $artifactName or (.name | startswith($artifactName + "-"))))
+ | sort_by(.created_at // "")
+ | reverse
+ | .[0] // empty'
+ )"
+
+ if [ -n "$artifact_json" ]; then
+ current_artifact_name="$(printf '%s' "$artifact_json" | jq -r '.name')"
+ current_artifact_id="$(printf '%s' "$artifact_json" | jq -r '.id')"
+ current_output_dir="$BASELINE_OUTPUT_DIR/run-$candidate_run"
+ mkdir -p "$current_output_dir"
+ if "$GH_BIN" run download "$candidate_run" \
+ --repo "$repo" \
+ --name "$current_artifact_name" \
+ --dir "$current_output_dir"; then
+ if [ -z "$run_id" ]; then
+ run_id="$candidate_run"
+ artifact_name="$current_artifact_name"
+ artifact_id="$current_artifact_id"
+ fi
+ jq -cn \
+ --arg runId "$candidate_run" \
+ --arg artifactName "$current_artifact_name" \
+ --arg artifactId "$current_artifact_id" \
+ --arg path "run-$candidate_run" \
+ '{runId:$runId, artifactName:$artifactName, artifactId:$artifactId, path:$path}' \
+ >>"$downloaded_runs_file"
+ else
+ echo "::notice::failed to download baseline artifact $current_artifact_name from run $candidate_run"
+ fi
fi
+ done
+
+ write_baseline_observation_counts
+
+ if [ -z "$run_id" ] || [ -z "$artifact_name" ]; then
+ echo "::notice::no successful baseline run found for $repo workflow=$workflow branch=$branch"
+ exit 0
+ fi
+
+ jq -n \
+ --slurpfile runs "$downloaded_runs_file" \
+ --slurpfile seedRuns "$seed_runs_file" \
+ --slurpfile observationCounts "$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" \
+ --argjson schemaVersion 1 \
+ --arg repository "$repo" \
+ --arg workflow "$workflow" \
+ --arg branch "$branch" \
+ --arg runId "$run_id" \
+ --arg artifactName "$artifact_name" \
+ --arg artifactId "$artifact_id" \
+ '{
+ schemaVersion: $schemaVersion,
+ source: "github-actions-artifact",
+ repository: $repository,
+ workflow: $workflow,
+ branch: $branch,
+ runId: $runId,
+ artifactName: $artifactName,
+ artifactId: $artifactId,
+ seedRuns: ($seedRuns[0] // []),
+ runs: $runs,
+ observationCounts: ($observationCounts[0] // null)
+ }' >"$BASELINE_OUTPUT_DIR/baseline-provenance.json"
+
+ echo "Downloaded $(wc -l <"$downloaded_runs_file" | tr -d ' ') baseline artifact(s), latest $artifact_name from run $run_id into $BASELINE_OUTPUT_DIR"
+
+ if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
+ - name: 'Measure source shape: effect-utils'
+ shell: bash
+ env:
+ ARTIFACT_DIR: tmp/source-shape-ci/current/effect-utils
+ RUNNER_CLASS: '${{ runner.os }}-${{ runner.arch }}'
+ run: |
+ set -euo pipefail
- if [ "$can_render_pr_comment" != "true" ]; then
- exit 1
- fi
+ mkdir -p "$ARTIFACT_DIR"
+ target_id='effect_utils'
+ target_name='effect-utils'
+ target_label='effect-utils repository'
+ target_group='source'
+ artifact_file="$ARTIFACT_DIR/measurements.json"
+ target_system="${DEVENV_SYSTEM:-${RUNNER_OS:-unknown}}"
- if [ "$can_render_pr_comment" = "true" ]; then
- repo="${GITHUB_REPOSITORY:?GITHUB_REPOSITORY not set}"
- comment_tmp_dir="$(mktemp -d)"
- comments_json="$comment_tmp_dir/comments.json"
- comment_body="$comment_tmp_dir/comment.md"
- comment_id_file="$comment_tmp_dir/comment-id.txt"
- chart_file="$comment_tmp_dir/perf-change-vs-baseline.svg"
- chart_dark_file="$comment_tmp_dir/perf-change-vs-baseline-dark.svg"
- chart_png_file="$comment_tmp_dir/perf-change-vs-baseline.png"
- chart_dark_png_file="$comment_tmp_dir/perf-change-vs-baseline-dark.png"
- renderer_script="$comment_tmp_dir/render-ci-measurement-comment.mjs"
+ SCOPES_JSON='[{"id":"genie_ci_workflow","label":"Genie CI workflow helpers","group":"source / ci","path":["source","effect-utils","genie","ci-workflow"],"includePaths":["genie/ci-workflow",".github/workflows/ci.yml.genie.ts"],"includeExtensions":[".ts"]},{"id":"genie_runtime","label":"Genie runtime","group":"source / genie","path":["source","effect-utils","packages","genie"],"includePaths":["packages/@overeng/genie/src"],"includeExtensions":[".ts",".tsx"]},{"id":"nix_workspace_tools","label":"Nix workspace tools","group":"source / nix","path":["source","effect-utils","nix","workspace-tools"],"includePaths":["nix/workspace-tools"],"includeExtensions":[".nix"]}]' \
+ TARGET_PATH_JSON='["source","effect-utils"]' \
+ TARGET_ID="$target_id" \
+ TARGET_NAME="$target_name" \
+ TARGET_LABEL="$target_label" \
+ TARGET_GROUP="$target_group" \
+ TARGET_SYSTEM="$target_system" \
+ node <<'NODE' >"$artifact_file"
+ const cp = require('node:child_process')
+ const fs = require('node:fs')
+ const path = require('node:path')
- if ! gh api "repos/$repo/issues/$pr_number/comments" --paginate >"$comments_json"; then
- echo "::notice::unable to list PR comments; skipping CI measurement PR comment"
- can_render_pr_comment=false
- fi
+ const normalize = (value) => { // Canonicalize a path string: forward slashes, no leading "./", no trailing slashes.
+ const normalized = value.replace(/\\/g, '/').replace(/^\.\//, '').replace(/\/+$/, '') // backslashes -> "/", strip one leading "./", strip trailing "/"
+ return normalized === '.' ? '' : normalized // a bare "." collapses to the empty string
+ }
+ const scopes = JSON.parse(process.env.SCOPES_JSON || '[]')
+ const targetPath = JSON.parse(process.env.TARGET_PATH_JSON || '["source"]')
+ const gitFiles = cp
+ .execFileSync('git', ['ls-files', '-z'], { encoding: 'buffer' })
+ .toString('utf8')
+ .split('\0')
+ .filter(Boolean)
+ .map(normalize)
- if [ "$can_render_pr_comment" = "true" ]; then
- asset_branch="${CI_MEASUREMENT_PR_COMMENT_ASSET_BRANCH:-ci-measurement-assets}"
- asset_title="$(printf '%s' "${CI_MEASUREMENT_PR_COMMENT_TITLE:-ci-measurements}" | tr '[:upper:]' '[:lower:]' | sed -E 's/[^a-z0-9]+/-/g; s/^-+//; s/-+$//')"
- if [ -z "$asset_title" ]; then
- asset_title="ci-measurements"
- fi
- asset_head_sha="${CI_MEASUREMENT_SUBJECT_SHA:-${GITHUB_HEAD_SHA:-${GITHUB_SHA:-unknown}}}"
- asset_run_id="${GITHUB_RUN_ID:-local}"
- asset_run_attempt="${GITHUB_RUN_ATTEMPT:-0}"
- asset_svg_path="ci-measurements/pr-$pr_number/${asset_head_sha}/run-${asset_run_id}-attempt-${asset_run_attempt}/${asset_title}.svg"
- asset_png_path="ci-measurements/pr-$pr_number/${asset_head_sha}/run-${asset_run_id}-attempt-${asset_run_attempt}/${asset_title}.png"
- asset_dark_png_path="ci-measurements/pr-$pr_number/${asset_head_sha}/run-${asset_run_id}-attempt-${asset_run_attempt}/${asset_title}-dark.png"
- public_asset_command="${CI_MEASUREMENT_PR_COMMENT_PUBLIC_ASSET_COMMAND:-}"
- repo_private="$(gh api "repos/$repo" --jq '.private // false' 2>/dev/null || printf 'true')"
- require_public_asset=false
- if [ "$repo_private" = "true" ]; then
- require_public_asset=true
- fi
- if [ "${GITHUB_SERVER_URL:-https://github.com}" = "https://github.com" ]; then
- github_raw_chart_url="https://raw.githubusercontent.com/$repo/$asset_branch/$asset_png_path"
- github_raw_chart_dark_url="https://raw.githubusercontent.com/$repo/$asset_branch/$asset_dark_png_path"
- github_raw_chart_source_url="https://raw.githubusercontent.com/$repo/$asset_branch/$asset_svg_path"
- else
- github_raw_chart_url="${GITHUB_SERVER_URL:-https://github.com}/$repo/raw/$asset_branch/$asset_png_path"
- github_raw_chart_dark_url="${GITHUB_SERVER_URL:-https://github.com}/$repo/raw/$asset_branch/$asset_dark_png_path"
- github_raw_chart_source_url="${GITHUB_SERVER_URL:-https://github.com}/$repo/raw/$asset_branch/$asset_svg_path"
- fi
- if [ "$repo_private" = "true" ]; then
- chart_url=""
- chart_dark_url=""
- chart_source_url=""
- else
- chart_url="$github_raw_chart_url"
- chart_dark_url="$github_raw_chart_dark_url"
- chart_source_url="$github_raw_chart_source_url"
- fi
- export CI_MEASUREMENT_PR_COMMENT_CHART_URL="$chart_url"
- export CI_MEASUREMENT_PR_COMMENT_CHART_DARK_URL="$chart_dark_url"
- export CI_MEASUREMENT_PR_COMMENT_CHART_SOURCE_URL="$chart_source_url"
+ const includesPath = (file, candidates) => { // True when file equals one of candidates or lies beneath one of them.
+ if (!Array.isArray(candidates) || candidates.length === 0) return true // a missing or empty list places no restriction
+ return candidates.map(normalize).some((candidate) => candidate === '' || file === candidate || file.startsWith(candidate + '/')) // a candidate that normalizes to '' matches every file
+ }
- cat > "$renderer_script" <<'EOF'
- import { readFileSync, writeFileSync } from 'node:fs'
+ const excludesPath = (file, candidates) => // True when file equals an exclude candidate or lies beneath one.
+ Array.isArray(candidates) && // a non-array list excludes nothing (result is false)
+ candidates.map(normalize).some((candidate) => candidate !== '' && (file === candidate || file.startsWith(candidate + '/'))) // candidates that normalize to '' are ignored instead of matching everything
- const [comparisonPath, commentsPath, bodyPath, commentIdPath, chartPath, chartDarkPath] = process.argv.slice(2)
- const title = process.env.CI_MEASUREMENT_PR_COMMENT_TITLE || 'CI Measurements'
- const maxRows = Number.parseInt(process.env.CI_MEASUREMENT_PR_COMMENT_MAX_ROWS || '10', 10)
- const maxHistory = Number.parseInt(process.env.CI_MEASUREMENT_PR_COMMENT_MAX_HISTORY || '20', 10)
- const repo = process.env.GITHUB_REPOSITORY || 'unknown'
- const runId = process.env.GITHUB_RUN_ID || ''
- const runAttempt = process.env.GITHUB_RUN_ATTEMPT || ''
- const sha = process.env.GITHUB_SHA || ''
- const headSha = process.env.CI_MEASUREMENT_SUBJECT_SHA || process.env.GITHUB_HEAD_SHA || sha
- const serverUrl = process.env.GITHUB_SERVER_URL || 'https://github.com'
- const workflow = process.env.GITHUB_WORKFLOW || 'CI'
- const job = process.env.GITHUB_JOB || ''
- const chartUrl = process.env.CI_MEASUREMENT_PR_COMMENT_CHART_URL || ''
- const chartDarkUrl = process.env.CI_MEASUREMENT_PR_COMMENT_CHART_DARK_URL || ''
- const chartSourceUrl = process.env.CI_MEASUREMENT_PR_COMMENT_CHART_SOURCE_URL || ''
+ const matchesExtension = (file, extensions) => { // True when the file's extension is listed in extensions, compared case-insensitively.
+ if (!Array.isArray(extensions) || extensions.length === 0) return true // a missing or empty list accepts every file
+ const ext = path.extname(file).toLowerCase() // lower-cased so the comparison below ignores case
+ return extensions.map((extension) => extension.toLowerCase()).some((extension) => ext === extension) // exact match against each lower-cased entry
+ }
- const markerScope = (process.env.CI_MEASUREMENT_PR_COMMENT_MARKER || title)
- .toLowerCase()
- .replace(/[^a-z0-9]+/g, '-')
- .replace(/^-+|-+$/g, '') || 'default'
- const marker = ''
- const legacyMarker = ''
- const statePrefix = ''
- const stateTag = 'ci-measurement-comment-state'
- const schemaVersion = 1
+ const countLines = (file) => { // Count the lines of file; returns undefined when the file contains a NUL byte.
+ const buffer = fs.readFileSync(file) // raw bytes, no text decoding
+ if (buffer.includes(0)) return undefined // a NUL byte: no count is produced, and the scope loop skips undefined results
+ if (buffer.length === 0) return 0 // empty file has zero lines
+ let lines = 0
+ for (const byte of buffer) {
+ if (byte === 10) lines += 1 // 10 is the "\n" byte
+ }
+ return buffer[buffer.length - 1] === 10 ? lines : lines + 1 // a final line without a trailing newline still counts
+ }
- const comparison = JSON.parse(readFileSync(comparisonPath, 'utf8'))
- const comments = JSON.parse(readFileSync(commentsPath, 'utf8'))
- if (!Array.isArray(comments)) throw new Error('comments response must be an array')
+ const observations = []
+ const scopeSummaries = []
- const existing = comments.find((comment) => {
- if (typeof comment?.body !== 'string') return false
- return comment.body.includes(marker) ||
- (comment.body.includes(legacyMarker) && comment.body.includes('## ' + title))
- })
+ for (const scope of scopes) {
+ const root = normalize(scope.root || '.')
+ const includePaths = Array.isArray(scope.includePaths) && scope.includePaths.length > 0 ? scope.includePaths : [root]
+ const files = gitFiles
+ .filter((file) => includesPath(file, includePaths))
+ .filter((file) => !excludesPath(file, scope.excludePaths))
+ .filter((file) => matchesExtension(file, scope.includeExtensions))
- const extractState = (body) => {
- if (typeof body !== 'string') return undefined
- const start = body.indexOf(statePrefix)
- if (start === -1) return undefined
- const end = body.indexOf(stateSuffix, start + statePrefix.length)
- if (end === -1) return undefined
- try {
- const parsed = JSON.parse(body.slice(start + statePrefix.length, end))
- if (parsed && parsed._tag === stateTag && Array.isArray(parsed.runs)) return parsed
- } catch {
- return undefined
+ let lineCount = 0
+ let measuredFileCount = 0
+ for (const file of files) {
+ const lines = countLines(file)
+ if (lines === undefined) continue
+ lineCount += lines
+ measuredFileCount += 1
}
- return undefined
+
+ const group = scope.group || 'source shape'
+ const scopePath = Array.isArray(scope.path) ? scope.path : ['source', scope.id]
+ const policy = scope.gate || { enabled: false, minBaselineSources: 3, minCurrentSamples: 1 }
+ observations.push(
+ {
+ id: 'source.lines',
+ label: scope.label + ' lines',
+ group,
+ path: scopePath,
+ description: 'Tracked non-binary source lines in the configured scope.',
+ measurementKind: 'deterministic',
+ name: 'source.lines',
+ unit: 'lines',
+ value: lineCount,
+ dimensions: { scope: scope.id },
+ policy,
+ statistics: { sampleCount: 1, measuredSampleCount: measuredFileCount },
+ },
+ {
+ id: 'source.files',
+ label: scope.label + ' files',
+ group,
+ path: scopePath,
+ description: 'Tracked non-binary source files in the configured scope.',
+ measurementKind: 'deterministic',
+ name: 'source.files',
+ unit: 'count',
+ value: measuredFileCount,
+ dimensions: { scope: scope.id },
+ policy,
+ statistics: { sampleCount: 1, measuredSampleCount: measuredFileCount },
+ },
+ )
+ scopeSummaries.push({
+ id: scope.id,
+ label: scope.label,
+ root,
+ includePaths,
+ excludePaths: scope.excludePaths || [],
+ includeExtensions: scope.includeExtensions || [],
+ fileCount: measuredFileCount,
+ lineCount,
+ })
+ }
+
+ const artifact = {
+ schemaVersion: 1,
+ generatedAt: new Date().toISOString().replace(/\.\d{3}Z$/, 'Z'),
+ producer: {
+ name: 'effect-utils-ci-measurement',
+ version: 1,
+ measurementProtocol: 'source-shape-v1',
+ },
+ subject: {
+ repo: process.env.GITHUB_REPOSITORY || 'unknown',
+ branchKind: process.env.GITHUB_EVENT_NAME || 'unknown',
+ ref: process.env.CI_MEASUREMENT_SUBJECT_REF || process.env.GITHUB_REF || 'unknown',
+ headSha: process.env.CI_MEASUREMENT_SUBJECT_SHA || process.env.GITHUB_SHA || 'unknown',
+ baseSha: process.env.GITHUB_BASE_SHA || '',
+ },
+ execution: {
+ provider: process.env.GITHUB_RUN_ID && process.env.GITHUB_RUN_ID !== 'unknown' ? 'github-actions' : 'local',
+ workflow: 'CI',
+ job: process.env.GITHUB_JOB || 'unknown',
+ runId: process.env.GITHUB_RUN_ID || 'unknown',
+ runAttempt: process.env.GITHUB_RUN_ATTEMPT || 'unknown',
+ taskId: process.env.CROSSTASK_TASK_ID || '',
+ attemptId: process.env.CROSSTASK_ATTEMPT_ID || '',
+ traceId: process.env.TRACE_ID || '',
+ runner: {
+ name: process.env.RUNNER_NAME || 'unknown',
+ os: process.env.RUNNER_OS || 'unknown',
+ arch: process.env.RUNNER_ARCH || 'unknown',
+ class: process.env.RUNNER_CLASS || 'unknown',
+ },
+ },
+ target: {
+ kind: 'source-shape',
+ id: process.env.TARGET_ID,
+ name: process.env.TARGET_NAME,
+ label: process.env.TARGET_LABEL,
+ group: process.env.TARGET_GROUP,
+ path: targetPath,
+ system: process.env.TARGET_SYSTEM,
+ },
+ observations,
+ details: { scopes: scopeSummaries },
}
- const formatNumber = (value) => {
- if (value === null || value === undefined || Number.isNaN(value)) return 'n/a'
- if (Number.isInteger(value)) return String(value)
- return String(Math.round(value * 1000) / 1000)
- }
+ process.stdout.write(JSON.stringify(artifact, null, 2) + '\n')
+ NODE
- const formatValue = (value, unit) => {
- if (value === null || value === undefined) return 'n/a'
- if (unit === 'bytes') {
- if (value >= 1073741824) return formatNumber(Math.round((value / 1073741824) * 10) / 10) + ' GiB'
- if (value >= 1048576) return formatNumber(Math.round((value / 1048576) * 10) / 10) + ' MiB'
- if (value >= 1024) return formatNumber(Math.round((value / 1024) * 10) / 10) + ' KiB'
- return formatNumber(value) + ' B'
- }
- if (unit === 'seconds') return formatNumber(value) + ' s'
- return formatNumber(value) + (unit ? ' ' + unit : '')
- }
+ cat "$artifact_file"
- const formatDelta = (value, unit) => {
- if (value === null || value === undefined) return 'n/a'
- const sign = value >= 0 ? '+' : '-'
- return sign + formatValue(Math.abs(value), unit)
- }
+ - name: Compare CI measurements with baseline
+ shell: bash
+ env:
+ CI_MEASUREMENT_CURRENT_DIR: tmp/source-shape-ci/current
+ CI_MEASUREMENT_BASELINE_DIR: tmp/source-shape-ci/baseline
+ CI_MEASUREMENT_COMPARISON_FILE: tmp/source-shape-ci/measurement-comparison.json
+ CI_MEASUREMENT_REGRESSION_MODE: warn
+ CI_MEASUREMENT_PR_COMMENT_ENABLED: 'false'
+ CI_MEASUREMENT_PR_COMMENT_TITLE: Source Shape Measurements
+ CI_MEASUREMENT_PR_COMMENT_MAX_ROWS: '12'
+ CI_MEASUREMENT_PR_COMMENT_MAX_HISTORY: '20'
+ CI_MEASUREMENT_PR_COMMENT_ASSET_BRANCH: ci-measurement-assets
+ run: |
+ set -euo pipefail
- const formatRatio = (value) => {
- if (value === null || value === undefined) return 'n/a'
- return formatNumber(Math.round((value - 1) * 1000) / 10) + '%'
- }
+ export PATH="/run/current-system/sw/bin:/usr/bin:/bin:$PATH"
- const formatSemanticImpact = (value) => {
- if (value === null || value === undefined || Number.isNaN(value)) return 'n/a'
- if (Math.abs(value) < 0.005) return '0.00x'
- const sign = value > 0 ? '+' : ''
- return sign + formatNumber(Math.round(value * 100) / 100) + 'x'
- }
+ current_dir="${CI_MEASUREMENT_CURRENT_DIR:?CI_MEASUREMENT_CURRENT_DIR not set}"
+ baseline_dir="${CI_MEASUREMENT_BASELINE_DIR:?CI_MEASUREMENT_BASELINE_DIR not set}"
+ comparison_file="${CI_MEASUREMENT_COMPARISON_FILE:?CI_MEASUREMENT_COMPARISON_FILE not set}"
+ mode="${CI_MEASUREMENT_REGRESSION_MODE:-warn}"
+ mkdir -p "$(dirname "$comparison_file")"
- const formatRowImpact = (row) => {
- if (row.confidence === 'diagnostic' || row.gateReason === 'disabled' || row.semanticImpactKind === 'diagnostic') {
- return 'diagnostic'
- }
- return formatSemanticImpact(row.semanticImpactScore)
- }
+ if [ "$mode" = "off" ]; then
+ jq -n --argjson schemaVersion 1 --arg status skipped --arg mode "$mode" \
+ '{schemaVersion:$schemaVersion,status:$status,mode:$mode,comparisons:{}}' \
+ >"$comparison_file"
+ exit 0
+ fi
- const formatEvidence = (row) => {
- const unit = row.observation?.unit
- if (row.comparisonMode === 'paired' && typeof row.evidenceDeltaLower === 'number' && typeof row.evidenceDeltaUpper === 'number') {
- const quantile = typeof row.pairedEvidenceQuantile === 'number'
- ? Math.round(row.pairedEvidenceQuantile * 100)
- : 25
- return (row.confidence || 'unknown')
- + '
paired n=' + (row.pairedSamples ?? 0)
- + ', ' + quantile + '-' + (100 - quantile) + '% delta '
- + formatValue(row.evidenceDeltaLower, unit)
- + ' - ' + formatValue(row.evidenceDeltaUpper, unit)
- + ''
- }
- return (row.confidence || 'unknown') + '
baseline n=' + (row.baselineSources ?? 0) + ', current samples=' + (row.currentSamples ?? 1) + ''
- }
+ current_index="$(mktemp)"
+ baseline_index="$(mktemp)"
+ find "$current_dir" -name baseline -type d -prune -o -name measurements.json -type f -print | sort >"$current_index" || true
+ {
+ find "$baseline_dir" -name baseline -type d ! -path "$baseline_dir" -prune -o -name measurements.json -type f -print
+ } | sort -u >"$baseline_index" || true
- const interpretation = (row) => {
- if (row.confidence === 'low_baseline_count') return {
- label: 'Needs more baseline',
- detail: 'Not enough compatible baseline runs to make this gate trustworthy.',
- tone: 'neutral',
- color: '#94a3b8',
- }
- if (row.confidence === 'low_current_sample_count') return {
- label: 'Needs repeat',
- detail: 'Current run has too few successful measured samples.',
- tone: 'neutral',
- color: '#94a3b8',
- }
- if (row.confidence === 'low_paired_sample_count') return {
- label: 'Needs paired evidence',
- detail: 'Wall-clock gates require same-run base/head samples before they can block merges.',
- tone: 'neutral',
- color: '#94a3b8',
- }
- if (row.confidence === 'missing_paired_delta') return {
- label: 'Needs paired delta stats',
- detail: 'Wall-clock gates require per-pair delta statistics, not only paired medians.',
- tone: 'neutral',
- color: '#94a3b8',
- }
- if (row.confidence === 'paired_uncertain') return {
- label: 'Uncertain wall-clock movement',
- detail: 'The paired median moved, but the paired delta band still crosses the configured budget.',
- tone: 'neutral',
- color: '#94a3b8',
- }
- if (row.confidence === 'diagnostic') return {
- label: 'Diagnostic only',
- detail: 'Shown for investigation, but intentionally excluded from gating.',
- tone: 'diagnostic',
- color: '#a78bfa',
- }
- if (row.status === 'fail') return {
- label: 'Regression - blocks merge',
- detail: 'Worse than the configured fail threshold with enough samples.',
- tone: 'bad',
- color: '#ef4444',
- }
- if (row.status === 'warn') return {
- label: 'Regression - review',
- detail: 'Worse than the configured warning threshold.',
- tone: 'warn',
- color: '#f59e0b',
- }
- if (row.status === 'missing_baseline') return {
- label: 'No baseline yet',
- detail: 'Current value is measured, but no comparable baseline exists.',
- tone: 'neutral',
- color: '#94a3b8',
- }
- if (row.confidence === 'noise_floor') return {
- label: 'Too small to matter',
- detail: 'The absolute change is below the noise floor for this metric.',
- tone: 'neutral',
- color: '#94a3b8',
- }
- if (row.confidence === 'within_baseline_range') return {
- label: 'Historical range only',
- detail: 'Inside the full historical min/max range, but this range is not used to pass a gate.',
- tone: 'neutral',
- color: '#94a3b8',
- }
- if (row.confidence === 'within_robust_band' || row.confidence === 'within_baseline_distribution') return {
- label: 'Within noise band',
- detail: 'Current and baseline robust noise bands overlap.',
- tone: 'neutral',
- color: '#94a3b8',
- }
- if (row.direction === 'improved' && typeof row.semanticImpactScore === 'number' && row.semanticImpactScore <= -1) return {
- label: 'Meaningfully lower',
- detail: 'Lower than baseline by enough to cross the configured review threshold.',
- tone: 'good',
- color: '#10b981',
- }
- if (row.direction === 'improved') return {
- label: 'Slightly lower, ok',
- detail: 'Lower than baseline, but still inside the configured review budget.',
- tone: 'neutral',
- color: '#94a3b8',
- }
- if (row.direction === 'regressed') return {
- label: 'Slightly higher, ok',
- detail: 'Higher than baseline but still inside the configured budget.',
- tone: 'neutral',
- color: '#94a3b8',
- }
- return {
- label: 'Unchanged',
- detail: 'No meaningful movement from baseline.',
- tone: 'neutral',
- color: '#94a3b8',
- }
- }
+ if [ ! -s "$current_index" ]; then
+ echo "::error::no current measurements.json files found under $current_dir"
+ exit 1
+ fi
- const formatGate = (row) => {
- if (row.gateable) return 'yes'
- const reason = row.gateReason || row.confidence || 'unknown'
- return 'no
' + reason + ''
- }
+ current_json="$comparison_file.current.json"
+ baseline_json="$comparison_file.baseline.json"
+ xargs -r jq -s '.' <"$current_index" >"$current_json"
+ if [ -s "$baseline_index" ]; then
+ xargs -r jq -s '.' <"$baseline_index" >"$baseline_json"
+ else
+ printf '[]\n' >"$baseline_json"
+ fi
- const escapeCell = (value) => String(value ?? '-').replaceAll('|', '\\|').replaceAll('\n', '
')
- const escapeXml = (value) => String(value)
- .replaceAll('&', '&')
- .replaceAll('<', '<')
- .replaceAll('>', '>')
- .replaceAll('"', '"')
+ jq -n \
+ --slurpfile current "$current_json" \
+ --slurpfile baseline "$baseline_json" \
+ --argjson schemaVersion 1 \
+ --arg mode "$mode" \
+ --arg currentDir "$current_dir" \
+ --arg baselineDir "$baseline_dir" \
+ '
+ def identity_dimensions:
+ (.dimensions // {})
+ | to_entries
+ | map(select(.key as $key | ["devenvRev", "otelServiceName", "status", "probeLabel", "sampleCount", "measuredSampleCount"] | index($key) | not))
+ | sort_by(.key)
+ | map("\(.key)=\(.value|tostring)")
+ | join(",");
- const humanProbe = (row) => {
- if (row.observation?.label) return row.observation.label
- const probe = row.observation?.dimensions?.probe
- const name = row.observation?.name || 'unknown'
- const labels = {
- shell_eval_traced: 'Shell eval with OTEL trace',
- shell_eval_warm: 'Warm shell eval',
- tasks_list: 'devenv tasks list',
- processes_help: 'devenv processes --help',
- task_pnpm_install: 'pnpm:install',
- task_genie_run: 'genie:run',
- task_check_quick: 'check:quick',
- task_check_quick_warm: 'Warm cached check:quick',
- task_check_quick_forced: 'Forced check:quick',
- }
- if (probe && labels[probe]) return labels[probe]
- if (name.startsWith('devenv.') && name.endsWith('.duration')) {
- return name.slice('devenv.'.length, -'.duration'.length).replaceAll('_', ' ')
- }
- return name
- }
+ def observation_key($doc):
+ [
+ ($doc.target.kind // "unknown"),
+ ($doc.target.id // $doc.target.name // "unknown"),
+ ($doc.target.system // "unknown"),
+ (.id // .name // "unknown"),
+ (.unit // "unknown"),
+ identity_dimensions
+ ] | join("|");
- const semanticPath = (row) => {
- const parts = [
- ...(Array.isArray(row.target?.path) ? row.target.path : []),
- row.target?.group,
- ...(Array.isArray(row.observation?.path) ? row.observation.path : []),
- row.observation?.group,
- ].filter((value) => typeof value === 'string' && value.length > 0)
- const seen = new Set()
- const unique = parts.filter((part) => {
- if (seen.has(part)) return false
- seen.add(part)
- return true
- })
- return unique.length > 0 ? unique.join(' / ') : '-'
- }
+ def median:
+ sort as $sorted
+ | ($sorted | length) as $count
+ | if $count == 0 then null
+ elif ($count % 2) == 1 then $sorted[($count / 2 | floor)]
+ else (($sorted[($count / 2 - 1)] + $sorted[($count / 2)]) / 2)
+ end;
- const chartProbe = (row) => {
- if (row.observation?.label) return row.observation.label
- const probe = row.observation?.dimensions?.probe
- const labels = {
- shell_eval_traced: 'Shell eval with OTEL trace',
- shell_eval_warm: 'Warm shell eval',
- tasks_list: 'devenv tasks list',
- processes_help: 'processes --help',
- task_pnpm_install: 'pnpm:install',
- task_genie_run: 'genie:run',
- task_check_quick: 'check:quick',
- task_check_quick_warm: 'Warm cached check:quick',
- task_check_quick_forced: 'Forced check:quick',
- }
- if (probe && labels[probe]) return labels[probe]
- return humanProbe(row)
- }
+ def percentile($p):
+ sort as $sorted
+ | ($sorted | length) as $count
+ | if $count == 0 then null
+ else $sorted[(($p * ($count - 1)) | floor)]
+ end;
- const dimensions = (row) => {
- const entries = Object.entries(row.observation?.dimensions || {})
- if (entries.length === 0) return '-'
- return entries
- .sort(([left], [right]) => left.localeCompare(right))
- .map(([key, value]) => key + '=' + String(value))
- .join('
')
- }
+ def abs_value: if . < 0 then -. else . end;
- const rank = (row) => {
- if (row.status === 'fail') return 0
- if (row.status === 'warn') return 1
- if (row.status === 'missing_baseline') return 3
- return 2
- }
+ def observations_by_key($docs):
+ reduce $docs[]? as $doc
+ ({};
+ reduce (($doc.observations // [])[]? | select(.value | type == "number")) as $obs
+ (.;
+ ($obs | observation_key($doc)) as $key
+ | .[$key] = ((.[$key] // []) + [{
+ target: $doc.target,
+ observation: $obs,
+ generatedAt: $doc.generatedAt
+ }])
+ )
+ );
- const allRows = Object.values(comparison.comparisons || {}).sort((left, right) => {
- const byRank = rank(left) - rank(right)
- if (byRank !== 0) return byRank
- const leftImpact = typeof left.semanticImpactScore === 'number' ? Math.abs(left.semanticImpactScore) : 0
- const rightImpact = typeof right.semanticImpactScore === 'number' ? Math.abs(right.semanticImpactScore) : 0
- if (rightImpact !== leftImpact) return rightImpact - leftImpact
- const leftDelta = typeof left.delta === 'number' ? Math.abs(left.delta) : 0
- const rightDelta = typeof right.delta === 'number' ? Math.abs(right.delta) : 0
- if (rightDelta !== leftDelta) return rightDelta - leftDelta
- return humanProbe(left).localeCompare(humanProbe(right))
- })
- const protocolLabel = (() => {
- const protocols = new Set(
- allRows
- .map((row) => row.observation?.dimensions?.measurementProtocol)
- .filter((value) => typeof value === 'string' && value.length > 0),
- )
- return protocols.size > 0 ? Array.from(protocols).join(', ') : 'legacy'
- })()
- const visibleLimit = Number.isFinite(maxRows) && maxRows > 0 ? maxRows : 10
- const comparableRows = allRows.filter((row) => typeof row.baseline === 'number')
- const hasComparableBaseline = comparableRows.length > 0
- const isDiagnosticRow = (row) =>
- row.status === 'missing_baseline' ||
- row.confidence === 'diagnostic' ||
- row.gateReason === 'disabled' ||
- row.semanticImpactKind === 'diagnostic' ||
- (!row.gateable && typeof row.baseline !== 'number')
- const isZeroImpactRow = (row) =>
- typeof row.semanticImpactScore === 'number' &&
- !Number.isNaN(row.semanticImpactScore) &&
- Math.abs(row.semanticImpactScore) < 0.005
- const actionableComparableRows = comparableRows.filter((row) => !isDiagnosticRow(row))
- const visibleRows = (hasComparableBaseline
- ? actionableComparableRows
- : allRows.filter((row) => !isDiagnosticRow(row)).sort((left, right) => (right.current || 0) - (left.current || 0))
- ).slice(0, visibleLimit)
- const nonZeroImpactRows = actionableComparableRows.filter((row) => !isZeroImpactRow(row))
- const zeroImpactRows = actionableComparableRows.filter(isZeroImpactRow)
- const visibleNonZeroImpactRows = nonZeroImpactRows.slice(0, visibleLimit)
- const diagnosticRows = allRows.filter(isDiagnosticRow)
+ def observation_stats($items):
+ ($items | map(.observation.value)) as $values
+ | ($items | map(.observation.comparison.baseline // empty)) as $pairedBaselineValues
+ | ($items | map(.observation.statistics.pairedDeltaMedian // empty)) as $pairedDeltaMedianValues
+ | ($items | map(.observation.statistics.pairedDeltaP25 // empty)) as $pairedDeltaP25Values
+ | ($items | map(.observation.statistics.pairedDeltaP75 // empty)) as $pairedDeltaP75Values
+ | ($items | map(.observation.statistics.pairedDeltaMad // empty)) as $pairedDeltaMadValues
+ | ($items | map(.observation.statistics.pairedDeltaSamples // []) | add // []) as $pairedDeltaSampleValues
+ | ($items | map(.observation.statistics.measuredSampleCount // .observation.statistics.sampleCount // 1) | add // ($items | length)) as $sampleCount
+ | ($values | median) as $median
+ | {
+ target: ($items[0].target // {}),
+ observation: ($items[-1].observation // {}),
+ measurementKind: ($items[-1].observation.measurementKind // null),
+ value: $median,
+ min: ($values | min),
+ max: ($values | max),
+ p25: ($values | percentile(0.25)),
+ p75: ($values | percentile(0.75)),
+ p95: ($values | percentile(0.95)),
+ mad: ($values | map(. - $median | if . < 0 then -. else . end) | median),
+ sourceCount: ($items | length),
+ sampleCount: $sampleCount,
+ pairedSampleCount: ($items | map(.observation.statistics.pairedSampleCount // .observation.comparison.pairedSampleCount // 0) | add // 0),
+ pairedBaselineValue: (if ($pairedBaselineValues | length) == 0 then null else ($pairedBaselineValues | median) end),
+ pairedDeltaMedianValue: (if ($pairedDeltaMedianValues | length) == 0 then null else ($pairedDeltaMedianValues | median) end),
+ pairedDeltaP25Value: (if ($pairedDeltaP25Values | length) == 0 then null else ($pairedDeltaP25Values | median) end),
+ pairedDeltaP75Value: (if ($pairedDeltaP75Values | length) == 0 then null else ($pairedDeltaP75Values | median) end),
+ pairedDeltaMadValue: (if ($pairedDeltaMadValues | length) == 0 then null else ($pairedDeltaMadValues | median) end),
+ pairedDeltaSampleValues: $pairedDeltaSampleValues,
+ generatedAt: ($items[-1].generatedAt // null)
+ };
- const baselineToCurrent = (row) => {
- const unit = row.observation?.unit
- return formatValue(row.baseline, unit) + ' -> ' + formatValue(row.current, unit)
- }
+ def budget($metric; $unit):
+ if $metric == "nix.closure.nar_size" then
+ {warnRatio:1.05, failRatio:1.10, warnAbs:52428800, failAbs:209715200, statisticalToleranceRatio:0.02, statisticalToleranceAbs:10485760}
+ elif $metric == "nix.closure.bucket.nar_size" then
+ {warnRatio:1.10, failRatio:1.20, warnAbs:52428800, failAbs:209715200, statisticalToleranceRatio:0.05, statisticalToleranceAbs:10485760}
+ elif $metric == "nix.closure.path_count" then
+ {warnRatio:1.05, failRatio:1.10, warnAbs:100, failAbs:500, statisticalToleranceRatio:0.02, statisticalToleranceAbs:10}
+ elif $unit == "seconds" then
+ {warnRatio:1.10, failRatio:1.20, warnAbs:0.25, failAbs:1, statisticalToleranceRatio:0.10, statisticalToleranceAbs:0.25}
+ else
+ {warnRatio:1.25, failRatio:1.50, warnAbs:1, failAbs:3, statisticalToleranceRatio:0.10, statisticalToleranceAbs:1}
+ end;
- const rawChange = (row) => {
- const unit = row.observation?.unit
- return formatDelta(row.delta, unit) + ' / ' + formatRatio(row.ratio)
- }
+ def noise_floor($metric; $unit):
+ if $metric == "nix.closure.nar_size" or $metric == "nix.closure.bucket.nar_size" then 10485760
+ elif $metric == "nix.closure.path_count" then 10
+ elif $unit == "seconds" then 0.1
+ else 0
+ end;
+ def default_policy($metric; $unit):
+ budget($metric; $unit) as $b
+ | noise_floor($metric; $unit) as $noise
+ | $b + {
+ enabled:true,
+ comparisonMode:(if $metric == "nix.closure.nar_size" or $metric == "nix.closure.bucket.nar_size" or $metric == "nix.closure.path_count" or $unit != "seconds" then "budget" else "historical" end),
+ minBaselineSources:(if $metric == "nix.closure.nar_size" or $metric == "nix.closure.bucket.nar_size" or $metric == "nix.closure.path_count" or $unit != "seconds" then 1 else 10 end),
+ minCurrentSamples:(if $unit == "seconds" then 3 else 1 end),
+ minPairedSamples:(if $unit == "seconds" then 5 else 0 end),
+ noiseFloor:$noise
+ };
+ def observation_policy($obs):
+ default_policy($obs.name // "unknown"; $obs.unit // "unknown") + ($obs.policy // {});
+ def policy_enabled($policy):
+ if ($policy | has("enabled")) then $policy.enabled else true end;
+
+ def classify($metric; $unit; $measurementKind; $policy; $current; $currentP25; $currentP75; $currentMad; $baseline; $baselineMin; $baselineMax; $baselineP25; $baselineP75; $baselineP95; $baselineMad; $currentSamples; $baselineSources; $pairedSamples; $pairedDeltaMedian; $pairedDeltaP25; $pairedDeltaP75; $pairedDeltaMad; $pairedDeltaValues):
+ $policy as $b
+ | ($policy.comparisonMode // (if $measurementKind == "deterministic" or $unit != "seconds" then "budget" elif $measurementKind == "diagnostic" then "diagnostic" else "historical" end)) as $comparisonMode
+ | ($policy.noiseFloor // noise_floor($metric; $unit)) as $noise
+ | ($current - $baseline) as $delta
+ | (if $comparisonMode == "paired" and $pairedDeltaMedian != null then $pairedDeltaMedian else $delta end) as $evidenceDelta
+ | (($policy.pairedEvidenceQuantile // 0.25) | tonumber) as $pairedEvidenceQuantile
+ | (if $baseline > 0 then ($current / $baseline) else null end) as $ratio
+ | (($baselineP75 // $baseline) - ($baselineP25 // $baseline)) as $iqr
+ | (($currentP75 // $current) - ($currentP25 // $current)) as $currentIqr
+ | (($pairedDeltaP75 // $evidenceDelta) - ($pairedDeltaP25 // $evidenceDelta)) as $pairedDeltaIqr
+ | ([
+ $noise,
+ (($policy.statisticalToleranceAbs // 0) | tonumber),
+ (if $baseline > 0 then ($baseline * (($policy.statisticalToleranceRatio // 0) | tonumber)) else 0 end),
+ (($baselineMad // 0) * 3),
+ (($iqr // 0) * 1.5)
+ ] | max) as $robustTolerance
+ | (if $currentSamples > 1 then ([
+ $noise,
+ (($policy.statisticalToleranceAbs // 0) | tonumber),
+ (if $current > 0 then ($current * (($policy.statisticalToleranceRatio // 0) | tonumber)) else 0 end),
+ (($currentMad // 0) * 3),
+ (($currentIqr // 0) * 1.5)
+ ] | max) else 0 end) as $currentRobustTolerance
+ | ([
+ $noise,
+ (($policy.statisticalToleranceAbs // 0) | tonumber),
+ (if $baseline > 0 then ($baseline * (($policy.statisticalToleranceRatio // 0) | tonumber)) else 0 end),
+ (($pairedDeltaMad // 0) * 3),
+ (($pairedDeltaIqr // 0) * 1.5)
+ ] | max) as $pairedDeltaTolerance
+ | ($baseline + $robustTolerance) as $robustUpper
+ | ($baseline - $robustTolerance) as $robustLower
+ | ($current + $currentRobustTolerance) as $currentRobustUpper
+ | ($current - $currentRobustTolerance) as $currentRobustLower
+ | (if $comparisonMode == "paired" and ($pairedDeltaValues | length) > 0 then ($pairedDeltaValues | percentile($pairedEvidenceQuantile)) else ($evidenceDelta - $pairedDeltaTolerance) end) as $evidenceDeltaLower
+ | (if $comparisonMode == "paired" and ($pairedDeltaValues | length) > 0 then ($pairedDeltaValues | percentile(1 - $pairedEvidenceQuantile)) else ($evidenceDelta + $pairedDeltaTolerance) end) as $evidenceDeltaUpper
+ | ([($b.warnAbs // 0), (if $baseline > 0 then ($baseline * (($b.warnRatio // 1) - 1)) else 0 end), $noise, 0.000000001] | max) as $warnBudget
+ | ([($b.failAbs // 0), (if $baseline > 0 then ($baseline * (($b.failRatio // 1) - 1)) else 0 end), $noise, 0.000000001] | max) as $failBudget
+ | ($comparisonMode != "paired") as $needsHistoricalBaselineCount
+ | (
+ ($current >= $robustLower and $current <= $robustUpper)
+ or ($currentRobustTolerance > 0 and $currentRobustLower <= $robustUpper and $currentRobustUpper >= $robustLower)
+ ) as $withinRobustBand
+ | ($comparisonMode == "historical" and $measurementKind != "deterministic") as $canUseRobustBandSuppression
+ | (
+ $baselineMin != null
+ and $baselineMax != null
+ and $current >= $baselineMin
+ and $current <= $baselineMax
+ ) as $withinBaselineRange
+ | (
+ if $baseline <= 0 then "unknown"
+ elif $comparisonMode == "paired" and $evidenceDeltaLower > $failBudget then "fail"
+ elif $comparisonMode == "paired" and $evidenceDeltaLower > $warnBudget then "warn"
+ elif $comparisonMode == "paired" then "pass"
+ elif ($delta > $b.failAbs and $current > ($baseline * $b.failRatio)) then "fail"
+ elif ($delta > $b.warnAbs and $current > ($baseline * $b.warnRatio)) then "warn"
+ else "pass"
+ end
+ ) as $thresholdStatus
+ | (
+ policy_enabled($policy) == true
+ and $baseline > 0
+ and (if $needsHistoricalBaselineCount then $baselineSources >= ($policy.minBaselineSources // 1) else true end)
+ and $currentSamples >= ($policy.minCurrentSamples // 1)
+ and (if $comparisonMode == "paired" then $pairedSamples >= ($policy.minPairedSamples // 1) else true end)
+ and (if $comparisonMode == "paired" then $pairedDeltaMedian != null else true end)
+ ) as $gateable
+ | (
+ if (policy_enabled($policy) != true) then "disabled"
+ elif $baseline <= 0 then "missing_baseline"
+ elif $needsHistoricalBaselineCount and $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
+ elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count"
+ elif $comparisonMode == "paired" and $pairedSamples < ($policy.minPairedSamples // 1) then "low_paired_sample_count"
+ elif $comparisonMode == "paired" and $pairedDeltaMedian == null then "missing_paired_delta"
+ else "eligible"
+ end
+ ) as $gateReason
+ | (
+ if $baseline <= 0 then "unknown"
+ elif (policy_enabled($policy) != true) then "diagnostic"
+ elif ($delta | abs_value) <= $noise then "noise_floor"
+ elif $needsHistoricalBaselineCount and $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
+ elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count"
+ elif $comparisonMode == "paired" and $pairedSamples < ($policy.minPairedSamples // 1) then "low_paired_sample_count"
+ elif $comparisonMode == "paired" and $pairedDeltaMedian == null then "missing_paired_delta"
+ elif $comparisonMode == "paired" and $thresholdStatus == "pass" and $evidenceDelta > $warnBudget then "paired_uncertain"
+ elif ($canUseRobustBandSuppression and $thresholdStatus != "pass" and $withinRobustBand) then "within_robust_band"
+ elif $thresholdStatus == "pass" then "within_budget"
+ else "threshold_exceeded"
+ end
+ ) as $confidence
+ | (
+ if ($gateable and $confidence == "threshold_exceeded") then $thresholdStatus
+ elif $thresholdStatus == "unknown" then "unknown"
+ else "pass"
+ end
+ ) as $status
+ | (
+ if $baseline <= 0 then "unknown"
+ elif $comparisonMode == "paired" and ($evidenceDelta | abs_value) <= $noise then "unchanged"
+ elif $comparisonMode == "paired" and $evidenceDeltaLower <= 0 and $evidenceDeltaUpper >= 0 then "unchanged"
+ elif $comparisonMode == "paired" and $evidenceDelta < 0 then "improved"
+ elif $comparisonMode == "paired" then "regressed"
+ elif ($delta | abs_value) <= $noise then "unchanged"
+ elif $canUseRobustBandSuppression and $withinRobustBand then "unchanged"
+ elif $delta < 0 then "improved"
+ else "regressed"
+ end
+ ) as $direction
+ | (
+ if $baseline <= 0 then null
+ elif (policy_enabled($policy) != true) then null
+ elif $comparisonMode == "paired" and ($evidenceDeltaLower <= 0 and $evidenceDeltaUpper >= 0) then 0
+ elif $comparisonMode == "paired" and ($evidenceDelta | abs_value) <= $noise then 0
+ elif $comparisonMode == "paired" and $evidenceDelta > 0 then ([0, $evidenceDeltaLower] | max) / $warnBudget
+ elif $comparisonMode == "paired" then -(([0, (-$evidenceDeltaUpper)] | max) / $warnBudget)
+ elif $canUseRobustBandSuppression and $withinRobustBand then 0
+ elif ($delta | abs_value) <= $noise then 0
+ elif ($confidence == "threshold_exceeded" and $delta > 0) then ([0, ($currentRobustLower - $robustUpper), $delta] | max) / $warnBudget
+ elif ($confidence == "threshold_exceeded" and $delta < 0) then -(([0, ($robustLower - $currentRobustUpper), (-$delta)] | max) / $warnBudget)
+ elif $delta > 0 then ([0, ($currentRobustLower - $robustUpper)] | max) / $warnBudget
+ else -(([0, ($robustLower - $currentRobustUpper)] | max) / $warnBudget)
+ end
+ ) as $semanticImpactScore
+ | (
+ if (policy_enabled($policy) != true) then "diagnostic"
+ elif $semanticImpactScore == null then "unknown"
+ elif $semanticImpactScore == 0 then "neutral"
+ elif $semanticImpactScore >= ($failBudget / $warnBudget) then "fail_boundary"
+ elif $semanticImpactScore >= 1 then "warn_boundary"
+ elif $semanticImpactScore > 0 then "below_warn_boundary"
+ else "improvement"
+ end
+ ) as $semanticImpactKind
+ | {status:$status,current:$current,baseline:$baseline,delta:$delta,ratio:$ratio,budget:$b,gatePolicy:$policy,comparisonMode:$comparisonMode,gateable:$gateable,gateReason:$gateReason,confidence:$confidence,direction:$direction,semanticImpactScore:$semanticImpactScore,semanticImpactKind:$semanticImpactKind,semanticWarnBudget:$warnBudget,semanticFailBudget:$failBudget,baselineRobustLower:$robustLower,baselineRobustUpper:$robustUpper,baselineRobustTolerance:$robustTolerance,currentRobustLower:$currentRobustLower,currentRobustUpper:$currentRobustUpper,currentRobustTolerance:$currentRobustTolerance,withinBaselineRange:$withinBaselineRange,pairedSamples:$pairedSamples,evidenceDelta:$evidenceDelta,evidenceDeltaLower:$evidenceDeltaLower,evidenceDeltaUpper:$evidenceDeltaUpper,evidenceDeltaTolerance:$pairedDeltaTolerance,pairedEvidenceQuantile:$pairedEvidenceQuantile,pairedEvidenceProtocol:(if $comparisonMode == "paired" and ($pairedDeltaValues | length) > 0 then "paired-delta-quantile-v1" elif $comparisonMode == "paired" then "paired-summary-robust-band-v1" else null end)};
- const confidenceSummary = (row) => {
- const unit = row.observation?.unit
- if (row.comparisonMode === 'paired' && typeof row.evidenceDeltaLower === 'number' && typeof row.evidenceDeltaUpper === 'number') {
- const quantile = typeof row.pairedEvidenceQuantile === 'number'
- ? Math.round(row.pairedEvidenceQuantile * 100)
- : 25
- return 'paired n=' + (row.pairedSamples ?? 0)
- + ', ' + quantile + '-' + (100 - quantile) + '% delta '
- + formatValue(row.evidenceDeltaLower, unit)
- + '..' + formatValue(row.evidenceDeltaUpper, unit)
- }
- return (row.confidence || 'unknown') + ', baseline n=' + (row.baselineSources ?? 0) + ', current n=' + (row.currentSamples ?? 1)
- }
+ (observations_by_key($current[0]) | with_entries(.value = observation_stats(.value))) as $currentObs
+ | (observations_by_key($baseline[0]) | with_entries(.value = observation_stats(.value))) as $baselineObs
+ | (
+ $currentObs
+ | to_entries
+ | map(
+ .key as $key
+ | .value as $currentValue
+ | ($baselineObs[$key] // null) as $baselineValue
+ | ($currentValue.observation | observation_policy(.)) as $policy
+ | ($policy.comparisonMode // (if ($currentValue.observation.measurementKind // $currentValue.measurementKind) == "deterministic" or ($currentValue.observation.unit // "") != "seconds" then "budget" elif ($currentValue.observation.measurementKind // $currentValue.measurementKind) == "diagnostic" then "diagnostic" else "historical" end)) as $comparisonMode
+ | ($currentValue.pairedBaselineValue // null) as $pairedBaselineValue
+ | (if $comparisonMode == "paired" and $pairedBaselineValue != null then {
+ value: $pairedBaselineValue,
+ min: $pairedBaselineValue,
+ max: $pairedBaselineValue,
+ p25: $pairedBaselineValue,
+ p75: $pairedBaselineValue,
+ p95: $pairedBaselineValue,
+ mad: 0,
+ sourceCount: $currentValue.pairedSampleCount
+ } else $baselineValue end) as $effectiveBaselineValue
+ | {
+ key: $key,
+ value: (
+ if $effectiveBaselineValue == null then
+ {
+ status: "missing_baseline",
+ target: $currentValue.target,
+ observation: $currentValue.observation,
+ current: $currentValue.value,
+ currentSamples: $currentValue.sampleCount,
+ baselineSources: 0,
+ gatePolicy: $policy,
+ comparisonMode: $comparisonMode,
+ gateable: false,
+ gateReason: "missing_baseline",
+ confidence: "missing_baseline",
+ direction: "unknown"
+ }
+ else
+ classify(
+ $currentValue.observation.name;
+ $currentValue.observation.unit;
+ ($currentValue.observation.measurementKind // $currentValue.measurementKind);
+ $policy;
+ $currentValue.value;
+ $currentValue.p25;
+ $currentValue.p75;
+ $currentValue.mad;
+ $effectiveBaselineValue.value;
+ $effectiveBaselineValue.min;
+ $effectiveBaselineValue.max;
+ $effectiveBaselineValue.p25;
+ $effectiveBaselineValue.p75;
+ $effectiveBaselineValue.p95;
+ $effectiveBaselineValue.mad;
+ $currentValue.sampleCount;
+ $effectiveBaselineValue.sourceCount;
+ $currentValue.pairedSampleCount;
+ $currentValue.pairedDeltaMedianValue;
+ $currentValue.pairedDeltaP25Value;
+ $currentValue.pairedDeltaP75Value;
+ $currentValue.pairedDeltaMadValue;
+ ($currentValue.pairedDeltaSampleValues // [])
+ ) + {
+ target: $currentValue.target,
+ observation: $currentValue.observation,
+ currentSamples: $currentValue.sampleCount,
+ baselineSources: $effectiveBaselineValue.sourceCount,
+ baselineMin: $effectiveBaselineValue.min,
+ baselineMax: $effectiveBaselineValue.max,
+ baselineP25: $effectiveBaselineValue.p25,
+ baselineP75: $effectiveBaselineValue.p75,
+ baselineP95: $effectiveBaselineValue.p95
+ ,baselineMad: $effectiveBaselineValue.mad
+ }
+ end
+ )
+ }
+ )
+ | from_entries
+ ) as $comparisons
+ | (
+ if any($comparisons[]?; .status == "fail") then "fail"
+ elif any($comparisons[]?; .status == "warn") then "warn"
+ elif any($comparisons[]?;
+ (if (.gatePolicy | has("enabled")) then .gatePolicy.enabled else true end)
+ and (.gateReason == "missing_baseline"
+ or .gateReason == "low_baseline_count"
+ or .gateReason == "low_current_sample_count"
+ or .gateReason == "low_paired_sample_count"
+ or .gateReason == "missing_paired_delta")
+ ) then "partial"
+ else "pass"
+ end
+ ) as $status
+ | (
+ [$comparisons[]?]
+ | {
+ enabledCount: (map(select((if (.gatePolicy | has("enabled")) then .gatePolicy.enabled else true end))) | length),
+ gateableCount: (map(select(.gateable == true)) | length),
+ missingBaselineCount: (map(select(.gateReason == "missing_baseline")) | length),
+ lowBaselineCount: (map(select(.gateReason == "low_baseline_count")) | length),
+ lowCurrentSampleCount: (map(select(.gateReason == "low_current_sample_count")) | length),
+ lowPairedSampleCount: (map(select(.gateReason == "low_paired_sample_count")) | length),
+ missingPairedDeltaCount: (map(select(.gateReason == "missing_paired_delta")) | length)
+ }
+ | . + {
+ nonGateableCount: (.enabledCount - .gateableCount),
+ enforceable: (.enabledCount == .gateableCount)
+ }
+ ) as $readiness
+ | {
+ schemaVersion:$schemaVersion,
+ status:$status,
+ mode:$mode,
+ readiness:$readiness,
+ currentDir:$currentDir,
+ baselineDir:$baselineDir,
+ comparisons:$comparisons
+ }
+ ' >"$comparison_file"
- const scanDecision = (row) => {
- if (row.status === 'fail') return 'regression blocks'
- if (row.status === 'warn') return 'regression review'
- if (row.status === 'missing_baseline') return 'needs baseline'
- if (row.direction === 'improved') return 'faster'
- if (row.direction === 'regressed') return 'no material impact'
- return 'unchanged'
- }
+ baseline_provenance_file="$baseline_dir/baseline-provenance.json"
+ if [ -f "$baseline_provenance_file" ]; then
+ comparison_with_provenance="$(mktemp)"
+ jq --slurpfile baselineProvenance "$baseline_provenance_file" \
+ '. + {baselineProvenance: ($baselineProvenance[0] // null)}' \
+ "$comparison_file" >"$comparison_with_provenance"
+ mv "$comparison_with_provenance" "$comparison_file"
+ fi
- const scanTable = (rows) => {
- if (rows.length === 0) return 'No non-zero actionable measurement impact detected.'
- return [
- '| What changed? | Probe | Baseline -> current | Raw change | Impact | Confidence |',
- '| --- | --- | --- | ---: | ---: | --- |',
- ...rows.map((row) => {
- return '| ' + [
- scanDecision(row),
- humanProbe(row),
- baselineToCurrent(row),
- rawChange(row),
- formatRowImpact(row),
- confidenceSummary(row),
- ].map(escapeCell).join(' | ') + ' |'
- }),
- ].join('\n')
- }
+ status="$(jq -r '.status' "$comparison_file")"
+ exit_code=0
+ case "$status:$mode" in
+ fail:fail)
+ echo "::error::CI measurement regression detected"
+ exit_code=1
+ ;;
+ fail:*|warn:*)
+ echo "::warning::CI measurement regression threshold exceeded"
+ ;;
+ partial:*)
+ echo "::notice::CI measurement comparison is partial because one or more enabled observations are not gateable"
+ ;;
+ esac
- const zeroImpactTable = (rows) => {
- if (rows.length === 0) return 'No zero-impact measurements.'
- return [
- '| Probe | Baseline -> current | Raw change | Impact | Gate | Evidence | Why hidden |',
- '| --- | --- | ---: | ---: | --- | --- | --- |',
- ...rows.map((row) => {
- const meaning = interpretation(row)
- return '| ' + [
- humanProbe(row),
- baselineToCurrent(row),
- rawChange(row),
- formatRowImpact(row),
- row.gateable ? 'yes' : (row.gateReason || 'no'),
- confidenceSummary(row),
- meaning.label,
- ].map(escapeCell).join(' | ') + ' |'
- }),
- ].join('\n')
- }
+ if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then
+ {
+ echo "### ${CI_MEASUREMENT_PR_COMMENT_TITLE:-CI Measurements}"
+ echo ""
+ jq -r '"- Status: " + .status + "\n- Gate: " + (if .mode == "fail" then "enforced" elif .mode == "warn" then "advisory" elif .mode == "off" then "off" else (.mode // "unknown") end) + "\n- Baseline: " + .baselineDir' "$comparison_file"
+ echo ""
+ echo "| Status | Gate | Target | Observation | Current | Baseline | Delta | Ratio |"
+ echo "| --- | --- | --- | --- | ---: | ---: | ---: | ---: |"
+ jq -r '
+ .comparisons
+ | to_entries
+ | sort_by(
+ if .value.status == "fail" then 0
+ elif .value.status == "warn" then 1
+ elif .value.status == "missing_baseline" then 2
+ else 3
+ end
+ )
+ | .[:20]
+ | .[]
+ | .value as $v
+ | [
+ $v.status,
+ (if ($v.gateable // false) then "yes" else ($v.gateReason // "no") end),
+ (($v.target.kind // "unknown") + "/" + ($v.target.name // "unknown") + "/" + ($v.target.system // "unknown")),
+ ($v.observation.name // "unknown"),
+ (($v.current // $v.observation.value // 0) | tostring),
+ (($v.baseline // "") | tostring),
+ (($v.delta // "") | tostring),
+ (if $v.ratio == null or $v.ratio == "" then "" else (($v.ratio * 100 | round / 100) | tostring) end)
+ ]
+ | "| " + (map(gsub("\\|"; "\\\\|")) | join(" | ")) + " |"
+ ' "$comparison_file"
+ } >>"$GITHUB_STEP_SUMMARY"
+ fi
- const diagnosticTable = (rows) => {
- if (rows.length === 0) return 'No diagnostic or ungated measurements.'
- return [
- '| Probe | Current | Baseline | Impact | Gate | Reason | Evidence |',
- '| --- | ---: | ---: | ---: | --- | --- | --- |',
- ...rows.map((row) => {
- return '| ' + [
- humanProbe(row),
- formatValue(row.current, row.observation?.unit),
- formatValue(row.baseline, row.observation?.unit),
- formatRowImpact(row),
- row.gateable ? 'yes' : (row.gateReason || row.status || 'no'),
- interpretation(row).label,
- confidenceSummary(row),
- ].map(escapeCell).join(' | ') + ' |'
- }),
- ].join('\n')
- }
- const comparisonTable = (rows) => {
- if (rows.length === 0) return 'No measurement regressions detected.'
- return [
- '| Group | Measurement | Baseline | Current | Raw change | Impact | Meaning | Gate | Evidence |',
- '| --- | --- | ---: | ---: | ---: | ---: | --- | --- | --- |',
- ...rows.map((row) => {
- const unit = row.observation?.unit
- const baselineRange = typeof row.baselineRobustLower === 'number' && typeof row.baselineRobustUpper === 'number' && row.baselineRobustLower !== row.baselineRobustUpper
- ? '
noise band ' + formatValue(row.baselineRobustLower, unit) + ' - ' + formatValue(row.baselineRobustUpper, unit) + ''
- : typeof row.baselineMin === 'number' && typeof row.baselineMax === 'number' && row.baselineMin !== row.baselineMax
- ? '
range ' + formatValue(row.baselineMin, unit) + ' - ' + formatValue(row.baselineMax, unit) + ''
- : ''
- const meaning = interpretation(row)
- return '| ' + [
- semanticPath(row),
- humanProbe(row),
- formatValue(row.baseline, unit) + baselineRange,
- formatValue(row.current, unit),
- formatDelta(row.delta, unit) + ' / ' + formatRatio(row.ratio),
- formatRowImpact(row),
- meaning.label + '
' + meaning.detail + '',
- formatGate(row),
- formatEvidence(row),
- ].map(escapeCell).join(' | ') + ' |'
- }),
- ].join('\n')
- }
- const currentOnlyTable = (rows) => {
- if (rows.length === 0) return 'No current measurements found.'
- return [
- '| Group | Measurement | Current |',
- '| --- | --- | ---: |',
- ...rows.map((row) => {
- return '| ' + [semanticPath(row), humanProbe(row), formatValue(row.current, row.observation?.unit)].map(escapeCell).join(' | ') + ' |'
- }),
- ].join('\n')
- }
+ if [ "$exit_code" -ne 0 ]; then
+ exit "$exit_code"
+ fi
- const allMeasurementsTable = (rows) => {
- if (rows.length === 0) return 'No measurement regressions detected.'
- return [
- '| Status | Gate | Target | Observation | Dimensions | Baseline | Current | Delta | Ratio | Impact |',
- '| --- | --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: |',
- ...rows.map((row) => {
- const unit = row.observation?.unit
- return '| ' + [
- row.status,
- row.gateable ? 'yes' : (row.gateReason || 'no'),
- row.target?.label || row.target?.name || 'unknown',
- row.observation?.label || row.observation?.name || 'unknown',
- dimensions(row),
- formatValue(row.baseline, unit),
- formatValue(row.current, unit),
- formatDelta(row.delta, unit),
- formatRatio(row.ratio),
- formatRowImpact(row),
- ].map(escapeCell).join(' | ') + ' |'
- }),
- ].join('\n')
- }
+ if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
+ - name: 'Upload CI measurements: source-shape'
+ if: always()
+ uses: actions/upload-artifact@v4
+ with:
+ name: source-shape
+ path: |
+ tmp/source-shape-ci
+ !tmp/source-shape-ci/baseline/**
+ if-no-files-found: error
+ retention-days: 30
+ concurrency:
+ group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-source-shape"
+ cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
+ ci-measurements-report:
+ name: ci/measurements-report
+ if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
+ needs: [devenv-perf, nix-closure-sizes, source-shape]
+ runs-on:
+ [namespace-profile-linux-x86-64, 'namespace-features:github.run-id=${{ github.run_id }}']
+ timeout-minutes: 30
+ defaults:
+ run:
+ shell: bash
+ permissions:
+ actions: read
+ contents: write
+ issues: write
+ pull-requests: write
+ env:
+ CI_MEASUREMENT_SUBJECT_REF: ${{ inputs.measurement_baseline_ref || github.event.pull_request.head.ref || github.ref }}
+ CI_MEASUREMENT_SUBJECT_SHA: ${{ inputs.measurement_baseline_ref || github.event.pull_request.head.sha || github.sha }}
+ CI_MEASUREMENT_SUBJECT_LABEL: ${{ inputs.measurement_baseline_label }}
+ CI_MEASUREMENT_ALLOW_PROBE_FAILURES: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && '1' || '' }}
+ steps:
+ - uses: actions/checkout@v6
+ - name: Install Nix
+ uses: DeterminateSystems/determinate-nix-action@v3
+ with:
+ extra-conf: |
+ experimental-features = nix-command flakes
+ accept-flake-config = true
+ extra-substituters = https://devenv.cachix.org
+ extra-trusted-public-keys = devenv.cachix.org-1:w1cLUi8dv3hnoSPGAuibQv+f9TZLr6cv/Hm9XgU50cw=
+ access-tokens = github.com=${{ github.token }}
+ summarize: true
+ - name: Provide CI measurement report tools
+ shell: bash
+ run: |
+ set -euo pipefail
+ for out in $(nix build --no-link --print-out-paths nixpkgs#jq nixpkgs#nodejs nixpkgs#gh nixpkgs#resvg); do
+ echo "$out/bin" >> "$GITHUB_PATH"
+ done
+ - name: 'Download current measurement artifact: devenv-perf'
+ uses: actions/download-artifact@v4
+ with:
+ name: devenv-perf
+ path: tmp/ci-measurement-report/current/devenv-perf
+ - name: 'Download current measurement artifact: nix-closure-measurements'
+ uses: actions/download-artifact@v4
+ with:
+ name: nix-closure-measurements
+ path: tmp/ci-measurement-report/current/nix-closure-measurements
+ - name: 'Download current measurement artifact: source-shape'
+ uses: actions/download-artifact@v4
+ with:
+ name: source-shape
+ path: tmp/ci-measurement-report/current/source-shape
+ - name: 'Download previous artifact: devenv-perf'
+ shell: bash
+ env:
+ GH_TOKEN: ${{ github.token }}
+ BASELINE_ARTIFACT_NAME: devenv-perf
+ BASELINE_OUTPUT_DIR: tmp/ci-measurement-report/baseline/devenv-perf
+ BASELINE_WORKFLOW_NAME: ${{ github.workflow }}
+ BASELINE_BRANCH: ${{ github.base_ref || github.ref_name }}
+ BASELINE_SEED_RUNS_JSON: '[]'
+ BASELINE_MAX_RUNS: '20'
+ BASELINE_MAX_CANDIDATE_RUNS: '60'
+ BASELINE_REQUIRED_OBSERVATIONS_JSON: '[]'
+ run: |
+ set -euo pipefail
- const sourceMeasurement = (row) => ({
- id: row.observation?.dimensions?.probe || row.observation?.name || humanProbe(row),
- label: humanProbe(row),
- group: semanticPath(row),
- status: row.status,
- direction: row.direction,
- gateable: row.gateable,
- gateReason: row.gateReason,
- confidence: row.confidence,
- comparisonMode: row.comparisonMode,
- unit: row.observation?.unit,
- baseline: row.baseline ?? null,
- current: row.current ?? null,
- delta: row.delta ?? null,
- ratio: row.ratio ?? null,
- semanticImpactScore: row.semanticImpactScore ?? null,
- semanticImpactKind: row.semanticImpactKind ?? null,
- baselineSources: row.baselineSources ?? null,
- currentSamples: row.currentSamples ?? null,
- pairedSamples: row.pairedSamples ?? null,
- evidenceDeltaLower: row.evidenceDeltaLower ?? null,
- evidenceDeltaUpper: row.evidenceDeltaUpper ?? null,
- pairedEvidenceQuantile: row.pairedEvidenceQuantile ?? null,
- dimensions: row.observation?.dimensions || {},
- })
+ mkdir -p "$BASELINE_OUTPUT_DIR"
- const truncate = (value, maxLength) => {
- const text = String(value)
- if (text.length <= maxLength) return text
- if (maxLength <= 1) return text.slice(0, maxLength)
- return text.slice(0, Math.max(0, maxLength - 3)) + '...'
- }
+ if command -v gh >/dev/null 2>&1; then
+ GH_BIN="$(command -v gh)"
+ else
+ echo "::notice::gh is not on PATH; resolving GitHub CLI through Nix"
+ if ! GH_BIN="$(nix build --no-link --print-out-paths nixpkgs#gh 2>/dev/null)/bin/gh"; then
+ echo "::notice::unable to resolve GitHub CLI through Nix; skipping previous artifact download"
+ exit 0
+ fi
+ fi
+ echo "Using GitHub CLI: $GH_BIN"
+
+ repo="${GITHUB_REPOSITORY:?GITHUB_REPOSITORY not set}"
+ workflow="${BASELINE_WORKFLOW_NAME:-CI}"
+ branch="${BASELINE_BRANCH:-${GITHUB_BASE_REF:-${GITHUB_REF_NAME:-main}}}"
+ seed_runs_file="$BASELINE_OUTPUT_DIR/baseline-seed-runs.json"
+ required_observations_file="$BASELINE_OUTPUT_DIR/baseline-required-observations.json"
+ printf '%s' "${BASELINE_SEED_RUNS_JSON:-[]}" >"$seed_runs_file"
+ printf '%s' "${BASELINE_REQUIRED_OBSERVATIONS_JSON:-[]}" >"$required_observations_file"
+ if ! jq -e 'if type == "array" then all(.[]; type == "object" and (.runId | type == "string")) else false end' \
+ "$seed_runs_file" >/dev/null; then
+ echo "::error::BASELINE_SEED_RUNS_JSON must be an array of objects with string runId fields"
+ exit 1
+ fi
+ if ! jq -e 'if type == "array" then all(.[]; type == "object" and (.id | type == "string") and (.minSources | type == "number")) else false end' \
+ "$required_observations_file" >/dev/null; then
+ echo "::error::BASELINE_REQUIRED_OBSERVATIONS_JSON must be an array of objects with string id and numeric minSources fields"
+ exit 1
+ fi
+ seed_run_ids="$(jq -r '.[].runId' "$seed_runs_file")"
+ required_observation_count="$(jq 'length' "$required_observations_file")"
+ max_candidate_runs="${BASELINE_MAX_CANDIDATE_RUNS:-${BASELINE_MAX_RUNS:-5}}"
+ if ! [[ "$max_candidate_runs" =~ ^[0-9]+$ ]] || [ "$max_candidate_runs" -lt 1 ]; then
+ max_candidate_runs=1
+ fi
- const renderPerfChangeSvg = (rows, theme = 'adaptive') => {
- const chartRows = rows
- .filter((row) => typeof row.current === 'number' && typeof row.baseline === 'number')
- .filter((row) => row.gateable === true)
- .filter((row) => typeof row.semanticImpactScore === 'number')
- .sort((left, right) => (left.semanticImpactScore || 0) - (right.semanticImpactScore || 0))
- .slice(0, visibleLimit)
- if (chartRows.length === 0) return ''
+ candidate_runs="$(
+ "$GH_BIN" run list \
+ --repo "$repo" \
+ --workflow "$workflow" \
+ --branch "$branch" \
+ --event push \
+ --status success \
+ --json databaseId,headSha \
+ --limit "$max_candidate_runs" \
+ --jq '[.[] | select(.headSha != env.GITHUB_SHA) | .databaseId] | .[]'
+ )"
- const impactScores = chartRows.map((row) => row.semanticImpactScore || 0)
- const minImpact = Math.min(-1, ...impactScores)
- const maxImpact = Math.max(1, ...impactScores)
- const lower = Math.floor(minImpact)
- const upper = Math.ceil(maxImpact)
- const span = upper - lower || 1
- const width = 1040
- const rowHeight = 46
- const height = 112 + chartRows.length * rowHeight + 34
- const labelX = 230
- const plotX = 252
- const plotWidth = 320
- const impactX = 596
- const nominalX = 672
- const meaningX = 804
- const topY = 92
- const barHeight = 18
- const zeroX = plotX + ((0 - lower) / span) * plotWidth
- const themeCss = theme === 'dark'
- ? [
- ' .chart-bg { fill: #0d1117; }',
- ' .chart-border { fill: none; stroke: #30363d; }',
- ' .chart-title { fill: #f0f6fc; }',
- ' .chart-muted { fill: #8b949e; }',
- ' .chart-axis { stroke: #8b949e; }',
- ' .chart-label { fill: #c9d1d9; }',
- ' .chart-value { fill: #8b949e; }',
- ' .chart-track { fill: #21262d; }',
- ]
- : [
- ' .chart-bg { fill: #ffffff; }',
- ' .chart-border { fill: none; stroke: #d0d7de; }',
- ' .chart-title { fill: #24292f; }',
- ' .chart-muted { fill: #57606a; }',
- ' .chart-axis { stroke: #8c959f; }',
- ' .chart-label { fill: #24292f; }',
- ' .chart-value { fill: #57606a; }',
- ' .chart-track { fill: #f6f8fa; }',
- ...(theme === 'adaptive'
- ? [
- ' @media (prefers-color-scheme: dark) {',
- ' .chart-bg { fill: #0d1117; }',
- ' .chart-border { stroke: #30363d; }',
- ' .chart-title { fill: #f0f6fc; }',
- ' .chart-muted { fill: #8b949e; }',
- ' .chart-axis { stroke: #8b949e; }',
- ' .chart-label { fill: #c9d1d9; }',
- ' .chart-value { fill: #8b949e; }',
- ' .chart-track { fill: #21262d; }',
- ' }',
- ]
- : []),
- ]
+ candidate_runs="$seed_run_ids
+ $candidate_runs"
- const svg = [
- '',
- '',
- '',
- '',
- '',
- 'Actionable measurement impact',
- '0 means no actionable PR impact; 1x reaches the warning budget.',
- 'improved',
- 'regressed',
- 'impact',
- 'baseline -> current',
- 'meaning',
- '',
- ]
+ max_runs="${BASELINE_MAX_RUNS:-5}"
+ if ! [[ "$max_runs" =~ ^[0-9]+$ ]] || [ "$max_runs" -lt 1 ]; then
+ max_runs=1
+ fi
- for (const [index, row] of chartRows.entries()) {
- const impact = row.semanticImpactScore || 0
- const y = topY + index * rowHeight
- const valueWidth = Math.max(2, Math.abs(impact) / span * plotWidth)
- const x = impact < 0 ? zeroX - valueWidth : zeroX
- const meaning = interpretation(row)
- const color = meaning.color
- const formattedImpact = formatSemanticImpact(impact)
- const label = chartProbe(row)
- const nominal = formatValue(row.baseline, row.observation?.unit).replaceAll(' ', '') + ' -> ' + formatValue(row.current, row.observation?.unit).replaceAll(' ', '')
- const barOpacity = meaning.tone === 'neutral' ? '0.65' : '1'
- const dash = meaning.tone === 'diagnostic' ? ' stroke-dasharray="3 3"' : ''
- svg.push(
- '' + escapeXml(label) + '' + escapeXml(truncate(label, 28)) + '',
- '',
- '',
- '' + escapeXml(formattedImpact) + '',
- '' + escapeXml(nominal) + '' + escapeXml(truncate(nominal, 21)) + '',
- '' + escapeXml(meaning.detail) + '' + escapeXml(truncate(meaning.label, 30)) + '',
- )
- }
+ write_baseline_observation_counts() { # Index downloaded measurements.json files and write per-id counts of numeric observations to baseline-observation-counts.json.
+ local measurement_index="$BASELINE_OUTPUT_DIR/baseline-measurement-files.txt" # sorted list of measurements.json paths found below
+ local counts_file="$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" # output: {counts: [...], required: [...]}
+ find "$BASELINE_OUTPUT_DIR" \
+ -mindepth 2 \
+ -maxdepth 2 \
+ -name measurements.json \
+ -type f \
+ -print \
+ | sort >"$measurement_index" || true # tolerate a find/sort failure so `set -e` does not abort the step
- svg.push(
- '0',
- '',
- )
- return svg.join('\n')
+ if [ -s "$measurement_index" ]; then # index is non-empty: feed the listed files to `jq -s`
+ xargs -r jq -s \
+ --slurpfile required "$required_observations_file" \
+ '
+ ([.[] | (.observations // [])[]? | select(.value | type == "number") | .id] | sort | group_by(.) | map({id: .[0], sources: length})) as $counts
+ | ($required[0] // []) as $requiredRows
+ | {
+ counts: $counts,
+ required: (
+ $requiredRows
+ | map(. as $requiredRow | ($counts | map(select(.id == $requiredRow.id)) | .[0].sources // 0) as $actual | $requiredRow + {sources:$actual, satisfied:($actual >= $requiredRow.minSources)})
+ )
+ }
+ ' <"$measurement_index" >"$counts_file"
+ else # no measurements.json found: report every required observation with sources:0, satisfied:false
+ jq -n --slurpfile required "$required_observations_file" \
+ '{counts: [], required: (($required[0] // []) | map(. + {sources:0, satisfied:false}))}' >"$counts_file"
+ fi
}
- const statusWord = comparison.status || 'unknown'
- const readiness = comparison.readiness || {}
- const readinessLabel = readiness.enforceable
- ? 'enforceable'
- : 'partial (' + (readiness.gateableCount ?? 0) + '/' + (readiness.enabledCount ?? 0) + ' enabled observations gateable)'
- const runUrl = runId ? serverUrl + '/' + repo + '/actions/runs/' + runId : undefined
- const shortSha = (headSha || sha || 'unknown').slice(0, 7)
- const existingState = extractState(existing?.body)
- const currentRun = {
- commitSha: headSha || sha || 'unknown',
- shortSha,
- generatedAt: new Date().toISOString(),
- status: statusWord,
- mode: comparison.mode || 'unknown',
- runUrl,
- runAttempt,
- workflow,
- job,
- visibleRows: visibleRows.map((row) => ({
- status: row.status,
- target: row.target?.label || row.target?.name || 'unknown',
- observation: row.observation?.label || row.observation?.name || 'unknown',
- meaning: interpretation(row).label,
- dimensions: dimensions(row).replaceAll('
', ', '),
- baseline: formatValue(row.baseline, row.observation?.unit),
- current: formatValue(row.current, row.observation?.unit),
- delta: formatDelta(row.delta, row.observation?.unit),
- ratio: formatRatio(row.ratio),
- impact: formatSemanticImpact(row.semanticImpactScore),
- })),
- }
- const hasComparableHistory = (run) => Array.isArray(run.visibleRows) && run.visibleRows.some((row) =>
- row.status !== 'missing_baseline' &&
- row.baseline !== 'n/a' &&
- row.ratio !== 'n/a'
- )
- const previousRuns = (existingState?.runs || []).filter((run) => run.commitSha !== currentRun.commitSha && hasComparableHistory(run))
- const historyLimit = Number.isFinite(maxHistory) && maxHistory > 0 ? maxHistory : 20
- const state = { _tag: stateTag, schemaVersion, title, runs: [currentRun, ...previousRuns].slice(0, historyLimit) }
- const gateModeLabel = (mode) => {
- if (mode === 'fail') return 'enforced'
- if (mode === 'warn') return 'advisory'
- if (mode === 'off') return 'off'
- return mode || 'unknown'
+ baseline_requirements_satisfied() { # Exit 0 when every required observation is marked satisfied in baseline-observation-counts.json, non-zero otherwise.
+ if [ "$required_observation_count" -eq 0 ]; then # no requirements configured (BASELINE_REQUIRED_OBSERVATIONS_JSON is an empty array)
+ return 0 # vacuously satisfied, so the download loop stops once BASELINE_MAX_RUNS artifacts are fetched
+ fi
+ write_baseline_observation_counts # refresh the counts file from the artifacts downloaded so far
+ jq -e '.required | all(.satisfied == true)' "$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" >/dev/null
}
- const historyRows = state.runs.slice(1).map((run) => {
- const link = run.runUrl ? '[' + run.shortSha + '](' + run.runUrl + ')' : run.shortSha
- const top = Array.isArray(run.visibleRows) && run.visibleRows.length > 0
- ? run.visibleRows.slice(0, 3).map((row) => (row.meaning || row.status) + ' ' + row.target + ' ' + row.observation + ' ' + row.delta + ' / ' + row.ratio).join('
')
- : 'No regressions'
- return '| ' + [link, run.status, gateModeLabel(run.mode), top].map(escapeCell).join(' | ') + ' |'
- })
- const runLink = runUrl ? '[workflow run](' + runUrl + ')' : 'workflow run unavailable'
- const baselineProvenance = comparison.baselineProvenance
- const baselineLabel = baselineProvenance?.runId
- ? '[main run ' + baselineProvenance.runId + '](' + serverUrl + '/' + repo + '/actions/runs/' + baselineProvenance.runId + ')' +
- (Array.isArray(baselineProvenance.runs) && baselineProvenance.runs.length > 1 ? ' + ' + (baselineProvenance.runs.length - 1) + ' older baseline runs' : '')
- : 'not available'
- const sourceOfTruth = {
- schemaVersion,
- title,
- status: statusWord,
- gate: gateModeLabel(comparison.mode),
- readiness: readinessLabel,
- commit: {
- shortSha,
- sha: headSha || sha || 'unknown',
- },
- run: {
- id: runId || null,
- attempt: runAttempt || null,
- url: runUrl || null,
- },
- baseline: baselineProvenance || null,
- protocol: protocolLabel,
- chart: {
- meaning: 'semantic-impact',
- zeroImpactMeaning: 'no actionable PR impact after budgets, noise floor, and robust evidence checks',
- svg: chartSourceUrl || null,
- lightPng: chartUrl || null,
- darkPng: chartDarkUrl || null,
- },
- measurements: allRows.map(sourceMeasurement),
- }
- const chartSvg = hasComparableBaseline && visibleRows.length > 0 ? renderPerfChangeSvg(visibleRows) : ''
- const chartDarkSvg = hasComparableBaseline && visibleRows.length > 0 ? renderPerfChangeSvg(visibleRows, 'dark') : ''
- if (chartPath && chartSvg) writeFileSync(chartPath, chartSvg)
- if (chartDarkPath && chartDarkSvg) writeFileSync(chartDarkPath, chartDarkSvg)
- const chartImageMarkdown = chartUrl && chartSvg
- ? (chartDarkUrl
- ? '\n' +
- ' \n' +
- ' \n' +
- '
\n' +
- ''
- : '')
- : ''
- const chartMarkdown = chartImageMarkdown
- ? chartImageMarkdown +
- (chartSourceUrl ? '\n\n[SVG source](' + chartSourceUrl + ')' : '')
- : ''
+ run_id=""
+ artifact_name=""
+ artifact_id=""
+ downloaded_runs_file="$BASELINE_OUTPUT_DIR/baseline-runs.jsonl"
+ seen_runs_file="$BASELINE_OUTPUT_DIR/baseline-seen-runs.txt"
+ : >"$downloaded_runs_file"
+ : >"$seen_runs_file"
+ for candidate_run in $candidate_runs; do
+ if [ -z "$candidate_run" ]; then
+ continue
+ fi
+ if grep -qxF "$candidate_run" "$seen_runs_file"; then
+ continue
+ fi
+ downloaded_count="$(wc -l <"$downloaded_runs_file" | tr -d ' ')"
+ if [ "$downloaded_count" -ge "$max_runs" ]; then
+ if baseline_requirements_satisfied; then
+ break
+ fi
+ echo "::notice::downloaded $downloaded_count baseline artifact(s), but required observation counts are not satisfied yet; continuing through bounded candidate history"
+ fi
+ if [ "$(wc -l <"$seen_runs_file" | tr -d ' ')" -ge "$max_candidate_runs" ]; then
+ break
+ fi
+ printf '%s\n' "$candidate_run" >>"$seen_runs_file"
+
+ artifact_json="$(
+ "$GH_BIN" api "repos/$repo/actions/runs/$candidate_run/artifacts" \
+ | jq --arg artifactName "$BASELINE_ARTIFACT_NAME" '.artifacts
+ | map(select(.expired == false))
+ | map(select(.name == $artifactName or (.name | startswith($artifactName + "-"))))
+ | sort_by(.created_at // "")
+ | reverse
+ | .[0] // empty'
+ )"
+
+ if [ -n "$artifact_json" ]; then
+ current_artifact_name="$(printf '%s' "$artifact_json" | jq -r '.name')"
+ current_artifact_id="$(printf '%s' "$artifact_json" | jq -r '.id')"
+ current_output_dir="$BASELINE_OUTPUT_DIR/run-$candidate_run"
+ mkdir -p "$current_output_dir"
+ if "$GH_BIN" run download "$candidate_run" \
+ --repo "$repo" \
+ --name "$current_artifact_name" \
+ --dir "$current_output_dir"; then
+ if [ -z "$run_id" ]; then
+ run_id="$candidate_run"
+ artifact_name="$current_artifact_name"
+ artifact_id="$current_artifact_id"
+ fi
+ jq -cn \
+ --arg runId "$candidate_run" \
+ --arg artifactName "$current_artifact_name" \
+ --arg artifactId "$current_artifact_id" \
+ --arg path "run-$candidate_run" \
+ '{runId:$runId, artifactName:$artifactName, artifactId:$artifactId, path:$path}' \
+ >>"$downloaded_runs_file"
+ else
+ echo "::notice::failed to download baseline artifact $current_artifact_name from run $candidate_run"
+ fi
+ fi
+ done
+
+ write_baseline_observation_counts
- const regressionCount = allRows.filter((row) => row.status === 'fail' || row.status === 'warn').length
- const improvementCount = comparableRows.filter((row) => row.direction === 'improved' && !isZeroImpactRow(row)).length
- const neutralCount = zeroImpactRows.length + diagnosticRows.length
- const humanSummary = hasComparableBaseline
- ? regressionCount > 0
- ? String(regressionCount) + ' regression' + (regressionCount === 1 ? '' : 's') + ' need review.'
- : improvementCount > 0
- ? 'No regressions. ' + String(improvementCount) + ' probe' + (improvementCount === 1 ? '' : 's') + ' got faster; ' + String(neutralCount) + ' neutral or ungated row' + (neutralCount === 1 ? '' : 's') + ' are collapsed below.'
- : 'No regressions. Comparable movement is below the semantic impact threshold; neutral rows are collapsed below.'
- : 'No compatible baseline was available, so this run shows current measurements only.'
+ if [ -z "$run_id" ] || [ -z "$artifact_name" ]; then
+ echo "::notice::no successful baseline run found for $repo workflow=$workflow branch=$branch"
+ exit 0
+ fi
- const summaryLines = [
- '## ' + title,
- '',
- '**' + statusWord + '** - ' + gateModeLabel(comparison.mode) + ' gate - readiness ' + readinessLabel + ' - commit ' + shortSha + ' - protocol ' + protocolLabel + '',
- '',
- '> ' + humanSummary,
- '',
- chartMarkdown,
- '',
- hasComparableBaseline
- ? scanTable(visibleNonZeroImpactRows)
- : currentOnlyTable(visibleRows),
- ]
+ jq -n \
+ --slurpfile runs "$downloaded_runs_file" \
+ --slurpfile seedRuns "$seed_runs_file" \
+ --slurpfile observationCounts "$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" \
+ --argjson schemaVersion 1 \
+ --arg repository "$repo" \
+ --arg workflow "$workflow" \
+ --arg branch "$branch" \
+ --arg runId "$run_id" \
+ --arg artifactName "$artifact_name" \
+ --arg artifactId "$artifact_id" \
+ '{
+ schemaVersion: $schemaVersion,
+ source: "github-actions-artifact",
+ repository: $repository,
+ workflow: $workflow,
+ branch: $branch,
+ runId: $runId,
+ artifactName: $artifactName,
+ artifactId: $artifactId,
+ seedRuns: ($seedRuns[0] // []),
+ runs: $runs,
+ observationCounts: ($observationCounts[0] // null)
+ }' >"$BASELINE_OUTPUT_DIR/baseline-provenance.json"
- if (hasComparableBaseline && zeroImpactRows.length > 0) {
- summaryLines.push(
- '',
- '',
- 'Unchanged / 0-impact measurements (' + zeroImpactRows.length + ')
',
- '',
- 'These rows had compatible baseline data, but their semantic impact rounded to 0.00x because the movement was below the configured budget, below the noise floor, or inside the robust noise band.',
- '',
- zeroImpactTable(zeroImpactRows),
- '',
- ' ',
- )
- }
+ echo "Downloaded $(wc -l <"$downloaded_runs_file" | tr -d ' ') baseline artifact(s), latest $artifact_name from run $run_id into $BASELINE_OUTPUT_DIR"
- if (diagnosticRows.length > 0) {
- summaryLines.push(
- '',
- '',
- 'Diagnostic / ungated measurements (' + diagnosticRows.length + ')
',
- '',
- diagnosticTable(diagnosticRows),
- '',
- ' ',
- )
- }
+ - name: 'Download previous artifact: nix-closure-measurements'
+ shell: bash
+ env:
+ GH_TOKEN: ${{ github.token }}
+ BASELINE_ARTIFACT_NAME: nix-closure-measurements
+ BASELINE_OUTPUT_DIR: tmp/ci-measurement-report/baseline/nix-closure-measurements
+ BASELINE_WORKFLOW_NAME: ${{ github.workflow }}
+ BASELINE_BRANCH: ${{ github.base_ref || github.ref_name }}
+ BASELINE_SEED_RUNS_JSON: '[]'
+ BASELINE_MAX_RUNS: '20'
+ BASELINE_MAX_CANDIDATE_RUNS: '60'
+ BASELINE_REQUIRED_OBSERVATIONS_JSON: '[]'
+ run: |
+ set -euo pipefail
- summaryLines.push(
- '',
- '',
- 'All measurements
',
- '',
- allMeasurementsTable(allRows),
- '',
- ' ',
- )
+ mkdir -p "$BASELINE_OUTPUT_DIR"
- if (historyRows.length > 0) {
- summaryLines.push(
- '',
- '',
- 'Previous runs
',
- '',
- '| Commit | Status | Gate | Top changes |',
- '| --- | --- | --- | --- |',
- ...historyRows,
- '',
- ' ',
- )
- }
+ if command -v gh >/dev/null 2>&1; then
+ GH_BIN="$(command -v gh)"
+ else
+ echo "::notice::gh is not on PATH; resolving GitHub CLI through Nix"
+ if ! GH_BIN="$(nix build --no-link --print-out-paths nixpkgs#gh 2>/dev/null)/bin/gh"; then
+ echo "::notice::unable to resolve GitHub CLI through Nix; skipping previous artifact download"
+ exit 0
+ fi
+ fi
+ echo "Using GitHub CLI: $GH_BIN"
- summaryLines.push(
- '',
- '',
- 'Source-of-truth JSON
',
- '',
- '~~~json',
- JSON.stringify(sourceOfTruth, null, 2),
- '~~~',
- '',
- ' ',
- )
+ repo="${GITHUB_REPOSITORY:?GITHUB_REPOSITORY not set}"
+ workflow="${BASELINE_WORKFLOW_NAME:-CI}"
+ branch="${BASELINE_BRANCH:-${GITHUB_BASE_REF:-${GITHUB_REF_NAME:-main}}}"
+ seed_runs_file="$BASELINE_OUTPUT_DIR/baseline-seed-runs.json"
+ required_observations_file="$BASELINE_OUTPUT_DIR/baseline-required-observations.json"
+ printf '%s' "${BASELINE_SEED_RUNS_JSON:-[]}" >"$seed_runs_file"
+ printf '%s' "${BASELINE_REQUIRED_OBSERVATIONS_JSON:-[]}" >"$required_observations_file"
+ if ! jq -e 'if type == "array" then all(.[]; type == "object" and (.runId | type == "string")) else false end' \
+ "$seed_runs_file" >/dev/null; then
+ echo "::error::BASELINE_SEED_RUNS_JSON must be an array of objects with string runId fields"
+ exit 1
+ fi
+ if ! jq -e 'if type == "array" then all(.[]; type == "object" and (.id | type == "string") and (.minSources | type == "number")) else false end' \
+ "$required_observations_file" >/dev/null; then
+ echo "::error::BASELINE_REQUIRED_OBSERVATIONS_JSON must be an array of objects with string id and numeric minSources fields"
+ exit 1
+ fi
+ seed_run_ids="$(jq -r '.[].runId' "$seed_runs_file")"
+ required_observation_count="$(jq 'length' "$required_observations_file")"
+ max_candidate_runs="${BASELINE_MAX_CANDIDATE_RUNS:-${BASELINE_MAX_RUNS:-5}}"
+ if ! [[ "$max_candidate_runs" =~ ^[0-9]+$ ]] || [ "$max_candidate_runs" -lt 1 ]; then
+ max_candidate_runs=1
+ fi
- summaryLines.push('', marker, statePrefix + JSON.stringify(state, null, 2) + stateSuffix)
- writeFileSync(bodyPath, summaryLines.join('\n') + '\n')
- writeFileSync(commentIdPath, existing?.id ? String(existing.id) : '')
- EOF
+ candidate_runs="$(
+ "$GH_BIN" run list \
+ --repo "$repo" \
+ --workflow "$workflow" \
+ --branch "$branch" \
+ --event push \
+ --status success \
+ --json databaseId,headSha \
+ --limit "$max_candidate_runs" \
+ --jq '[.[] | select(.headSha != env.GITHUB_SHA) | .databaseId] | .[]'
+ )"
- node "$renderer_script" "$comparison_file" "$comments_json" "$comment_body" "$comment_id_file" "$chart_file" "$chart_dark_file"
+ candidate_runs="$seed_run_ids
+ $candidate_runs"
+
+ max_runs="${BASELINE_MAX_RUNS:-5}"
+ if ! [[ "$max_runs" =~ ^[0-9]+$ ]] || [ "$max_runs" -lt 1 ]; then
+ max_runs=1
+ fi
+
+ write_baseline_observation_counts() { # Index downloaded measurements.json files and write per-id counts of numeric observations to baseline-observation-counts.json.
+ local measurement_index="$BASELINE_OUTPUT_DIR/baseline-measurement-files.txt" # sorted list of measurements.json paths found below
+ local counts_file="$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" # output: {counts: [...], required: [...]}
+ find "$BASELINE_OUTPUT_DIR" \
+ -mindepth 2 \
+ -maxdepth 2 \
+ -name measurements.json \
+ -type f \
+ -print \
+ | sort >"$measurement_index" || true # tolerate a find/sort failure so `set -e` does not abort the step
- if [ -s "$chart_file" ]; then
- if [ "$require_public_asset" = "true" ] && [ -z "$public_asset_command" ]; then
- echo "::error::CI measurement chart was rendered for a private repository, but CI_MEASUREMENT_PR_COMMENT_PUBLIC_ASSET_COMMAND is not configured. Private raw GitHub URLs cannot be embedded in PR comments."
- exit 1
- fi
+ if [ -s "$measurement_index" ]; then # index is non-empty: feed the listed files to `jq -s`
+ xargs -r jq -s \
+ --slurpfile required "$required_observations_file" \
+ '
+ ([.[] | (.observations // [])[]? | select(.value | type == "number") | .id] | sort | group_by(.) | map({id: .[0], sources: length})) as $counts
+ | ($required[0] // []) as $requiredRows
+ | {
+ counts: $counts,
+ required: (
+ $requiredRows
+ | map(. as $requiredRow | ($counts | map(select(.id == $requiredRow.id)) | .[0].sources // 0) as $actual | $requiredRow + {sources:$actual, satisfied:($actual >= $requiredRow.minSources)})
+ )
+ }
+ ' <"$measurement_index" >"$counts_file"
+ else # no measurements.json found: report every required observation with sources:0, satisfied:false
+ jq -n --slurpfile required "$required_observations_file" \
+ '{counts: [], required: (($required[0] // []) | map(. + {sources:0, satisfied:false}))}' >"$counts_file"
+ fi
+ }
- if ensure_ci_measurement_tool resvg resvg; then
- resvg_font_args=()
- if command -v nix >/dev/null 2>&1; then
- if font_out="$(nix build --no-link --print-out-paths nixpkgs#dejavu_fonts 2>/dev/null)"; then
- resvg_font_args+=(--use-fonts-dir "$font_out/share/fonts/truetype")
- fi
- fi
- if ! resvg --background '#ffffff' "${resvg_font_args[@]}" "$chart_file" "$chart_png_file"; then
- echo "::notice::unable to render CI measurement chart PNG"
- rm -f "$chart_png_file"
- fi
- if [ -s "$chart_dark_file" ] && ! resvg --background '#0d1117' "${resvg_font_args[@]}" "$chart_dark_file" "$chart_dark_png_file"; then
- echo "::notice::unable to render dark CI measurement chart PNG"
- rm -f "$chart_dark_png_file"
- fi
- else
- echo "::notice::resvg is not available; skipping embedded CI measurement chart PNG"
- fi
+ baseline_requirements_satisfied() { # Exit 0 when every required observation is marked satisfied in baseline-observation-counts.json, non-zero otherwise.
+ if [ "$required_observation_count" -eq 0 ]; then # no requirements configured (BASELINE_REQUIRED_OBSERVATIONS_JSON is an empty array)
+ return 0 # vacuously satisfied, so the download loop stops once BASELINE_MAX_RUNS artifacts are fetched
+ fi
+ write_baseline_observation_counts # refresh the counts file from the artifacts downloaded so far
+ jq -e '.required | all(.satisfied == true)' "$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" >/dev/null
+ }
- if ! gh api "repos/$repo/git/ref/heads/$asset_branch" >/dev/null 2>&1; then
- default_branch_sha="$(gh api "repos/$repo/git/ref/heads/${GITHUB_BASE_REF:-main}" --jq '.object.sha' 2>/dev/null || true)"
- if [ -z "$default_branch_sha" ]; then
- default_branch_sha="${GITHUB_SHA:-}"
- fi
- if [ -n "$default_branch_sha" ]; then
- gh api "repos/$repo/git/refs" --method POST --field ref="refs/heads/$asset_branch" --field sha="$default_branch_sha" >/dev/null || true
- fi
- fi
- chart_content="$(base64 <"$chart_file" | tr -d '\n')"
- if ! gh api "repos/$repo/contents/$asset_svg_path" --method PUT --field message="Update CI measurement chart SVG for PR #$pr_number" --field content="$chart_content" --field branch="$asset_branch" >/dev/null; then
- echo "::notice::unable to upload CI measurement chart SVG asset"
- if [ -z "$public_asset_command" ]; then
- sed -i.bak '/\[SVG source\]/d' "$comment_body"
- fi
- fi
- if [ -s "$chart_png_file" ]; then
- chart_png_content="$(base64 <"$chart_png_file" | tr -d '\n')"
- if ! gh api "repos/$repo/contents/$asset_png_path" --method PUT --field message="Update CI measurement chart PNG for PR #$pr_number" --field content="$chart_png_content" --field branch="$asset_branch" >/dev/null; then
- echo "::notice::unable to upload CI measurement chart PNG asset"
- if [ -z "$public_asset_command" ]; then
- sed -i.bak '/!\[Measurement change vs baseline chart\]/d; /!\[Perf change vs baseline chart\]/d; //,/<\\/picture>/d' "$comment_body"
- fi
- fi
- else
- sed -i.bak '/!\[Measurement change vs baseline chart\]/d; /!\[Perf change vs baseline chart\]/d; //,/<\\/picture>/d' "$comment_body"
- fi
- if [ -s "$chart_dark_png_file" ]; then
- chart_dark_png_content="$(base64 <"$chart_dark_png_file" | tr -d '\n')"
- if ! gh api "repos/$repo/contents/$asset_dark_png_path" --method PUT --field message="Update dark CI measurement chart PNG for PR #$pr_number" --field content="$chart_dark_png_content" --field branch="$asset_branch" >/dev/null; then
- echo "::notice::unable to upload dark CI measurement chart PNG asset"
- if [ -z "$public_asset_command" ]; then
- export CI_MEASUREMENT_PR_COMMENT_CHART_DARK_URL=""
- node "$renderer_script" "$comparison_file" "$comments_json" "$comment_body" "$comment_id_file" "$chart_file" "$chart_dark_file"
- fi
- fi
- fi
+ run_id=""
+ artifact_name=""
+ artifact_id=""
+ downloaded_runs_file="$BASELINE_OUTPUT_DIR/baseline-runs.jsonl"
+ seen_runs_file="$BASELINE_OUTPUT_DIR/baseline-seen-runs.txt"
+ : >"$downloaded_runs_file"
+ : >"$seen_runs_file"
+ for candidate_run in $candidate_runs; do
+ if [ -z "$candidate_run" ]; then
+ continue
+ fi
+ if grep -qxF "$candidate_run" "$seen_runs_file"; then
+ continue
+ fi
+ downloaded_count="$(wc -l <"$downloaded_runs_file" | tr -d ' ')"
+ if [ "$downloaded_count" -ge "$max_runs" ]; then
+ if baseline_requirements_satisfied; then
+ break
+ fi
+ echo "::notice::downloaded $downloaded_count baseline artifact(s), but required observation counts are not satisfied yet; continuing through bounded candidate history"
+ fi
+ if [ "$(wc -l <"$seen_runs_file" | tr -d ' ')" -ge "$max_candidate_runs" ]; then
+ break
+ fi
+ printf '%s\n' "$candidate_run" >>"$seen_runs_file"
- if [ -n "$public_asset_command" ] && [ -s "$chart_png_file" ]; then
- if public_chart_url="$(bash -c "$public_asset_command" _ "$chart_png_file" png)" && [ -n "$public_chart_url" ]; then
- chart_url="$public_chart_url"
- export CI_MEASUREMENT_PR_COMMENT_CHART_URL="$chart_url"
- else
- echo "::notice::unable to publish CI measurement chart PNG to public asset host"
- export CI_MEASUREMENT_PR_COMMENT_CHART_URL=""
- fi
- if [ -s "$chart_dark_png_file" ] && public_chart_dark_url="$(bash -c "$public_asset_command" _ "$chart_dark_png_file" png)" && [ -n "$public_chart_dark_url" ]; then
- chart_dark_url="$public_chart_dark_url"
- export CI_MEASUREMENT_PR_COMMENT_CHART_DARK_URL="$chart_dark_url"
- else
- echo "::notice::unable to publish dark CI measurement chart PNG to public asset host"
- export CI_MEASUREMENT_PR_COMMENT_CHART_DARK_URL=""
- fi
- if public_chart_source_url="$(bash -c "$public_asset_command" _ "$chart_file" svg)" && [ -n "$public_chart_source_url" ]; then
- chart_source_url="$public_chart_source_url"
- export CI_MEASUREMENT_PR_COMMENT_CHART_SOURCE_URL="$chart_source_url"
- else
- echo "::notice::unable to publish CI measurement chart SVG to public asset host"
- export CI_MEASUREMENT_PR_COMMENT_CHART_SOURCE_URL=""
- fi
- if [ "$require_public_asset" = "true" ] && [ -z "$chart_url" ]; then
- echo "::error::unable to publish CI measurement chart PNG to a public asset host for private repository $repo"
- exit 1
- fi
- if [ "$require_public_asset" = "true" ] && [ -s "$chart_dark_png_file" ] && [ -z "$chart_dark_url" ]; then
- echo "::error::unable to publish dark CI measurement chart PNG to a public asset host for private repository $repo"
- exit 1
- fi
- node "$renderer_script" "$comparison_file" "$comments_json" "$comment_body" "$comment_id_file" "$chart_file" "$chart_dark_file"
- fi
- fi
+ artifact_json="$(
+ "$GH_BIN" api "repos/$repo/actions/runs/$candidate_run/artifacts" \
+ | jq --arg artifactName "$BASELINE_ARTIFACT_NAME" '.artifacts
+ | map(select(.expired == false))
+ | map(select(.name == $artifactName or (.name | startswith($artifactName + "-"))))
+ | sort_by(.created_at // "")
+ | reverse
+ | .[0] // empty'
+ )"
- comment_id="$(cat "$comment_id_file")"
- comment_payload_file="$comment_body.payload.json"
- node -e "const fs=require('node:fs'); fs.writeFileSync(process.argv[2], JSON.stringify({ body: fs.readFileSync(process.argv[1], 'utf8') }))" "$comment_body" "$comment_payload_file"
- if [ -n "$comment_id" ]; then
- if ! gh api "repos/$repo/issues/comments/$comment_id" --method PATCH --input "$comment_payload_file" >/dev/null; then
- echo "::notice::unable to update CI measurement PR comment"
- fi
- else
- if ! gh api "repos/$repo/issues/$pr_number/comments" --method POST --input "$comment_payload_file" >/dev/null; then
- echo "::notice::unable to create CI measurement PR comment"
- fi
+ if [ -n "$artifact_json" ]; then
+ current_artifact_name="$(printf '%s' "$artifact_json" | jq -r '.name')"
+ current_artifact_id="$(printf '%s' "$artifact_json" | jq -r '.id')"
+ current_output_dir="$BASELINE_OUTPUT_DIR/run-$candidate_run"
+ mkdir -p "$current_output_dir"
+ if "$GH_BIN" run download "$candidate_run" \
+ --repo "$repo" \
+ --name "$current_artifact_name" \
+ --dir "$current_output_dir"; then
+ if [ -z "$run_id" ]; then
+ run_id="$candidate_run"
+ artifact_name="$current_artifact_name"
+ artifact_id="$current_artifact_id"
fi
+ jq -cn \
+ --arg runId "$candidate_run" \
+ --arg artifactName "$current_artifact_name" \
+ --arg artifactId "$current_artifact_id" \
+ --arg path "run-$candidate_run" \
+ '{runId:$runId, artifactName:$artifactName, artifactId:$artifactId, path:$path}' \
+ >>"$downloaded_runs_file"
+ else
+ echo "::notice::failed to download baseline artifact $current_artifact_name from run $candidate_run"
fi
fi
- fi
+ done
- if [ "$exit_code" -ne 0 ]; then
- exit "$exit_code"
- fi
+ write_baseline_observation_counts
- - name: 'Upload CI measurements: nix-closure-measurements'
- if: always()
- uses: actions/upload-artifact@v4
- with:
- name: nix-closure-measurements
- path: |
- tmp/nix-closure-ci
- !tmp/nix-closure-ci/baseline/**
- if-no-files-found: error
- retention-days: 30
- - name: Save pnpm state
- if: ${{ success() && steps.restore-pnpm-state.outputs.cache-hit != 'true' }}
- uses: actions/cache/save@v4
- with:
- path: |
- ${{ github.workspace }}/.pnpm-home
- ${{ runner.temp }}/pnpm-store/${{ github.job }}
- key: "pnpm-state-v1-${{ runner.os }}-${{ runner.arch }}-${{ hashFiles('**/pnpm-lock.yaml') }}"
- - name: Nix diagnostics summary
- if: failure()
- shell: bash
- run: |
- diag_dir="${NIX_STORE_DIAGNOSTICS_DIR:-}"
- if [ -z "$diag_dir" ] || [ ! -d "$diag_dir" ]; then
- echo "## Nix Store Diagnostics" >> "$GITHUB_STEP_SUMMARY"
- echo "" >> "$GITHUB_STEP_SUMMARY"
- echo "No diagnostics directory found (validation may have failed before capture)." >> "$GITHUB_STEP_SUMMARY"
+ if [ -z "$run_id" ] || [ -z "$artifact_name" ]; then
+ echo "::notice::no successful baseline run found for $repo workflow=$workflow branch=$branch"
exit 0
fi
- {
- echo "## Nix Store Diagnostics"
- echo ""
- echo "Temporary instrumentation for #272; remove after root cause is confirmed and CI is stable."
- echo ""
- echo "- Diagnostics directory: \`$diag_dir\`"
- echo "- Tracking issue: https://github.com/overengineeringstudio/effect-utils/issues/272"
- } >> "$GITHUB_STEP_SUMMARY"
+ jq -n \
+ --slurpfile runs "$downloaded_runs_file" \
+ --slurpfile seedRuns "$seed_runs_file" \
+ --slurpfile observationCounts "$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" \
+ --argjson schemaVersion 1 \
+ --arg repository "$repo" \
+ --arg workflow "$workflow" \
+ --arg branch "$branch" \
+ --arg runId "$run_id" \
+ --arg artifactName "$artifact_name" \
+ --arg artifactId "$artifact_id" \
+ '{
+ schemaVersion: $schemaVersion,
+ source: "github-actions-artifact",
+ repository: $repository,
+ workflow: $workflow,
+ branch: $branch,
+ runId: $runId,
+ artifactName: $artifactName,
+ artifactId: $artifactId,
+ seedRuns: ($seedRuns[0] // []),
+ runs: $runs,
+ observationCounts: ($observationCounts[0] // null)
+ }' >"$BASELINE_OUTPUT_DIR/baseline-provenance.json"
- markers_file="${RUNNER_TEMP:-/tmp}/nix-store-signature-markers.txt"
- grep -R -n -E "config\\.cachix|cachix\\.package|error: path '/nix/store/.+ is not valid" --exclude="$(basename "$markers_file")" "$diag_dir" > "$markers_file" || true
+ echo "Downloaded $(wc -l <"$downloaded_runs_file" | tr -d ' ') baseline artifact(s), latest $artifact_name from run $run_id into $BASELINE_OUTPUT_DIR"
- if [ -s "$markers_file" ]; then
- {
- echo ""
- echo "### Signature markers"
- echo '```text'
- head -n 120 "$markers_file"
- echo '```'
- } >> "$GITHUB_STEP_SUMMARY"
- else
- echo "" >> "$GITHUB_STEP_SUMMARY"
- echo "- No signature markers found in captured diagnostics." >> "$GITHUB_STEP_SUMMARY"
- fi
- - name: Upload Nix diagnostics artifact
- if: failure() && env.NIX_STORE_DIAGNOSTICS_DIR != ''
- uses: actions/upload-artifact@v4
- with:
- name: 'nix-store-diagnostics-${{ github.job }}-${{ runner.os }}-run-${{ github.run_id }}-attempt-${{ github.run_attempt }}'
- path: ${{ env.NIX_STORE_DIAGNOSTICS_DIR }}
- if-no-files-found: ignore
- retention-days: 14
- - name: Failure note
- if: failure()
- shell: bash
- run: |
- echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:"
- echo " https://github.com/overengineeringstudio/effect-utils/issues/201"
- concurrency:
- group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-nix-closure-sizes"
- cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
- source-shape:
- runs-on:
- [namespace-profile-linux-x86-64, 'namespace-features:github.run-id=${{ github.run_id }}']
- timeout-minutes: 30
- defaults:
- run:
- shell: bash
- permissions:
- actions: read
- contents: write
- issues: write
- pull-requests: write
- env:
- CI_MEASUREMENT_SUBJECT_REF: ${{ inputs.measurement_baseline_ref || github.event.pull_request.head.ref || github.ref }}
- CI_MEASUREMENT_SUBJECT_SHA: ${{ inputs.measurement_baseline_ref || github.event.pull_request.head.sha || github.sha }}
- CI_MEASUREMENT_SUBJECT_LABEL: ${{ inputs.measurement_baseline_label }}
- CI_MEASUREMENT_ALLOW_PROBE_FAILURES: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && '1' || '' }}
- steps:
- - uses: actions/checkout@v6
- - name: Checkout CI measurement baseline ref
- if: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' }}
- uses: actions/checkout@v6
- with:
- ref: ${{ inputs.measurement_baseline_ref }}
- name: 'Download previous artifact: source-shape'
shell: bash
env:
GH_TOKEN: ${{ github.token }}
BASELINE_ARTIFACT_NAME: source-shape
- BASELINE_OUTPUT_DIR: tmp/source-shape-ci/baseline
+ BASELINE_OUTPUT_DIR: tmp/ci-measurement-report/baseline/source-shape
BASELINE_WORKFLOW_NAME: ${{ github.workflow }}
BASELINE_BRANCH: ${{ github.base_ref || github.ref_name }}
BASELINE_SEED_RUNS_JSON: '[{"runId":"26085158592","label":"main baseline","sha":"ce7cf8f8ebfaa1da6c7e9122cd195a5f95ce2fca","source":"manual-backfill","artifacts":["source-shape"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."}]'
@@ -7502,198 +6763,16 @@ jobs:
echo "Downloaded $(wc -l <"$downloaded_runs_file" | tr -d ' ') baseline artifact(s), latest $artifact_name from run $run_id into $BASELINE_OUTPUT_DIR"
- if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
- - name: 'Measure source shape: effect-utils'
- shell: bash
- env:
- ARTIFACT_DIR: tmp/source-shape-ci/current/effect-utils
- RUNNER_CLASS: '${{ runner.os }}-${{ runner.arch }}'
- run: |
- set -euo pipefail
-
- mkdir -p "$ARTIFACT_DIR"
- target_id='effect_utils'
- target_name='effect-utils'
- target_label='effect-utils repository'
- target_group='source'
- artifact_file="$ARTIFACT_DIR/measurements.json"
- target_system="${DEVENV_SYSTEM:-${RUNNER_OS:-unknown}}"
-
- SCOPES_JSON='[{"id":"genie_ci_workflow","label":"Genie CI workflow helpers","group":"source / ci","path":["source","effect-utils","genie","ci-workflow"],"includePaths":["genie/ci-workflow",".github/workflows/ci.yml.genie.ts"],"includeExtensions":[".ts"]},{"id":"genie_runtime","label":"Genie runtime","group":"source / genie","path":["source","effect-utils","packages","genie"],"includePaths":["packages/@overeng/genie/src"],"includeExtensions":[".ts",".tsx"]},{"id":"nix_workspace_tools","label":"Nix workspace tools","group":"source / nix","path":["source","effect-utils","nix","workspace-tools"],"includePaths":["nix/workspace-tools"],"includeExtensions":[".nix"]}]' \
- TARGET_PATH_JSON='["source","effect-utils"]' \
- TARGET_ID="$target_id" \
- TARGET_NAME="$target_name" \
- TARGET_LABEL="$target_label" \
- TARGET_GROUP="$target_group" \
- TARGET_SYSTEM="$target_system" \
- node <<'NODE' >"$artifact_file"
- const cp = require('node:child_process')
- const fs = require('node:fs')
- const path = require('node:path')
-
- const normalize = (value) => {
- const normalized = value.replace(/\\/g, '/').replace(/^\.\//, '').replace(/\/+$/, '')
- return normalized === '.' ? '' : normalized
- }
- const scopes = JSON.parse(process.env.SCOPES_JSON || '[]')
- const targetPath = JSON.parse(process.env.TARGET_PATH_JSON || '["source"]')
- const gitFiles = cp
- .execFileSync('git', ['ls-files', '-z'], { encoding: 'buffer' })
- .toString('utf8')
- .split('\0')
- .filter(Boolean)
- .map(normalize)
-
- const includesPath = (file, candidates) => {
- if (!Array.isArray(candidates) || candidates.length === 0) return true
- return candidates.map(normalize).some((candidate) => candidate === '' || file === candidate || file.startsWith(candidate + '/'))
- }
-
- const excludesPath = (file, candidates) =>
- Array.isArray(candidates) &&
- candidates.map(normalize).some((candidate) => candidate !== '' && (file === candidate || file.startsWith(candidate + '/')))
-
- const matchesExtension = (file, extensions) => {
- if (!Array.isArray(extensions) || extensions.length === 0) return true
- const ext = path.extname(file).toLowerCase()
- return extensions.map((extension) => extension.toLowerCase()).some((extension) => ext === extension)
- }
-
- const countLines = (file) => {
- const buffer = fs.readFileSync(file)
- if (buffer.includes(0)) return undefined
- if (buffer.length === 0) return 0
- let lines = 0
- for (const byte of buffer) {
- if (byte === 10) lines += 1
- }
- return buffer[buffer.length - 1] === 10 ? lines : lines + 1
- }
-
- const observations = []
- const scopeSummaries = []
-
- for (const scope of scopes) {
- const root = normalize(scope.root || '.')
- const includePaths = Array.isArray(scope.includePaths) && scope.includePaths.length > 0 ? scope.includePaths : [root]
- const files = gitFiles
- .filter((file) => includesPath(file, includePaths))
- .filter((file) => !excludesPath(file, scope.excludePaths))
- .filter((file) => matchesExtension(file, scope.includeExtensions))
-
- let lineCount = 0
- let measuredFileCount = 0
- for (const file of files) {
- const lines = countLines(file)
- if (lines === undefined) continue
- lineCount += lines
- measuredFileCount += 1
- }
-
- const group = scope.group || 'source shape'
- const scopePath = Array.isArray(scope.path) ? scope.path : ['source', scope.id]
- const policy = scope.gate || { enabled: false, minBaselineSources: 3, minCurrentSamples: 1 }
- observations.push(
- {
- id: 'source.lines',
- label: scope.label + ' lines',
- group,
- path: scopePath,
- description: 'Tracked non-binary source lines in the configured scope.',
- measurementKind: 'deterministic',
- name: 'source.lines',
- unit: 'lines',
- value: lineCount,
- dimensions: { scope: scope.id },
- policy,
- statistics: { sampleCount: 1, measuredSampleCount: measuredFileCount },
- },
- {
- id: 'source.files',
- label: scope.label + ' files',
- group,
- path: scopePath,
- description: 'Tracked non-binary source files in the configured scope.',
- measurementKind: 'deterministic',
- name: 'source.files',
- unit: 'count',
- value: measuredFileCount,
- dimensions: { scope: scope.id },
- policy,
- statistics: { sampleCount: 1, measuredSampleCount: measuredFileCount },
- },
- )
- scopeSummaries.push({
- id: scope.id,
- label: scope.label,
- root,
- includePaths,
- excludePaths: scope.excludePaths || [],
- includeExtensions: scope.includeExtensions || [],
- fileCount: measuredFileCount,
- lineCount,
- })
- }
-
- const artifact = {
- schemaVersion: 1,
- generatedAt: new Date().toISOString().replace(/\.\d{3}Z$/, 'Z'),
- producer: {
- name: 'effect-utils-ci-measurement',
- version: 1,
- measurementProtocol: 'source-shape-v1',
- },
- subject: {
- repo: process.env.GITHUB_REPOSITORY || 'unknown',
- branchKind: process.env.GITHUB_EVENT_NAME || 'unknown',
- ref: process.env.CI_MEASUREMENT_SUBJECT_REF || process.env.GITHUB_REF || 'unknown',
- headSha: process.env.CI_MEASUREMENT_SUBJECT_SHA || process.env.GITHUB_SHA || 'unknown',
- baseSha: process.env.GITHUB_BASE_SHA || '',
- },
- execution: {
- provider: process.env.GITHUB_RUN_ID && process.env.GITHUB_RUN_ID !== 'unknown' ? 'github-actions' : 'local',
- workflow: 'CI',
- job: process.env.GITHUB_JOB || 'unknown',
- runId: process.env.GITHUB_RUN_ID || 'unknown',
- runAttempt: process.env.GITHUB_RUN_ATTEMPT || 'unknown',
- taskId: process.env.CROSSTASK_TASK_ID || '',
- attemptId: process.env.CROSSTASK_ATTEMPT_ID || '',
- traceId: process.env.TRACE_ID || '',
- runner: {
- name: process.env.RUNNER_NAME || 'unknown',
- os: process.env.RUNNER_OS || 'unknown',
- arch: process.env.RUNNER_ARCH || 'unknown',
- class: process.env.RUNNER_CLASS || 'unknown',
- },
- },
- target: {
- kind: 'source-shape',
- id: process.env.TARGET_ID,
- name: process.env.TARGET_NAME,
- label: process.env.TARGET_LABEL,
- group: process.env.TARGET_GROUP,
- path: targetPath,
- system: process.env.TARGET_SYSTEM,
- },
- observations,
- details: { scopes: scopeSummaries },
- }
-
- process.stdout.write(JSON.stringify(artifact, null, 2) + '\n')
- NODE
-
- cat "$artifact_file"
-
- name: Compare CI measurements with baseline
shell: bash
env:
- CI_MEASUREMENT_CURRENT_DIR: tmp/source-shape-ci/current
- CI_MEASUREMENT_BASELINE_DIR: tmp/source-shape-ci/baseline
- CI_MEASUREMENT_COMPARISON_FILE: tmp/source-shape-ci/measurement-comparison.json
+ CI_MEASUREMENT_CURRENT_DIR: tmp/ci-measurement-report/current
+ CI_MEASUREMENT_BASELINE_DIR: tmp/ci-measurement-report/baseline
+ CI_MEASUREMENT_COMPARISON_FILE: tmp/ci-measurement-report/measurement-comparison.json
CI_MEASUREMENT_REGRESSION_MODE: warn
CI_MEASUREMENT_PR_COMMENT_ENABLED: 'true'
- CI_MEASUREMENT_PR_COMMENT_TITLE: Source Shape Measurements
- CI_MEASUREMENT_PR_COMMENT_MAX_ROWS: '12'
+ CI_MEASUREMENT_PR_COMMENT_TITLE: CI Measurements
+ CI_MEASUREMENT_PR_COMMENT_MAX_ROWS: '16'
CI_MEASUREMENT_PR_COMMENT_MAX_HISTORY: '20'
CI_MEASUREMENT_PR_COMMENT_ASSET_BRANCH: ci-measurement-assets
GH_TOKEN: ${{ github.token }}
@@ -9244,23 +8323,23 @@ jobs:
fi
fi
+
if [ "$exit_code" -ne 0 ]; then
exit "$exit_code"
fi
- if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
- - name: 'Upload CI measurements: source-shape'
+ - name: 'Upload CI measurements: ci-measurements-report'
if: always()
uses: actions/upload-artifact@v4
with:
- name: source-shape
+ name: ci-measurements-report
path: |
- tmp/source-shape-ci
- !tmp/source-shape-ci/baseline/**
+ tmp/ci-measurement-report
+ !tmp/ci-measurement-report/baseline/**
if-no-files-found: error
retention-days: 30
concurrency:
- group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-source-shape"
+ group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-ci-measurements-report"
cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
test-integration-notion:
if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
diff --git a/.github/workflows/ci.yml.genie.ts b/.github/workflows/ci.yml.genie.ts
index f3dcb128a..016fa0b20 100644
--- a/.github/workflows/ci.yml.genie.ts
+++ b/.github/workflows/ci.yml.genie.ts
@@ -280,6 +280,29 @@ const jobs: Record | ReturnType
+ ({
+ name: `Download current measurement artifact: ${artifactName}`,
+ uses: 'actions/download-artifact@v4',
+ with: {
+ name: artifactName,
+ path: outputDir,
+ },
+ }) as const
+
+const ciMeasurementReportToolStep = {
+ name: 'Provide CI measurement report tools',
+ shell: 'bash',
+ run: [
+ 'set -euo pipefail',
+ 'for out in $(nix build --no-link --print-out-paths nixpkgs#jq nixpkgs#nodejs nixpkgs#gh nixpkgs#resvg); do',
+ ' echo "$out/bin" >> "$GITHUB_PATH"',
+ 'done',
+ ].join('\n'),
+} as const
+
const nixClosureMeasurementTargets = [
{
installable: '.#genie',
@@ -434,7 +457,7 @@ const extraJobs: Record = {
],
permissions: ciMeasurementsCommentPermissions,
prComment: {
- enabled: true,
+ enabled: false,
title: 'Devenv Performance',
maxRows: 8,
maxHistory: 20,
@@ -462,7 +485,7 @@ const extraJobs: Record = {
buckets: defaultNixClosureMeasurementBuckets,
regressionMode: 'warn',
prComment: {
- enabled: true,
+ enabled: false,
title: 'Nix Closure Measurements',
maxRows: 8,
maxHistory: 20,
@@ -546,7 +569,7 @@ const extraJobs: Record = {
outputFile: `${sourceShapeMeasurementsDir}/measurement-comparison.json`,
regressionMode: 'warn',
prComment: {
- enabled: true,
+ enabled: false,
title: 'Source Shape Measurements',
maxRows: 12,
maxHistory: 20,
@@ -560,6 +583,78 @@ const extraJobs: Record = {
}),
],
},
+ 'ci-measurements-report': {
+ name: 'ci/measurements-report',
+ if: normalCiIf,
+ needs: ['devenv-perf', 'nix-closure-sizes', 'source-shape'],
+ 'runs-on': namespaceRunner({
+ profile: 'namespace-profile-linux-x86-64',
+ runId: '${{ github.run_id }}',
+ }),
+ 'timeout-minutes': jobTimeoutMinutes,
+ defaults: bashShellDefaults,
+ permissions: ciMeasurementsCommentPermissions,
+ env: ciMeasurementSubjectEnv,
+ steps: [
+ checkoutStep(),
+ installNixStep(),
+ ciMeasurementReportToolStep,
+ downloadCurrentMeasurementArtifactStep(
+ 'devenv-perf',
+ `${ciMeasurementReportDir}/current/devenv-perf`,
+ ),
+ downloadCurrentMeasurementArtifactStep(
+ 'nix-closure-measurements',
+ `${ciMeasurementReportDir}/current/nix-closure-measurements`,
+ ),
+ downloadCurrentMeasurementArtifactStep(
+ 'source-shape',
+ `${ciMeasurementReportDir}/current/source-shape`,
+ ),
+ downloadPreviousGitHubArtifactStep({
+ artifactName: 'devenv-perf',
+ outputDir: `${ciMeasurementReportDir}/baseline/devenv-perf`,
+ maxRuns: 20,
+ }),
+ downloadPreviousGitHubArtifactStep({
+ artifactName: 'nix-closure-measurements',
+ outputDir: `${ciMeasurementReportDir}/baseline/nix-closure-measurements`,
+ maxRuns: 20,
+ }),
+ downloadPreviousGitHubArtifactStep({
+ artifactName: 'source-shape',
+ outputDir: `${ciMeasurementReportDir}/baseline/source-shape`,
+ seedRuns: [
+ {
+ runId: '26085158592',
+ label: 'main baseline',
+ sha: 'ce7cf8f8ebfaa1da6c7e9122cd195a5f95ce2fca',
+ source: 'manual-backfill',
+ artifacts: ['source-shape'],
+ notes:
+ 'Backfilled with the current measurement workflow for the effect-utils #658 rollout.',
+ },
+ ],
+ maxRuns: 20,
+ }),
+ compareCiMeasurementsStep({
+ currentDir: `${ciMeasurementReportDir}/current`,
+ baselineDir: `${ciMeasurementReportDir}/baseline`,
+ outputFile: `${ciMeasurementReportDir}/measurement-comparison.json`,
+ regressionMode: 'warn',
+ prComment: {
+ enabled: true,
+ title: 'CI Measurements',
+ maxRows: 16,
+ maxHistory: 20,
+ },
+ }),
+ ciMeasurementsArtifactStep({
+ artifactName: 'ci-measurements-report',
+ path: ciMeasurementReportDir,
+ }),
+ ],
+ },
/** Integration tests for Notion API (requires NOTION_TOKEN secret) */
'test-integration-notion': {
if: normalCiIf,
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index da7216dec..2e6816d9a 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -2343,7 +2343,7 @@ if [ -n "${dollar}{GITHUB_STEP_SUMMARY:-}" ]; then
} >>"$GITHUB_STEP_SUMMARY"
fi
-if [ "${dollar}{CI_MEASUREMENT_PR_COMMENT_ENABLED:-false}" = "true" ]; then
+${opts?.prComment?.enabled === true ? String.raw`if [ "${dollar}{CI_MEASUREMENT_PR_COMMENT_ENABLED:-false}" = "true" ]; then
if [ "${dollar}{GITHUB_EVENT_NAME:-}" != "pull_request" ]; then
echo "::notice::CI measurement PR comments are produced only by pull_request workflows; skipping comment for event ${dollar}{GITHUB_EVENT_NAME:-unknown}"
exit 0
@@ -3402,6 +3402,7 @@ EOF
fi
fi
fi
+` : ''}
if [ "$exit_code" -ne 0 ]; then
exit "$exit_code"
From 6002014b4ab623acce42902d99de12d9dd5c02c4 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Wed, 20 May 2026 11:21:18 +0200
Subject: [PATCH 68/81] Format CI workflow helper test
---
.../github-workflow/ci-workflow-helpers.unit.test.ts | 10 +++-------
1 file changed, 3 insertions(+), 7 deletions(-)
diff --git a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
index 8ba661a1a..70073acf9 100644
--- a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
+++ b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
@@ -497,9 +497,7 @@ describe('ci workflow devenv perf helpers', () => {
expect(ciWorkflowSource).toContain(
'CI measurement PR comments are produced only by pull_request workflows',
)
- expect(ciWorkflowSource).toContain(
- 'unable to publish required CI measurement PR comment',
- )
+ expect(ciWorkflowSource).toContain('unable to publish required CI measurement PR comment')
expect(ciWorkflowSource).toContain('seedRuns: ($seedRuns[0] // [])')
expect(ciWorkflowSource).toContain('baselineProvenance: ($baselineProvenance[0] // null)')
expect(ciWorkflowSource).toContain(
@@ -516,7 +514,7 @@ describe('ci workflow devenv perf helpers', () => {
'No regressions. Comparable movement is below the semantic impact threshold; neutral rows are collapsed below.',
)
expect(generatedCiWorkflowYamlSource).toContain(
- "github.workflow }}-${{ github.event_name }}-${{ github.ref }}",
+ 'github.workflow }}-${{ github.event_name }}-${{ github.ref }}',
)
expect(generatedCiWorkflowYamlSource).not.toMatch(/^concurrency:/m)
expect(generatedCiWorkflowYamlSource).toContain('concurrency:\n group:')
@@ -538,9 +536,7 @@ describe('ci workflow devenv perf helpers', () => {
expect(ciWorkflowSource).toContain('Unchanged / 0-impact measurements (')
expect(ciWorkflowSource).toContain('Source-of-truth JSON
')
expect(ciWorkflowSource).toContain('const sourceOfTruth = {')
- expect(ciWorkflowSource).toContain(
- 'No non-zero actionable measurement impact detected.',
- )
+ expect(ciWorkflowSource).toContain('No non-zero actionable measurement impact detected.')
expect(ciWorkflowSource).toContain('readiness ')
expect(ciWorkflowSource).toContain('renderPerfChangeSvg')
expect(ciWorkflowSource).toContain('Actionable measurement impact')
From f878f6c1d600eb82457b3e03fa4a94864c4146a2 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Wed, 20 May 2026 11:29:01 +0200
Subject: [PATCH 69/81] Split CI measurement production from reporting
---
.github/workflows/ci.yml | 3873 +++++++----------------------
.github/workflows/ci.yml.genie.ts | 36 +-
genie/ci-workflow/measurements.ts | 68 +-
3 files changed, 910 insertions(+), 3067 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c528a3d2f..2dfeb2475 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2429,214 +2429,6 @@ jobs:
EOF
echo "::warning::Intentional failure for diagnostics validation (#272)"
exit 1
- - name: 'Download previous artifact: devenv-perf'
- shell: bash
- env:
- GH_TOKEN: ${{ github.token }}
- BASELINE_ARTIFACT_NAME: devenv-perf
- BASELINE_OUTPUT_DIR: tmp/devenv-perf-ci/baseline
- BASELINE_WORKFLOW_NAME: ${{ github.workflow }}
- BASELINE_BRANCH: ${{ github.base_ref || github.ref_name }}
- BASELINE_SEED_RUNS_JSON: '[{"runId":"25959801150","label":"PR #655","sha":"df0420cd0397ffc6928d3c6ccc9c23052d6bc255","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959802067","label":"PR #657","sha":"62833cba5d83b1c13462728edeafa684e61c006f","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959802958","label":"PR #656","sha":"21029998522a0e9435df151259611650fb948a20","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959803805","label":"PR #651","sha":"95515f971b27ef279e39c982f52e46cf9e8270e9","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959804678","label":"PR #654","sha":"58e96b9a2b87b3703de6920b6d9571f3805d0171","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959805512","label":"PR #653","sha":"d1cca16339f19d7e1a27b001edc4c2c7ecd13dc4","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959806473","label":"PR #652","sha":"acd6c63f5e235e7e5f2710fc62b2231e0ba904a6","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959807303","label":"PR #648","sha":"a5a07703ff951fb7396a40844e9491d88ed40edf","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959808097","label":"PR 
#649","sha":"360ff47c59a206064711dfcb6c610afd0e6b0d53","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959808775","label":"PR #647","sha":"8d1810b2c359ae95f245e56329018aab5020f8c0","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959809449","label":"PR #646","sha":"89e1396766ccd2a813680acd440cb78f540ca6c1","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959810069","label":"PR #643","sha":"239715520370436901a3f2218d162dc7b12f4b4c","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959810666","label":"PR #641","sha":"6b3751b4684ba45f496f1a1bff8b86ef6ba8275b","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959811321","label":"PR #640","sha":"fed50ae2502ac0a65395bbef5af43fcf384d5d04","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959811864","label":"PR #639","sha":"0e03df2c6f20e4d154f286fd69a4e2980d21a12d","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959812634","label":"PR #636","sha":"7efdbee4b571f2c80f5b6173bc9a84b51fbef5eb","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959813189","label":"PR 
#638","sha":"350d1b98baa943dcae63412eeffded7b5160bc8a","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959813761","label":"PR #637","sha":"f25336193b9f6b042eb027eca27acc4cc75a69d6","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959814335","label":"PR #634","sha":"4ba441d4ad8b6c49e9ee03d9cdfd2f04a129b714","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."},{"runId":"25959814835","label":"PR #632","sha":"1ad5fd735c7f45ad5e07c8033e5b68a642ada69c","source":"manual-backfill","artifacts":["devenv-perf"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."}]'
- BASELINE_MAX_RUNS: '20'
- BASELINE_MAX_CANDIDATE_RUNS: '60'
- BASELINE_REQUIRED_OBSERVATIONS_JSON: '[{"id":"devenv.shell_eval_warm.duration","minSources":10},{"id":"devenv.tasks_list.duration","minSources":10},{"id":"devenv.processes_help.duration","minSources":10},{"id":"devenv.task_pnpm_install.duration","minSources":10},{"id":"devenv.task_genie_run.duration","minSources":10},{"id":"devenv.task_check_quick_warm.duration","minSources":10},{"id":"devenv.task_check_quick_forced.duration","minSources":10},{"id":"devenv.genie_check_direct.duration","minSources":10}]'
- run: |
- set -euo pipefail
-
- mkdir -p "$BASELINE_OUTPUT_DIR"
-
- if command -v gh >/dev/null 2>&1; then
- GH_BIN="$(command -v gh)"
- else
- echo "::notice::gh is not on PATH; resolving GitHub CLI through Nix"
- if ! GH_BIN="$(nix build --no-link --print-out-paths nixpkgs#gh 2>/dev/null)/bin/gh"; then
- echo "::notice::unable to resolve GitHub CLI through Nix; skipping previous artifact download"
- exit 0
- fi
- fi
- echo "Using GitHub CLI: $GH_BIN"
-
- repo="${GITHUB_REPOSITORY:?GITHUB_REPOSITORY not set}"
- workflow="${BASELINE_WORKFLOW_NAME:-CI}"
- branch="${BASELINE_BRANCH:-${GITHUB_BASE_REF:-${GITHUB_REF_NAME:-main}}}"
- seed_runs_file="$BASELINE_OUTPUT_DIR/baseline-seed-runs.json"
- required_observations_file="$BASELINE_OUTPUT_DIR/baseline-required-observations.json"
- printf '%s' "${BASELINE_SEED_RUNS_JSON:-[]}" >"$seed_runs_file"
- printf '%s' "${BASELINE_REQUIRED_OBSERVATIONS_JSON:-[]}" >"$required_observations_file"
- if ! jq -e 'if type == "array" then all(.[]; type == "object" and (.runId | type == "string")) else false end' \
- "$seed_runs_file" >/dev/null; then
- echo "::error::BASELINE_SEED_RUNS_JSON must be an array of objects with string runId fields"
- exit 1
- fi
- if ! jq -e 'if type == "array" then all(.[]; type == "object" and (.id | type == "string") and (.minSources | type == "number")) else false end' \
- "$required_observations_file" >/dev/null; then
- echo "::error::BASELINE_REQUIRED_OBSERVATIONS_JSON must be an array of objects with string id and numeric minSources fields"
- exit 1
- fi
- seed_run_ids="$(jq -r '.[].runId' "$seed_runs_file")"
- required_observation_count="$(jq 'length' "$required_observations_file")"
- max_candidate_runs="${BASELINE_MAX_CANDIDATE_RUNS:-${BASELINE_MAX_RUNS:-5}}"
- if ! [[ "$max_candidate_runs" =~ ^[0-9]+$ ]] || [ "$max_candidate_runs" -lt 1 ]; then
- max_candidate_runs=1
- fi
-
- candidate_runs="$(
- "$GH_BIN" run list \
- --repo "$repo" \
- --workflow "$workflow" \
- --branch "$branch" \
- --event push \
- --status success \
- --json databaseId,headSha \
- --limit "$max_candidate_runs" \
- --jq '[.[] | select(.headSha != env.GITHUB_SHA) | .databaseId] | .[]'
- )"
-
- candidate_runs="$seed_run_ids
- $candidate_runs"
-
- max_runs="${BASELINE_MAX_RUNS:-5}"
- if ! [[ "$max_runs" =~ ^[0-9]+$ ]] || [ "$max_runs" -lt 1 ]; then
- max_runs=1
- fi
-
- write_baseline_observation_counts() {
- local measurement_index="$BASELINE_OUTPUT_DIR/baseline-measurement-files.txt"
- local counts_file="$BASELINE_OUTPUT_DIR/baseline-observation-counts.json"
- find "$BASELINE_OUTPUT_DIR" \
- -mindepth 2 \
- -maxdepth 2 \
- -name measurements.json \
- -type f \
- -print \
- | sort >"$measurement_index" || true
-
- if [ -s "$measurement_index" ]; then
- xargs -r jq -s \
- --slurpfile required "$required_observations_file" \
- '
- ([.[] | (.observations // [])[]? | select(.value | type == "number") | .id] | sort | group_by(.) | map({id: .[0], sources: length})) as $counts
- | ($required[0] // []) as $requiredRows
- | {
- counts: $counts,
- required: (
- $requiredRows
- | map(. as $requiredRow | ($counts | map(select(.id == $requiredRow.id)) | .[0].sources // 0) as $actual | $requiredRow + {sources:$actual, satisfied:($actual >= $requiredRow.minSources)})
- )
- }
- ' <"$measurement_index" >"$counts_file"
- else
- jq -n --slurpfile required "$required_observations_file" \
- '{counts: [], required: (($required[0] // []) | map(. + {sources:0, satisfied:false}))}' >"$counts_file"
- fi
- }
-
- baseline_requirements_satisfied() {
- if [ "$required_observation_count" -eq 0 ]; then
- return 1
- fi
- write_baseline_observation_counts
- jq -e '.required | all(.satisfied == true)' "$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" >/dev/null
- }
-
- run_id=""
- artifact_name=""
- artifact_id=""
- downloaded_runs_file="$BASELINE_OUTPUT_DIR/baseline-runs.jsonl"
- seen_runs_file="$BASELINE_OUTPUT_DIR/baseline-seen-runs.txt"
- : >"$downloaded_runs_file"
- : >"$seen_runs_file"
- for candidate_run in $candidate_runs; do
- if [ -z "$candidate_run" ]; then
- continue
- fi
- if grep -qxF "$candidate_run" "$seen_runs_file"; then
- continue
- fi
- downloaded_count="$(wc -l <"$downloaded_runs_file" | tr -d ' ')"
- if [ "$downloaded_count" -ge "$max_runs" ]; then
- if baseline_requirements_satisfied; then
- break
- fi
- echo "::notice::downloaded $downloaded_count baseline artifact(s), but required observation counts are not satisfied yet; continuing through bounded candidate history"
- fi
- if [ "$(wc -l <"$seen_runs_file" | tr -d ' ')" -ge "$max_candidate_runs" ]; then
- break
- fi
- printf '%s\n' "$candidate_run" >>"$seen_runs_file"
-
- artifact_json="$(
- "$GH_BIN" api "repos/$repo/actions/runs/$candidate_run/artifacts" \
- | jq --arg artifactName "$BASELINE_ARTIFACT_NAME" '.artifacts
- | map(select(.expired == false))
- | map(select(.name == $artifactName or (.name | startswith($artifactName + "-"))))
- | sort_by(.created_at // "")
- | reverse
- | .[0] // empty'
- )"
-
- if [ -n "$artifact_json" ]; then
- current_artifact_name="$(printf '%s' "$artifact_json" | jq -r '.name')"
- current_artifact_id="$(printf '%s' "$artifact_json" | jq -r '.id')"
- current_output_dir="$BASELINE_OUTPUT_DIR/run-$candidate_run"
- mkdir -p "$current_output_dir"
- if "$GH_BIN" run download "$candidate_run" \
- --repo "$repo" \
- --name "$current_artifact_name" \
- --dir "$current_output_dir"; then
- if [ -z "$run_id" ]; then
- run_id="$candidate_run"
- artifact_name="$current_artifact_name"
- artifact_id="$current_artifact_id"
- fi
- jq -cn \
- --arg runId "$candidate_run" \
- --arg artifactName "$current_artifact_name" \
- --arg artifactId "$current_artifact_id" \
- --arg path "run-$candidate_run" \
- '{runId:$runId, artifactName:$artifactName, artifactId:$artifactId, path:$path}' \
- >>"$downloaded_runs_file"
- else
- echo "::notice::failed to download baseline artifact $current_artifact_name from run $candidate_run"
- fi
- fi
- done
-
- write_baseline_observation_counts
-
- if [ -z "$run_id" ] || [ -z "$artifact_name" ]; then
- echo "::notice::no successful baseline run found for $repo workflow=$workflow branch=$branch"
- exit 0
- fi
-
- jq -n \
- --slurpfile runs "$downloaded_runs_file" \
- --slurpfile seedRuns "$seed_runs_file" \
- --slurpfile observationCounts "$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" \
- --argjson schemaVersion 1 \
- --arg repository "$repo" \
- --arg workflow "$workflow" \
- --arg branch "$branch" \
- --arg runId "$run_id" \
- --arg artifactName "$artifact_name" \
- --arg artifactId "$artifact_id" \
- '{
- schemaVersion: $schemaVersion,
- source: "github-actions-artifact",
- repository: $repository,
- workflow: $workflow,
- branch: $branch,
- runId: $runId,
- artifactName: $artifactName,
- artifactId: $artifactId,
- seedRuns: ($seedRuns[0] // []),
- runs: $runs,
- observationCounts: ($observationCounts[0] // null)
- }' >"$BASELINE_OUTPUT_DIR/baseline-provenance.json"
-
- echo "Downloaded $(wc -l <"$downloaded_runs_file" | tr -d ' ') baseline artifact(s), latest $artifact_name from run $run_id into $BASELINE_OUTPUT_DIR"
-
- name: Benchmark devenv surfaces
shell: bash
run: |
@@ -3237,2841 +3029,908 @@ jobs:
cat "$ARTIFACT_DIR/timings.pretty.json"
- - name: Compare CI measurements with baseline
+ - name: Upload devenv perf artifacts
+ if: always()
+ uses: actions/upload-artifact@v4
+ with:
+ name: devenv-perf
+ path: |
+ tmp/devenv-perf-ci
+ !tmp/devenv-perf-ci/baseline/**
+ if-no-files-found: error
+ retention-days: 30
+ timeout-minutes: 30
+ concurrency:
+ group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-devenv-perf"
+ cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
+ nix-closure-sizes:
+ if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
+ runs-on:
+ [namespace-profile-linux-x86-64, 'namespace-features:github.run-id=${{ github.run_id }}']
+ timeout-minutes: 30
+ defaults:
+ run:
shell: bash
- env:
- CI_MEASUREMENT_CURRENT_DIR: tmp/devenv-perf-ci
- CI_MEASUREMENT_BASELINE_DIR: tmp/devenv-perf-ci/baseline
- CI_MEASUREMENT_COMPARISON_FILE: tmp/devenv-perf-ci/measurement-comparison.json
- CI_MEASUREMENT_REGRESSION_MODE: warn
- CI_MEASUREMENT_PR_COMMENT_ENABLED: 'false'
- CI_MEASUREMENT_PR_COMMENT_TITLE: Devenv Performance
- CI_MEASUREMENT_PR_COMMENT_MAX_ROWS: '8'
- CI_MEASUREMENT_PR_COMMENT_MAX_HISTORY: '20'
- CI_MEASUREMENT_PR_COMMENT_ASSET_BRANCH: ci-measurement-assets
- run: |
- set -euo pipefail
-
- export PATH="/run/current-system/sw/bin:/usr/bin:/bin:$PATH"
-
- current_dir="${CI_MEASUREMENT_CURRENT_DIR:?CI_MEASUREMENT_CURRENT_DIR not set}"
- baseline_dir="${CI_MEASUREMENT_BASELINE_DIR:?CI_MEASUREMENT_BASELINE_DIR not set}"
- comparison_file="${CI_MEASUREMENT_COMPARISON_FILE:?CI_MEASUREMENT_COMPARISON_FILE not set}"
- mode="${CI_MEASUREMENT_REGRESSION_MODE:-warn}"
- mkdir -p "$(dirname "$comparison_file")"
-
- if [ "$mode" = "off" ]; then
- jq -n --argjson schemaVersion 1 --arg status skipped --arg mode "$mode" \
- '{schemaVersion:$schemaVersion,status:$status,mode:$mode,comparisons:{}}' \
- >"$comparison_file"
- exit 0
- fi
-
- current_index="$(mktemp)"
- baseline_index="$(mktemp)"
- find "$current_dir" -name baseline -type d -prune -o -name measurements.json -type f -print | sort >"$current_index" || true
- {
- find "$baseline_dir" -name baseline -type d ! -path "$baseline_dir" -prune -o -name measurements.json -type f -print
- } | sort -u >"$baseline_index" || true
-
- if [ ! -s "$current_index" ]; then
- echo "::error::no current measurements.json files found under $current_dir"
+ permissions:
+ actions: read
+ contents: write
+ issues: write
+ pull-requests: write
+ env:
+ CI_MEASUREMENT_SUBJECT_REF: ${{ inputs.measurement_baseline_ref || github.event.pull_request.head.ref || github.ref }}
+ CI_MEASUREMENT_SUBJECT_SHA: ${{ inputs.measurement_baseline_ref || github.event.pull_request.head.sha || github.sha }}
+ CI_MEASUREMENT_SUBJECT_LABEL: ${{ inputs.measurement_baseline_label }}
+ CI_MEASUREMENT_ALLOW_PROBE_FAILURES: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && '1' || '' }}
+ steps:
+ - uses: actions/checkout@v6
+ - name: Checkout CI measurement baseline ref
+ if: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' }}
+ uses: actions/checkout@v6
+ with:
+ ref: ${{ inputs.measurement_baseline_ref }}
+ - name: Install Nix
+ uses: DeterminateSystems/determinate-nix-action@v3
+ with:
+ extra-conf: |
+ experimental-features = nix-command flakes
+ accept-flake-config = true
+ extra-substituters = https://devenv.cachix.org
+ extra-trusted-public-keys = devenv.cachix.org-1:w1cLUi8dv3hnoSPGAuibQv+f9TZLr6cv/Hm9XgU50cw=
+ access-tokens = github.com=${{ github.token }}
+ summarize: true
+ - name: Provide cachix CLI from nixpkgs
+ shell: bash
+ run: |
+ set -euo pipefail
+ out=$(nix build --no-link --print-out-paths nixpkgs#cachix)
+ echo "$out/bin" >> "$GITHUB_PATH"
+ - name: Enable Cachix cache
+ uses: cachix/cachix-action@v17
+ with:
+ name: overeng-effect-utils
+ authToken: ${{ secrets.CACHIX_AUTH_TOKEN }}
+ - name: Use pinned devenv from lock
+ run: |
+ DEVENV_REV=$(jq -r .nodes.devenv.locked.rev devenv.lock)
+ if [ -z "$DEVENV_REV" ] || [ "$DEVENV_REV" = "null" ]; then
+ echo '::error::devenv.lock missing .nodes.devenv.locked.rev'
exit 1
fi
-
- current_json="$comparison_file.current.json"
- baseline_json="$comparison_file.baseline.json"
- xargs -r jq -s '.' <"$current_index" >"$current_json"
- if [ -s "$baseline_index" ]; then
- xargs -r jq -s '.' <"$baseline_index" >"$baseline_json"
- else
- printf '[]\n' >"$baseline_json"
+ echo "DEVENV_REV=$DEVENV_REV" >> "$GITHUB_ENV"
+ echo "Pinned devenv rev: $DEVENV_REV"
+ shell: bash
+ - name: Isolate pnpm state
+ shell: bash
+ run: |
+ echo "PNPM_STORE_DIR=${{ runner.temp }}/pnpm-store/${{ github.job }}" >> "$GITHUB_ENV"
+ echo "PNPM_HOME=${{ github.workspace }}/.pnpm-home" >> "$GITHUB_ENV"
+ - id: restore-pnpm-state
+ name: Restore pnpm state
+ uses: actions/cache/restore@v4
+ with:
+ path: |
+ ${{ github.workspace }}/.pnpm-home
+ ${{ runner.temp }}/pnpm-store/${{ github.job }}
+ key: "pnpm-state-v1-${{ runner.os }}-${{ runner.arch }}-${{ hashFiles('**/pnpm-lock.yaml') }}"
+ - name: Resolve devenv
+ run: |
+ DEVENV_REV=$(jq -r .nodes.devenv.locked.rev devenv.lock)
+ if [ -z "$DEVENV_REV" ] || [ "$DEVENV_REV" = "null" ]; then
+ echo '::error::devenv.lock missing .nodes.devenv.locked.rev'
+ exit 1
fi
- jq -n \
- --slurpfile current "$current_json" \
- --slurpfile baseline "$baseline_json" \
- --argjson schemaVersion 1 \
- --arg mode "$mode" \
- --arg currentDir "$current_dir" \
- --arg baselineDir "$baseline_dir" \
- '
- def identity_dimensions:
- (.dimensions // {})
- | to_entries
- | map(select(.key as $key | ["devenvRev", "otelServiceName", "status", "probeLabel", "sampleCount", "measuredSampleCount"] | index($key) | not))
- | sort_by(.key)
- | map("\(.key)=\(.value|tostring)")
- | join(",");
+ resolve_devenv() {
+ nix build \
+ --accept-flake-config \
+ --option extra-substituters https://devenv.cachix.org \
+ --option extra-trusted-public-keys devenv.cachix.org-1:w1cLUi8dv3hnoSPGAuibQv+f9TZLr6cv/Hm9XgU50cw= \
+ --no-link \
+ --print-out-paths \
+ "github:cachix/devenv/$DEVENV_REV#devenv"
+ }
- def observation_key($doc):
- [
- ($doc.target.kind // "unknown"),
- ($doc.target.id // $doc.target.name // "unknown"),
- ($doc.target.system // "unknown"),
- (.id // .name // "unknown"),
- (.unit // "unknown"),
- identity_dimensions
- ] | join("|");
+ # Temporary: capture diagnostics dir for #272 root-cause analysis.
+ DIAG_ROOT="${RUNNER_TEMP:-/tmp}/nix-store-diagnostics-${GITHUB_JOB:-job}-${RUNNER_OS:-unknown}-${GITHUB_RUN_ATTEMPT:-0}"
+ mkdir -p "$DIAG_ROOT"
+ echo "NIX_STORE_DIAGNOSTICS_DIR=$DIAG_ROOT" >> "$GITHUB_ENV"
- def median:
- sort as $sorted
- | ($sorted | length) as $count
- | if $count == 0 then null
- elif ($count % 2) == 1 then $sorted[($count / 2 | floor)]
- else (($sorted[($count / 2 - 1)] + $sorted[($count / 2)]) / 2)
- end;
+ {
+ echo "timestamp_utc=$(date -u +%Y-%m-%dT%H:%M:%SZ)"
+ echo "runner_name=${RUNNER_NAME:-unknown}"
+ echo "runner_os=${RUNNER_OS:-unknown}"
+ echo "runner_arch=${RUNNER_ARCH:-unknown}"
+ echo "github_job=${GITHUB_JOB:-unknown}"
+ echo "github_run_id=${GITHUB_RUN_ID:-unknown}"
+ echo "nix_user_conf_files=${NIX_USER_CONF_FILES:-}"
+ nix --version || true
+ } > "$DIAG_ROOT/environment.txt" 2>&1
- def percentile($p):
- sort as $sorted
- | ($sorted | length) as $count
- | if $count == 0 then null
- else $sorted[(($p * ($count - 1)) | floor)]
- end;
+ if ! DEVENV_OUT=$(resolve_devenv 2> >(tee "$DIAG_ROOT/resolve-devenv.log" >&2)); then
+ echo "::error::resolve_devenv failed. Last 30 lines of log:"
+ tail -30 "$DIAG_ROOT/resolve-devenv.log" || true
+ exit 1
+ fi
+ DEVENV_BIN="$DEVENV_OUT/bin/devenv"
- def abs_value: if . < 0 then -. else . end;
+ # Fast validity check on the devenv store path (~1-2s vs ~25s for devenv info).
+ if ! nix-store --check-validity "$DEVENV_OUT" 2>/dev/null; then
+ echo "::warning::devenv store path invalid, repairing targeted path..."
+ nix-store --repair-path "$DEVENV_OUT" > "$DIAG_ROOT/nix-store-verify-repair.log" 2>&1 || true
+ rm -rf "${XDG_CACHE_HOME:-$HOME/.cache}"/nix/eval-cache-* ~/.cache/nix/eval-cache-*
+ if ! DEVENV_OUT=$(resolve_devenv 2> >(tee "$DIAG_ROOT/resolve-devenv-post-repair.log" >&2)); then
+ echo "::error::resolve_devenv failed after repair. Last 30 lines of log:"
+ tail -30 "$DIAG_ROOT/resolve-devenv-post-repair.log" || true
+ exit 1
+ fi
+ DEVENV_BIN="$DEVENV_OUT/bin/devenv"
+ fi
- def observations_by_key($docs):
- reduce $docs[]? as $doc
- ({};
- reduce (($doc.observations // [])[]? | select(.value | type == "number")) as $obs
- (.;
- ($obs | observation_key($doc)) as $key
- | .[$key] = ((.[$key] // []) + [{
- target: $doc.target,
- observation: $obs,
- generatedAt: $doc.generatedAt
- }])
- )
- );
+ echo "DEVENV_BIN=$DEVENV_BIN" >> "$GITHUB_ENV"
+ "$DEVENV_BIN" version | tee "$DIAG_ROOT/devenv-version.txt"
+ shell: bash
+ - name: Evict cached pnpm deps for oxlint-npm
+ shell: bash
+ run: |
+ targetRef='.#oxlint-npm'
+ entriesJson=$(mktemp)
+ if nix eval --json "$targetRef.passthru.depsBuildEntries" >"$entriesJson" 2>/dev/null; then
+ while IFS=$'\t' read -r attrName drv; do
+ [ -n "$drv" ] || continue
+ while IFS= read -r outPath; do
+ [ -n "$outPath" ] || continue
+ if nix path-info "$outPath" >/dev/null 2>&1; then
+ echo "evicting cached: $(basename "$outPath")"
+ if ! nix store delete --ignore-liveness "$outPath" >/dev/null 2>&1; then
+ echo "::error::failed to evict cached pnpm-deps output: $outPath"
+ exit 1
+ fi
+ if nix path-info "$outPath" >/dev/null 2>&1; then
+ echo "::error::cached pnpm-deps output still present after eviction: $outPath"
+ exit 1
+ fi
+ fi
+ done < <(nix-store -q --outputs "$drv" 2>/dev/null || true)
+ done < <(jq -r '.[] | [.attrName, (.drvPath // "")] | @tsv' "$entriesJson")
+ else
+ topDrv=$(nix path-info --derivation "$targetRef" 2>/dev/null || true)
+ if [ -n "$topDrv" ]; then
+ while IFS= read -r drv; do
+ [ -n "$drv" ] || continue
+ attrName=""
+ while IFS= read -r outPath; do
+ [ -n "$outPath" ] || continue
+ if nix path-info "$outPath" >/dev/null 2>&1; then
+ echo "evicting cached: $(basename "$outPath")"
+ if ! nix store delete --ignore-liveness "$outPath" >/dev/null 2>&1; then
+ echo "::error::failed to evict cached pnpm-deps output: $outPath"
+ exit 1
+ fi
+ if nix path-info "$outPath" >/dev/null 2>&1; then
+ echo "::error::cached pnpm-deps output still present after eviction: $outPath"
+ exit 1
+ fi
+ fi
+ done < <(nix-store -q --outputs "$drv" 2>/dev/null || true)
+ done < <(nix-store -qR "$topDrv" 2>/dev/null | grep "pnpm-deps-[a-z0-9-]*-v[0-9].*\.drv$" || true)
+ fi
+ fi
+ rm -f "$entriesJson"
+ - name: Force diagnostics failure (debug)
+ if: ${{ github.event_name == 'workflow_dispatch' && (inputs.debug_force_nix_diagnostics_failure == true || inputs.debug_force_nix_diagnostics_failure == 'true') }}
+ shell: bash
+ run: |
+ diag_dir="${NIX_STORE_DIAGNOSTICS_DIR:-${RUNNER_TEMP:-/tmp}/nix-store-diagnostics-missing}"
+ mkdir -p "$diag_dir"
+ cat > "$diag_dir/synthetic-signature.log" <<'EOF'
+ Failed to convert config.cachix to JSON
+ ... while evaluating the option `cachix.package`
+ error: path '/nix/store/synthetic-invalid-path' is not valid
+ EOF
+ echo "::warning::Intentional failure for diagnostics validation (#272)"
+ exit 1
+ - name: 'Measure Nix closure: genie'
+ shell: bash
+ env:
+ ARTIFACT_DIR: tmp/nix-closure-ci/current/genie_package
+ RUNNER_CLASS: '${{ runner.os }}-${{ runner.arch }}'
+ run: |
+ set -euo pipefail
- def observation_stats($items):
- ($items | map(.observation.value)) as $values
- | ($items | map(.observation.comparison.baseline // empty)) as $pairedBaselineValues
- | ($items | map(.observation.statistics.pairedDeltaMedian // empty)) as $pairedDeltaMedianValues
- | ($items | map(.observation.statistics.pairedDeltaP25 // empty)) as $pairedDeltaP25Values
- | ($items | map(.observation.statistics.pairedDeltaP75 // empty)) as $pairedDeltaP75Values
- | ($items | map(.observation.statistics.pairedDeltaMad // empty)) as $pairedDeltaMadValues
- | ($items | map(.observation.statistics.pairedDeltaSamples // []) | add // []) as $pairedDeltaSampleValues
- | ($items | map(.observation.statistics.measuredSampleCount // .observation.statistics.sampleCount // 1) | add // ($items | length)) as $sampleCount
- | ($values | median) as $median
- | {
- target: ($items[0].target // {}),
- observation: ($items[-1].observation // {}),
- measurementKind: ($items[-1].observation.measurementKind // null),
- value: $median,
- min: ($values | min),
- max: ($values | max),
- p25: ($values | percentile(0.25)),
- p75: ($values | percentile(0.75)),
- p95: ($values | percentile(0.95)),
- mad: ($values | map(. - $median | if . < 0 then -. else . end) | median),
- sourceCount: ($items | length),
- sampleCount: $sampleCount,
- pairedSampleCount: ($items | map(.observation.statistics.pairedSampleCount // .observation.comparison.pairedSampleCount // 0) | add // 0),
- pairedBaselineValue: (if ($pairedBaselineValues | length) == 0 then null else ($pairedBaselineValues | median) end),
- pairedDeltaMedianValue: (if ($pairedDeltaMedianValues | length) == 0 then null else ($pairedDeltaMedianValues | median) end),
- pairedDeltaP25Value: (if ($pairedDeltaP25Values | length) == 0 then null else ($pairedDeltaP25Values | median) end),
- pairedDeltaP75Value: (if ($pairedDeltaP75Values | length) == 0 then null else ($pairedDeltaP75Values | median) end),
- pairedDeltaMadValue: (if ($pairedDeltaMadValues | length) == 0 then null else ($pairedDeltaMadValues | median) end),
- pairedDeltaSampleValues: $pairedDeltaSampleValues,
- generatedAt: ($items[-1].generatedAt // null)
- };
-
- def budget($metric; $unit):
- if $metric == "nix.closure.nar_size" then
- {warnRatio:1.05, failRatio:1.10, warnAbs:52428800, failAbs:209715200, statisticalToleranceRatio:0.02, statisticalToleranceAbs:10485760}
- elif $metric == "nix.closure.bucket.nar_size" then
- {warnRatio:1.10, failRatio:1.20, warnAbs:52428800, failAbs:209715200, statisticalToleranceRatio:0.05, statisticalToleranceAbs:10485760}
- elif $metric == "nix.closure.path_count" then
- {warnRatio:1.05, failRatio:1.10, warnAbs:100, failAbs:500, statisticalToleranceRatio:0.02, statisticalToleranceAbs:10}
- elif $unit == "seconds" then
- {warnRatio:1.10, failRatio:1.20, warnAbs:0.25, failAbs:1, statisticalToleranceRatio:0.10, statisticalToleranceAbs:0.25}
- else
- {warnRatio:1.25, failRatio:1.50, warnAbs:1, failAbs:3, statisticalToleranceRatio:0.10, statisticalToleranceAbs:1}
- end;
+ mkdir -p "$ARTIFACT_DIR"
+ installable='.#genie'
+ target_id='genie_package'
+ target_name='genie'
+ target_label='Genie package'
+ target_group='packages'
+ target_description='the packaged Genie CLI closure'
+ artifact_file="$ARTIFACT_DIR/measurements.json"
+ target_system='x86_64-linux'
- def noise_floor($metric; $unit):
- if $metric == "nix.closure.nar_size" or $metric == "nix.closure.bucket.nar_size" then 10485760
- elif $metric == "nix.closure.path_count" then 10
- elif $unit == "seconds" then 0.1
- else 0
- end;
- def default_policy($metric; $unit):
- budget($metric; $unit) as $b
- | noise_floor($metric; $unit) as $noise
- | $b + {
- enabled:true,
- comparisonMode:(if $metric == "nix.closure.nar_size" or $metric == "nix.closure.bucket.nar_size" or $metric == "nix.closure.path_count" or $unit != "seconds" then "budget" else "historical" end),
- minBaselineSources:(if $metric == "nix.closure.nar_size" or $metric == "nix.closure.bucket.nar_size" or $metric == "nix.closure.path_count" or $unit != "seconds" then 1 else 10 end),
- minCurrentSamples:(if $unit == "seconds" then 3 else 1 end),
- minPairedSamples:(if $unit == "seconds" then 5 else 0 end),
- noiseFloor:$noise
- };
- def observation_policy($obs):
- default_policy($obs.name // "unknown"; $obs.unit // "unknown") + ($obs.policy // {});
- def policy_enabled($policy):
- if ($policy | has("enabled")) then $policy.enabled else true end;
+ out_path="$(nix build --no-update-lock-file --no-link --print-out-paths "$installable")"
+ path_info="$ARTIFACT_DIR/nix-closure-path-info.json"
+ paths_file="$ARTIFACT_DIR/nix-closure-paths.json"
- def classify($metric; $unit; $measurementKind; $policy; $current; $currentP25; $currentP75; $currentMad; $baseline; $baselineMin; $baselineMax; $baselineP25; $baselineP75; $baselineP95; $baselineMad; $currentSamples; $baselineSources; $pairedSamples; $pairedDeltaMedian; $pairedDeltaP25; $pairedDeltaP75; $pairedDeltaMad; $pairedDeltaValues):
- $policy as $b
- | ($policy.comparisonMode // (if $measurementKind == "deterministic" or $unit != "seconds" then "budget" elif $measurementKind == "diagnostic" then "diagnostic" else "historical" end)) as $comparisonMode
- | ($policy.noiseFloor // noise_floor($metric; $unit)) as $noise
- | ($current - $baseline) as $delta
- | (if $comparisonMode == "paired" and $pairedDeltaMedian != null then $pairedDeltaMedian else $delta end) as $evidenceDelta
- | (($policy.pairedEvidenceQuantile // 0.25) | tonumber) as $pairedEvidenceQuantile
- | (if $baseline > 0 then ($current / $baseline) else null end) as $ratio
- | (($baselineP75 // $baseline) - ($baselineP25 // $baseline)) as $iqr
- | (($currentP75 // $current) - ($currentP25 // $current)) as $currentIqr
- | (($pairedDeltaP75 // $evidenceDelta) - ($pairedDeltaP25 // $evidenceDelta)) as $pairedDeltaIqr
- | ([
- $noise,
- (($policy.statisticalToleranceAbs // 0) | tonumber),
- (if $baseline > 0 then ($baseline * (($policy.statisticalToleranceRatio // 0) | tonumber)) else 0 end),
- (($baselineMad // 0) * 3),
- (($iqr // 0) * 1.5)
- ] | max) as $robustTolerance
- | (if $currentSamples > 1 then ([
- $noise,
- (($policy.statisticalToleranceAbs // 0) | tonumber),
- (if $current > 0 then ($current * (($policy.statisticalToleranceRatio // 0) | tonumber)) else 0 end),
- (($currentMad // 0) * 3),
- (($currentIqr // 0) * 1.5)
- ] | max) else 0 end) as $currentRobustTolerance
- | ([
- $noise,
- (($policy.statisticalToleranceAbs // 0) | tonumber),
- (if $baseline > 0 then ($baseline * (($policy.statisticalToleranceRatio // 0) | tonumber)) else 0 end),
- (($pairedDeltaMad // 0) * 3),
- (($pairedDeltaIqr // 0) * 1.5)
- ] | max) as $pairedDeltaTolerance
- | ($baseline + $robustTolerance) as $robustUpper
- | ($baseline - $robustTolerance) as $robustLower
- | ($current + $currentRobustTolerance) as $currentRobustUpper
- | ($current - $currentRobustTolerance) as $currentRobustLower
- | (if $comparisonMode == "paired" and ($pairedDeltaValues | length) > 0 then ($pairedDeltaValues | percentile($pairedEvidenceQuantile)) else ($evidenceDelta - $pairedDeltaTolerance) end) as $evidenceDeltaLower
- | (if $comparisonMode == "paired" and ($pairedDeltaValues | length) > 0 then ($pairedDeltaValues | percentile(1 - $pairedEvidenceQuantile)) else ($evidenceDelta + $pairedDeltaTolerance) end) as $evidenceDeltaUpper
- | ([($b.warnAbs // 0), (if $baseline > 0 then ($baseline * (($b.warnRatio // 1) - 1)) else 0 end), $noise, 0.000000001] | max) as $warnBudget
- | ([($b.failAbs // 0), (if $baseline > 0 then ($baseline * (($b.failRatio // 1) - 1)) else 0 end), $noise, 0.000000001] | max) as $failBudget
- | ($comparisonMode != "paired") as $needsHistoricalBaselineCount
- | (
- ($current >= $robustLower and $current <= $robustUpper)
- or ($currentRobustTolerance > 0 and $currentRobustLower <= $robustUpper and $currentRobustUpper >= $robustLower)
- ) as $withinRobustBand
- | ($comparisonMode == "historical" and $measurementKind != "deterministic") as $canUseRobustBandSuppression
- | (
- $baselineMin != null
- and $baselineMax != null
- and $current >= $baselineMin
- and $current <= $baselineMax
- ) as $withinBaselineRange
- | (
- if $baseline <= 0 then "unknown"
- elif $comparisonMode == "paired" and $evidenceDeltaLower > $failBudget then "fail"
- elif $comparisonMode == "paired" and $evidenceDeltaLower > $warnBudget then "warn"
- elif $comparisonMode == "paired" then "pass"
- elif ($delta > $b.failAbs and $current > ($baseline * $b.failRatio)) then "fail"
- elif ($delta > $b.warnAbs and $current > ($baseline * $b.warnRatio)) then "warn"
- else "pass"
- end
- ) as $thresholdStatus
- | (
- policy_enabled($policy) == true
- and $baseline > 0
- and (if $needsHistoricalBaselineCount then $baselineSources >= ($policy.minBaselineSources // 1) else true end)
- and $currentSamples >= ($policy.minCurrentSamples // 1)
- and (if $comparisonMode == "paired" then $pairedSamples >= ($policy.minPairedSamples // 1) else true end)
- and (if $comparisonMode == "paired" then $pairedDeltaMedian != null else true end)
- ) as $gateable
- | (
- if (policy_enabled($policy) != true) then "disabled"
- elif $baseline <= 0 then "missing_baseline"
- elif $needsHistoricalBaselineCount and $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
- elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count"
- elif $comparisonMode == "paired" and $pairedSamples < ($policy.minPairedSamples // 1) then "low_paired_sample_count"
- elif $comparisonMode == "paired" and $pairedDeltaMedian == null then "missing_paired_delta"
- else "eligible"
- end
- ) as $gateReason
- | (
- if $baseline <= 0 then "unknown"
- elif (policy_enabled($policy) != true) then "diagnostic"
- elif ($delta | abs_value) <= $noise then "noise_floor"
- elif $needsHistoricalBaselineCount and $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
- elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count"
- elif $comparisonMode == "paired" and $pairedSamples < ($policy.minPairedSamples // 1) then "low_paired_sample_count"
- elif $comparisonMode == "paired" and $pairedDeltaMedian == null then "missing_paired_delta"
- elif $comparisonMode == "paired" and $thresholdStatus == "pass" and $evidenceDelta > $warnBudget then "paired_uncertain"
- elif ($canUseRobustBandSuppression and $thresholdStatus != "pass" and $withinRobustBand) then "within_robust_band"
- elif $thresholdStatus == "pass" then "within_budget"
- else "threshold_exceeded"
- end
- ) as $confidence
- | (
- if ($gateable and $confidence == "threshold_exceeded") then $thresholdStatus
- elif $thresholdStatus == "unknown" then "unknown"
- else "pass"
- end
- ) as $status
- | (
- if $baseline <= 0 then "unknown"
- elif $comparisonMode == "paired" and ($evidenceDelta | abs_value) <= $noise then "unchanged"
- elif $comparisonMode == "paired" and $evidenceDeltaLower <= 0 and $evidenceDeltaUpper >= 0 then "unchanged"
- elif $comparisonMode == "paired" and $evidenceDelta < 0 then "improved"
- elif $comparisonMode == "paired" then "regressed"
- elif ($delta | abs_value) <= $noise then "unchanged"
- elif $canUseRobustBandSuppression and $withinRobustBand then "unchanged"
- elif $delta < 0 then "improved"
- else "regressed"
- end
- ) as $direction
- | (
- if $baseline <= 0 then null
- elif (policy_enabled($policy) != true) then null
- elif $comparisonMode == "paired" and ($evidenceDeltaLower <= 0 and $evidenceDeltaUpper >= 0) then 0
- elif $comparisonMode == "paired" and ($evidenceDelta | abs_value) <= $noise then 0
- elif $comparisonMode == "paired" and $evidenceDelta > 0 then ([0, $evidenceDeltaLower] | max) / $warnBudget
- elif $comparisonMode == "paired" then -(([0, (-$evidenceDeltaUpper)] | max) / $warnBudget)
- elif $canUseRobustBandSuppression and $withinRobustBand then 0
- elif ($delta | abs_value) <= $noise then 0
- elif ($confidence == "threshold_exceeded" and $delta > 0) then ([0, ($currentRobustLower - $robustUpper), $delta] | max) / $warnBudget
- elif ($confidence == "threshold_exceeded" and $delta < 0) then -(([0, ($robustLower - $currentRobustUpper), (-$delta)] | max) / $warnBudget)
- elif $delta > 0 then ([0, ($currentRobustLower - $robustUpper)] | max) / $warnBudget
- else -(([0, ($robustLower - $currentRobustUpper)] | max) / $warnBudget)
- end
- ) as $semanticImpactScore
- | (
- if (policy_enabled($policy) != true) then "diagnostic"
- elif $semanticImpactScore == null then "unknown"
- elif $semanticImpactScore == 0 then "neutral"
- elif $semanticImpactScore >= ($failBudget / $warnBudget) then "fail_boundary"
- elif $semanticImpactScore >= 1 then "warn_boundary"
- elif $semanticImpactScore > 0 then "below_warn_boundary"
- else "improvement"
- end
- ) as $semanticImpactKind
- | {status:$status,current:$current,baseline:$baseline,delta:$delta,ratio:$ratio,budget:$b,gatePolicy:$policy,comparisonMode:$comparisonMode,gateable:$gateable,gateReason:$gateReason,confidence:$confidence,direction:$direction,semanticImpactScore:$semanticImpactScore,semanticImpactKind:$semanticImpactKind,semanticWarnBudget:$warnBudget,semanticFailBudget:$failBudget,baselineRobustLower:$robustLower,baselineRobustUpper:$robustUpper,baselineRobustTolerance:$robustTolerance,currentRobustLower:$currentRobustLower,currentRobustUpper:$currentRobustUpper,currentRobustTolerance:$currentRobustTolerance,withinBaselineRange:$withinBaselineRange,pairedSamples:$pairedSamples,evidenceDelta:$evidenceDelta,evidenceDeltaLower:$evidenceDeltaLower,evidenceDeltaUpper:$evidenceDeltaUpper,evidenceDeltaTolerance:$pairedDeltaTolerance,pairedEvidenceQuantile:$pairedEvidenceQuantile,pairedEvidenceProtocol:(if $comparisonMode == "paired" and ($pairedDeltaValues | length) > 0 then "paired-delta-quantile-v1" elif $comparisonMode == "paired" then "paired-summary-robust-band-v1" else null end)};
+ nix path-info --recursive --json "$out_path" >"$path_info"
+ jq 'to_entries | map({ path: .key, narSize: (.value.narSize // 0) })' "$path_info" >"$paths_file"
- (observations_by_key($current[0]) | with_entries(.value = observation_stats(.value))) as $currentObs
- | (observations_by_key($baseline[0]) | with_entries(.value = observation_stats(.value))) as $baselineObs
- | (
- $currentObs
- | to_entries
- | map(
- .key as $key
- | .value as $currentValue
- | ($baselineObs[$key] // null) as $baselineValue
- | ($currentValue.observation | observation_policy(.)) as $policy
- | ($policy.comparisonMode // (if ($currentValue.observation.measurementKind // $currentValue.measurementKind) == "deterministic" or ($currentValue.observation.unit // "") != "seconds" then "budget" elif ($currentValue.observation.measurementKind // $currentValue.measurementKind) == "diagnostic" then "diagnostic" else "historical" end)) as $comparisonMode
- | ($currentValue.pairedBaselineValue // null) as $pairedBaselineValue
- | (if $comparisonMode == "paired" and $pairedBaselineValue != null then {
- value: $pairedBaselineValue,
- min: $pairedBaselineValue,
- max: $pairedBaselineValue,
- p25: $pairedBaselineValue,
- p75: $pairedBaselineValue,
- p95: $pairedBaselineValue,
- mad: 0,
- sourceCount: $currentValue.pairedSampleCount
- } else $baselineValue end) as $effectiveBaselineValue
- | {
- key: $key,
- value: (
- if $effectiveBaselineValue == null then
- {
- status: "missing_baseline",
- target: $currentValue.target,
- observation: $currentValue.observation,
- current: $currentValue.value,
- currentSamples: $currentValue.sampleCount,
- baselineSources: 0,
- gatePolicy: $policy,
- comparisonMode: $comparisonMode,
- gateable: false,
- gateReason: "missing_baseline",
- confidence: "missing_baseline",
- direction: "unknown"
- }
- else
- classify(
- $currentValue.observation.name;
- $currentValue.observation.unit;
- ($currentValue.observation.measurementKind // $currentValue.measurementKind);
- $policy;
- $currentValue.value;
- $currentValue.p25;
- $currentValue.p75;
- $currentValue.mad;
- $effectiveBaselineValue.value;
- $effectiveBaselineValue.min;
- $effectiveBaselineValue.max;
- $effectiveBaselineValue.p25;
- $effectiveBaselineValue.p75;
- $effectiveBaselineValue.p95;
- $effectiveBaselineValue.mad;
- $currentValue.sampleCount;
- $effectiveBaselineValue.sourceCount;
- $currentValue.pairedSampleCount;
- $currentValue.pairedDeltaMedianValue;
- $currentValue.pairedDeltaP25Value;
- $currentValue.pairedDeltaP75Value;
- $currentValue.pairedDeltaMadValue;
- ($currentValue.pairedDeltaSampleValues // [])
- ) + {
- target: $currentValue.target,
- observation: $currentValue.observation,
- currentSamples: $currentValue.sampleCount,
- baselineSources: $effectiveBaselineValue.sourceCount,
- baselineMin: $effectiveBaselineValue.min,
- baselineMax: $effectiveBaselineValue.max,
- baselineP25: $effectiveBaselineValue.p25,
- baselineP75: $effectiveBaselineValue.p75,
- baselineP95: $effectiveBaselineValue.p95
- ,baselineMad: $effectiveBaselineValue.mad
- }
- end
- )
- }
- )
- | from_entries
- ) as $comparisons
- | (
- if any($comparisons[]?; .status == "fail") then "fail"
- elif any($comparisons[]?; .status == "warn") then "warn"
- elif any($comparisons[]?;
- (if (.gatePolicy | has("enabled")) then .gatePolicy.enabled else true end)
- and (.gateReason == "missing_baseline"
- or .gateReason == "low_baseline_count"
- or .gateReason == "low_current_sample_count"
- or .gateReason == "low_paired_sample_count"
- or .gateReason == "missing_paired_delta")
- ) then "partial"
- else "pass"
- end
- ) as $status
- | (
- [$comparisons[]?]
+ jq -n \
+ --slurpfile paths "$paths_file" \
+ --argjson schemaVersion 1 \
+ --arg generatedAt "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
+ --arg repository "${GITHUB_REPOSITORY:-unknown}" \
+ --arg branchKind "${GITHUB_EVENT_NAME:-unknown}" \
+ --arg ref "${CI_MEASUREMENT_SUBJECT_REF:-${GITHUB_REF:-unknown}}" \
+ --arg headSha "${CI_MEASUREMENT_SUBJECT_SHA:-${GITHUB_SHA:-unknown}}" \
+ --arg baseSha "${GITHUB_BASE_SHA:-}" \
+ --arg runnerName "${RUNNER_NAME:-unknown}" \
+ --arg runnerOs "${RUNNER_OS:-unknown}" \
+ --arg runnerArch "${RUNNER_ARCH:-unknown}" \
+ --arg runnerClass "${RUNNER_CLASS:-unknown}" \
+ --arg githubRunId "${GITHUB_RUN_ID:-unknown}" \
+ --arg githubRunAttempt "${GITHUB_RUN_ATTEMPT:-unknown}" \
+ --arg githubJob "${GITHUB_JOB:-unknown}" \
+ --arg taskId "${CROSSTASK_TASK_ID:-}" \
+ --arg taskAttemptId "${CROSSTASK_ATTEMPT_ID:-}" \
+ --arg traceId "${TRACE_ID:-}" \
+ --arg targetName "$target_name" \
+ --arg targetId "$target_id" \
+ --arg targetLabel "$target_label" \
+ --arg targetGroup "$target_group" \
+ --arg targetDescription "$target_description" \
+ --arg targetSystem "$target_system" \
+ --arg outPath "$out_path" \
+ --argjson buckets '[{"name":"node","label":"Node / pnpm","pathRegex":"node_modules|npm-deps|pnpm"},{"name":"nix-sources","label":"Nix sources","pathRegex":"-source$"},{"name":"rust","label":"Rust","pathRegex":"cargo|rust|rustc"}]' \
+ --argjson targetPath '["nix","closures","packages","genie"]' \
+ --argjson gatePolicy '{}' \
+ '
+ ($paths[0] // []) as $closurePaths
+ | ($closurePaths | map(.narSize) | add // 0) as $totalNarSize
+ | ($closurePaths | length) as $pathCount
+ | ($buckets | map(
+ . as $bucket
| {
- enabledCount: (map(select((if (.gatePolicy | has("enabled")) then .gatePolicy.enabled else true end))) | length),
- gateableCount: (map(select(.gateable == true)) | length),
- missingBaselineCount: (map(select(.gateReason == "missing_baseline")) | length),
- lowBaselineCount: (map(select(.gateReason == "low_baseline_count")) | length),
- lowCurrentSampleCount: (map(select(.gateReason == "low_current_sample_count")) | length),
- lowPairedSampleCount: (map(select(.gateReason == "low_paired_sample_count")) | length),
- missingPairedDeltaCount: (map(select(.gateReason == "missing_paired_delta")) | length)
- }
- | . + {
- nonGateableCount: (.enabledCount - .gateableCount),
- enforceable: (.enabledCount == .gateableCount)
+ name: "nix.closure.bucket.nar_size",
+ id: "nix.closure.bucket.nar_size",
+ label: (($bucket.label // $bucket.name) + " closure size"),
+ group: "nix closure buckets",
+ path: ($targetPath + ["buckets", $bucket.name]),
+ description: ("NAR size contributed by closure paths matching " + $bucket.pathRegex),
+ measurementKind: "deterministic",
+ unit: "bytes",
+ value: (
+ $closurePaths
+ | map(select(.path | test($bucket.pathRegex)) | .narSize)
+ | add // 0
+ ),
+ policy: $gatePolicy,
+ dimensions: { bucket: $bucket.name }
}
- ) as $readiness
+ )) as $bucketObservations
| {
- schemaVersion:$schemaVersion,
- status:$status,
- mode:$mode,
- readiness:$readiness,
- currentDir:$currentDir,
- baselineDir:$baselineDir,
- comparisons:$comparisons
+ schemaVersion: $schemaVersion,
+ generatedAt: $generatedAt,
+ producer: { name: "effect-utils-ci-measurement", version: 1 },
+ subject: {
+ repo: $repository,
+ branchKind: (if $branchKind == "" then "unknown" else $branchKind end),
+ ref: $ref,
+ headSha: $headSha,
+ baseSha: $baseSha
+ },
+ execution: {
+ provider: (if ($githubRunId != "" and $githubRunId != "unknown") then "github-actions" else "local" end),
+ workflow: "CI",
+ job: $githubJob,
+ runId: $githubRunId,
+ runAttempt: $githubRunAttempt,
+ taskId: $taskId,
+ attemptId: $taskAttemptId,
+ traceId: $traceId,
+ runner: { name: $runnerName, os: $runnerOs, arch: $runnerArch, class: $runnerClass }
+ },
+ target: { kind: "nix-closure", id: $targetId, name: $targetName, label: $targetLabel, group: $targetGroup, path: $targetPath, system: $targetSystem },
+ observations: ([
+ {
+ id: "nix.closure.nar_size",
+ label: "Total closure size",
+ group: "nix closure",
+ path: ($targetPath + ["total", "nar-size"]),
+ description: ("Total NAR size for all paths in " + $targetDescription),
+ name: "nix.closure.nar_size",
+ measurementKind: "deterministic",
+ unit: "bytes",
+ value: $totalNarSize,
+ policy: $gatePolicy,
+ dimensions: { bucket: "total" }
+ },
+ {
+ id: "nix.closure.path_count",
+ label: "Total closure path count",
+ group: "nix closure",
+ path: ($targetPath + ["total", "path-count"]),
+ description: ("Number of store paths in " + $targetDescription),
+ name: "nix.closure.path_count",
+ measurementKind: "deterministic",
+ unit: "count",
+ value: $pathCount,
+ policy: $gatePolicy,
+ dimensions: { bucket: "total" }
+ }
+ ] + $bucketObservations),
+ artifacts: [
+ { name: "nix-closure-path-info", path: "nix-closure-path-info.json", contentType: "application/json" },
+ { name: "nix-closure-paths", path: "nix-closure-paths.json", contentType: "application/json" }
+ ],
+ details: {
+ outPath: $outPath,
+ topPaths: ($closurePaths | sort_by(.narSize) | reverse | .[:30])
+ }
}
- ' >"$comparison_file"
-
- baseline_provenance_file="$baseline_dir/baseline-provenance.json"
- if [ -f "$baseline_provenance_file" ]; then
- comparison_with_provenance="$(mktemp)"
- jq --slurpfile baselineProvenance "$baseline_provenance_file" \
- '. + {baselineProvenance: ($baselineProvenance[0] // null)}' \
- "$comparison_file" >"$comparison_with_provenance"
- mv "$comparison_with_provenance" "$comparison_file"
- fi
+ ' >"$artifact_file"
- status="$(jq -r '.status' "$comparison_file")"
- exit_code=0
- case "$status:$mode" in
- fail:fail)
- echo "::error::CI measurement regression detected"
- exit_code=1
- ;;
- fail:*|warn:*)
- echo "::warning::CI measurement regression threshold exceeded"
- ;;
- partial:*)
- echo "::notice::CI measurement comparison is partial because one or more enabled observations are not gateable"
- ;;
- esac
+ cat "$artifact_file"
- if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then
- {
- echo "### ${CI_MEASUREMENT_PR_COMMENT_TITLE:-CI Measurements}"
- echo ""
- jq -r '"- Status: " + .status + "\n- Gate: " + (if .mode == "fail" then "enforced" elif .mode == "warn" then "advisory" elif .mode == "off" then "off" else (.mode // "unknown") end) + "\n- Baseline: " + .baselineDir' "$comparison_file"
- echo ""
- echo "| Status | Gate | Target | Observation | Current | Baseline | Delta | Ratio |"
- echo "| --- | --- | --- | --- | ---: | ---: | ---: | ---: |"
- jq -r '
- .comparisons
- | to_entries
- | sort_by(
- if .value.status == "fail" then 0
- elif .value.status == "warn" then 1
- elif .value.status == "missing_baseline" then 2
- else 3
- end
- )
- | .[:20]
- | .[]
- | .value as $v
- | [
- $v.status,
- (if ($v.gateable // false) then "yes" else ($v.gateReason // "no") end),
- (($v.target.kind // "unknown") + "/" + ($v.target.name // "unknown") + "/" + ($v.target.system // "unknown")),
- ($v.observation.name // "unknown"),
- (($v.current // $v.observation.value // 0) | tostring),
- (($v.baseline // "") | tostring),
- (($v.delta // "") | tostring),
- (if $v.ratio == null or $v.ratio == "" then "" else (($v.ratio * 100 | round / 100) | tostring) end)
- ]
- | "| " + (map(gsub("\\|"; "\\\\|")) | join(" | ")) + " |"
- ' "$comparison_file"
- } >>"$GITHUB_STEP_SUMMARY"
- fi
+ - name: 'Measure Nix closure: megarepo'
+ shell: bash
+ env:
+ ARTIFACT_DIR: tmp/nix-closure-ci/current/megarepo_package
+ RUNNER_CLASS: '${{ runner.os }}-${{ runner.arch }}'
+ run: |
+ set -euo pipefail
+ mkdir -p "$ARTIFACT_DIR"
+ installable='.#megarepo'
+ target_id='megarepo_package'
+ target_name='megarepo'
+ target_label='Megarepo package'
+ target_group='packages'
+ target_description='the packaged megarepo CLI closure'
+ artifact_file="$ARTIFACT_DIR/measurements.json"
+ target_system='x86_64-linux'
+ out_path="$(nix build --no-update-lock-file --no-link --print-out-paths "$installable")"
+ path_info="$ARTIFACT_DIR/nix-closure-path-info.json"
+ paths_file="$ARTIFACT_DIR/nix-closure-paths.json"
- if [ "$exit_code" -ne 0 ]; then
- exit "$exit_code"
- fi
+ nix path-info --recursive --json "$out_path" >"$path_info"
+ jq 'to_entries | map({ path: .key, narSize: (.value.narSize // 0) })' "$path_info" >"$paths_file"
- - name: Upload devenv perf artifacts
- if: always()
- uses: actions/upload-artifact@v4
- with:
- name: devenv-perf
- path: |
- tmp/devenv-perf-ci
- !tmp/devenv-perf-ci/baseline/**
- if-no-files-found: error
- retention-days: 30
- timeout-minutes: 30
- concurrency:
- group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-devenv-perf"
- cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
- nix-closure-sizes:
- if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
- runs-on:
- [namespace-profile-linux-x86-64, 'namespace-features:github.run-id=${{ github.run_id }}']
- timeout-minutes: 30
- defaults:
- run:
- shell: bash
- permissions:
- actions: read
- contents: write
- issues: write
- pull-requests: write
- env:
- CI_MEASUREMENT_SUBJECT_REF: ${{ inputs.measurement_baseline_ref || github.event.pull_request.head.ref || github.ref }}
- CI_MEASUREMENT_SUBJECT_SHA: ${{ inputs.measurement_baseline_ref || github.event.pull_request.head.sha || github.sha }}
- CI_MEASUREMENT_SUBJECT_LABEL: ${{ inputs.measurement_baseline_label }}
- CI_MEASUREMENT_ALLOW_PROBE_FAILURES: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && '1' || '' }}
- steps:
- - uses: actions/checkout@v6
- - name: Checkout CI measurement baseline ref
- if: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' }}
- uses: actions/checkout@v6
- with:
- ref: ${{ inputs.measurement_baseline_ref }}
- - name: Install Nix
- uses: DeterminateSystems/determinate-nix-action@v3
- with:
- extra-conf: |
- experimental-features = nix-command flakes
- accept-flake-config = true
- extra-substituters = https://devenv.cachix.org
- extra-trusted-public-keys = devenv.cachix.org-1:w1cLUi8dv3hnoSPGAuibQv+f9TZLr6cv/Hm9XgU50cw=
- access-tokens = github.com=${{ github.token }}
- summarize: true
- - name: Provide cachix CLI from nixpkgs
- shell: bash
- run: |
- set -euo pipefail
- out=$(nix build --no-link --print-out-paths nixpkgs#cachix)
- echo "$out/bin" >> "$GITHUB_PATH"
- - name: Enable Cachix cache
- uses: cachix/cachix-action@v17
- with:
- name: overeng-effect-utils
- authToken: ${{ secrets.CACHIX_AUTH_TOKEN }}
- - name: Use pinned devenv from lock
- run: |
- DEVENV_REV=$(jq -r .nodes.devenv.locked.rev devenv.lock)
- if [ -z "$DEVENV_REV" ] || [ "$DEVENV_REV" = "null" ]; then
- echo '::error::devenv.lock missing .nodes.devenv.locked.rev'
- exit 1
- fi
- echo "DEVENV_REV=$DEVENV_REV" >> "$GITHUB_ENV"
- echo "Pinned devenv rev: $DEVENV_REV"
- shell: bash
- - name: Isolate pnpm state
- shell: bash
- run: |
- echo "PNPM_STORE_DIR=${{ runner.temp }}/pnpm-store/${{ github.job }}" >> "$GITHUB_ENV"
- echo "PNPM_HOME=${{ github.workspace }}/.pnpm-home" >> "$GITHUB_ENV"
- - id: restore-pnpm-state
- name: Restore pnpm state
- uses: actions/cache/restore@v4
- with:
- path: |
- ${{ github.workspace }}/.pnpm-home
- ${{ runner.temp }}/pnpm-store/${{ github.job }}
- key: "pnpm-state-v1-${{ runner.os }}-${{ runner.arch }}-${{ hashFiles('**/pnpm-lock.yaml') }}"
- - name: Resolve devenv
- run: |
- DEVENV_REV=$(jq -r .nodes.devenv.locked.rev devenv.lock)
- if [ -z "$DEVENV_REV" ] || [ "$DEVENV_REV" = "null" ]; then
- echo '::error::devenv.lock missing .nodes.devenv.locked.rev'
- exit 1
- fi
-
- resolve_devenv() {
- nix build \
- --accept-flake-config \
- --option extra-substituters https://devenv.cachix.org \
- --option extra-trusted-public-keys devenv.cachix.org-1:w1cLUi8dv3hnoSPGAuibQv+f9TZLr6cv/Hm9XgU50cw= \
- --no-link \
- --print-out-paths \
- "github:cachix/devenv/$DEVENV_REV#devenv"
- }
-
- # Temporary: capture diagnostics dir for #272 root-cause analysis.
- DIAG_ROOT="${RUNNER_TEMP:-/tmp}/nix-store-diagnostics-${GITHUB_JOB:-job}-${RUNNER_OS:-unknown}-${GITHUB_RUN_ATTEMPT:-0}"
- mkdir -p "$DIAG_ROOT"
- echo "NIX_STORE_DIAGNOSTICS_DIR=$DIAG_ROOT" >> "$GITHUB_ENV"
-
- {
- echo "timestamp_utc=$(date -u +%Y-%m-%dT%H:%M:%SZ)"
- echo "runner_name=${RUNNER_NAME:-unknown}"
- echo "runner_os=${RUNNER_OS:-unknown}"
- echo "runner_arch=${RUNNER_ARCH:-unknown}"
- echo "github_job=${GITHUB_JOB:-unknown}"
- echo "github_run_id=${GITHUB_RUN_ID:-unknown}"
- echo "nix_user_conf_files=${NIX_USER_CONF_FILES:-}"
- nix --version || true
- } > "$DIAG_ROOT/environment.txt" 2>&1
-
- if ! DEVENV_OUT=$(resolve_devenv 2> >(tee "$DIAG_ROOT/resolve-devenv.log" >&2)); then
- echo "::error::resolve_devenv failed. Last 30 lines of log:"
- tail -30 "$DIAG_ROOT/resolve-devenv.log" || true
- exit 1
- fi
- DEVENV_BIN="$DEVENV_OUT/bin/devenv"
-
- # Fast validity check on the devenv store path (~1-2s vs ~25s for devenv info).
- if ! nix-store --check-validity "$DEVENV_OUT" 2>/dev/null; then
- echo "::warning::devenv store path invalid, repairing targeted path..."
- nix-store --repair-path "$DEVENV_OUT" > "$DIAG_ROOT/nix-store-verify-repair.log" 2>&1 || true
- rm -rf "${XDG_CACHE_HOME:-$HOME/.cache}"/nix/eval-cache-* ~/.cache/nix/eval-cache-*
- if ! DEVENV_OUT=$(resolve_devenv 2> >(tee "$DIAG_ROOT/resolve-devenv-post-repair.log" >&2)); then
- echo "::error::resolve_devenv failed after repair. Last 30 lines of log:"
- tail -30 "$DIAG_ROOT/resolve-devenv-post-repair.log" || true
- exit 1
- fi
- DEVENV_BIN="$DEVENV_OUT/bin/devenv"
- fi
-
- echo "DEVENV_BIN=$DEVENV_BIN" >> "$GITHUB_ENV"
- "$DEVENV_BIN" version | tee "$DIAG_ROOT/devenv-version.txt"
- shell: bash
- - name: Evict cached pnpm deps for oxlint-npm
- shell: bash
- run: |
- targetRef='.#oxlint-npm'
- entriesJson=$(mktemp)
- if nix eval --json "$targetRef.passthru.depsBuildEntries" >"$entriesJson" 2>/dev/null; then
- while IFS=$'\t' read -r attrName drv; do
- [ -n "$drv" ] || continue
- while IFS= read -r outPath; do
- [ -n "$outPath" ] || continue
- if nix path-info "$outPath" >/dev/null 2>&1; then
- echo "evicting cached: $(basename "$outPath")"
- if ! nix store delete --ignore-liveness "$outPath" >/dev/null 2>&1; then
- echo "::error::failed to evict cached pnpm-deps output: $outPath"
- exit 1
- fi
- if nix path-info "$outPath" >/dev/null 2>&1; then
- echo "::error::cached pnpm-deps output still present after eviction: $outPath"
- exit 1
- fi
- fi
- done < <(nix-store -q --outputs "$drv" 2>/dev/null || true)
- done < <(jq -r '.[] | [.attrName, (.drvPath // "")] | @tsv' "$entriesJson")
- else
- topDrv=$(nix path-info --derivation "$targetRef" 2>/dev/null || true)
- if [ -n "$topDrv" ]; then
- while IFS= read -r drv; do
- [ -n "$drv" ] || continue
- attrName=""
- while IFS= read -r outPath; do
- [ -n "$outPath" ] || continue
- if nix path-info "$outPath" >/dev/null 2>&1; then
- echo "evicting cached: $(basename "$outPath")"
- if ! nix store delete --ignore-liveness "$outPath" >/dev/null 2>&1; then
- echo "::error::failed to evict cached pnpm-deps output: $outPath"
- exit 1
- fi
- if nix path-info "$outPath" >/dev/null 2>&1; then
- echo "::error::cached pnpm-deps output still present after eviction: $outPath"
- exit 1
- fi
- fi
- done < <(nix-store -q --outputs "$drv" 2>/dev/null || true)
- done < <(nix-store -qR "$topDrv" 2>/dev/null | grep "pnpm-deps-[a-z0-9-]*-v[0-9].*\.drv$" || true)
- fi
- fi
- rm -f "$entriesJson"
- - name: Force diagnostics failure (debug)
- if: ${{ github.event_name == 'workflow_dispatch' && (inputs.debug_force_nix_diagnostics_failure == true || inputs.debug_force_nix_diagnostics_failure == 'true') }}
- shell: bash
- run: |
- diag_dir="${NIX_STORE_DIAGNOSTICS_DIR:-${RUNNER_TEMP:-/tmp}/nix-store-diagnostics-missing}"
- mkdir -p "$diag_dir"
- cat > "$diag_dir/synthetic-signature.log" <<'EOF'
- Failed to convert config.cachix to JSON
- ... while evaluating the option `cachix.package`
- error: path '/nix/store/synthetic-invalid-path' is not valid
- EOF
- echo "::warning::Intentional failure for diagnostics validation (#272)"
- exit 1
- - name: 'Download previous artifact: nix-closure-measurements'
- shell: bash
- env:
- GH_TOKEN: ${{ github.token }}
- BASELINE_ARTIFACT_NAME: nix-closure-measurements
- BASELINE_OUTPUT_DIR: tmp/nix-closure-ci/baseline
- BASELINE_WORKFLOW_NAME: ${{ github.workflow }}
- BASELINE_BRANCH: ${{ github.base_ref || github.ref_name }}
- BASELINE_SEED_RUNS_JSON: '[]'
- BASELINE_MAX_RUNS: '20'
- BASELINE_MAX_CANDIDATE_RUNS: '60'
- BASELINE_REQUIRED_OBSERVATIONS_JSON: '[]'
- run: |
- set -euo pipefail
-
- mkdir -p "$BASELINE_OUTPUT_DIR"
-
- if command -v gh >/dev/null 2>&1; then
- GH_BIN="$(command -v gh)"
- else
- echo "::notice::gh is not on PATH; resolving GitHub CLI through Nix"
- if ! GH_BIN="$(nix build --no-link --print-out-paths nixpkgs#gh 2>/dev/null)/bin/gh"; then
- echo "::notice::unable to resolve GitHub CLI through Nix; skipping previous artifact download"
- exit 0
- fi
- fi
- echo "Using GitHub CLI: $GH_BIN"
-
- repo="${GITHUB_REPOSITORY:?GITHUB_REPOSITORY not set}"
- workflow="${BASELINE_WORKFLOW_NAME:-CI}"
- branch="${BASELINE_BRANCH:-${GITHUB_BASE_REF:-${GITHUB_REF_NAME:-main}}}"
- seed_runs_file="$BASELINE_OUTPUT_DIR/baseline-seed-runs.json"
- required_observations_file="$BASELINE_OUTPUT_DIR/baseline-required-observations.json"
- printf '%s' "${BASELINE_SEED_RUNS_JSON:-[]}" >"$seed_runs_file"
- printf '%s' "${BASELINE_REQUIRED_OBSERVATIONS_JSON:-[]}" >"$required_observations_file"
- if ! jq -e 'if type == "array" then all(.[]; type == "object" and (.runId | type == "string")) else false end' \
- "$seed_runs_file" >/dev/null; then
- echo "::error::BASELINE_SEED_RUNS_JSON must be an array of objects with string runId fields"
- exit 1
- fi
- if ! jq -e 'if type == "array" then all(.[]; type == "object" and (.id | type == "string") and (.minSources | type == "number")) else false end' \
- "$required_observations_file" >/dev/null; then
- echo "::error::BASELINE_REQUIRED_OBSERVATIONS_JSON must be an array of objects with string id and numeric minSources fields"
- exit 1
- fi
- seed_run_ids="$(jq -r '.[].runId' "$seed_runs_file")"
- required_observation_count="$(jq 'length' "$required_observations_file")"
- max_candidate_runs="${BASELINE_MAX_CANDIDATE_RUNS:-${BASELINE_MAX_RUNS:-5}}"
- if ! [[ "$max_candidate_runs" =~ ^[0-9]+$ ]] || [ "$max_candidate_runs" -lt 1 ]; then
- max_candidate_runs=1
- fi
-
- candidate_runs="$(
- "$GH_BIN" run list \
- --repo "$repo" \
- --workflow "$workflow" \
- --branch "$branch" \
- --event push \
- --status success \
- --json databaseId,headSha \
- --limit "$max_candidate_runs" \
- --jq '[.[] | select(.headSha != env.GITHUB_SHA) | .databaseId] | .[]'
- )"
-
- candidate_runs="$seed_run_ids
- $candidate_runs"
-
- max_runs="${BASELINE_MAX_RUNS:-5}"
- if ! [[ "$max_runs" =~ ^[0-9]+$ ]] || [ "$max_runs" -lt 1 ]; then
- max_runs=1
- fi
-
- write_baseline_observation_counts() {
- local measurement_index="$BASELINE_OUTPUT_DIR/baseline-measurement-files.txt"
- local counts_file="$BASELINE_OUTPUT_DIR/baseline-observation-counts.json"
- find "$BASELINE_OUTPUT_DIR" \
- -mindepth 2 \
- -maxdepth 2 \
- -name measurements.json \
- -type f \
- -print \
- | sort >"$measurement_index" || true
-
- if [ -s "$measurement_index" ]; then
- xargs -r jq -s \
- --slurpfile required "$required_observations_file" \
- '
- ([.[] | (.observations // [])[]? | select(.value | type == "number") | .id] | sort | group_by(.) | map({id: .[0], sources: length})) as $counts
- | ($required[0] // []) as $requiredRows
- | {
- counts: $counts,
- required: (
- $requiredRows
- | map(. as $requiredRow | ($counts | map(select(.id == $requiredRow.id)) | .[0].sources // 0) as $actual | $requiredRow + {sources:$actual, satisfied:($actual >= $requiredRow.minSources)})
- )
- }
- ' <"$measurement_index" >"$counts_file"
- else
- jq -n --slurpfile required "$required_observations_file" \
- '{counts: [], required: (($required[0] // []) | map(. + {sources:0, satisfied:false}))}' >"$counts_file"
- fi
- }
-
- baseline_requirements_satisfied() {
- if [ "$required_observation_count" -eq 0 ]; then
- return 1
- fi
- write_baseline_observation_counts
- jq -e '.required | all(.satisfied == true)' "$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" >/dev/null
- }
-
- run_id=""
- artifact_name=""
- artifact_id=""
- downloaded_runs_file="$BASELINE_OUTPUT_DIR/baseline-runs.jsonl"
- seen_runs_file="$BASELINE_OUTPUT_DIR/baseline-seen-runs.txt"
- : >"$downloaded_runs_file"
- : >"$seen_runs_file"
- for candidate_run in $candidate_runs; do
- if [ -z "$candidate_run" ]; then
- continue
- fi
- if grep -qxF "$candidate_run" "$seen_runs_file"; then
- continue
- fi
- downloaded_count="$(wc -l <"$downloaded_runs_file" | tr -d ' ')"
- if [ "$downloaded_count" -ge "$max_runs" ]; then
- if baseline_requirements_satisfied; then
- break
- fi
- echo "::notice::downloaded $downloaded_count baseline artifact(s), but required observation counts are not satisfied yet; continuing through bounded candidate history"
- fi
- if [ "$(wc -l <"$seen_runs_file" | tr -d ' ')" -ge "$max_candidate_runs" ]; then
- break
- fi
- printf '%s\n' "$candidate_run" >>"$seen_runs_file"
-
- artifact_json="$(
- "$GH_BIN" api "repos/$repo/actions/runs/$candidate_run/artifacts" \
- | jq --arg artifactName "$BASELINE_ARTIFACT_NAME" '.artifacts
- | map(select(.expired == false))
- | map(select(.name == $artifactName or (.name | startswith($artifactName + "-"))))
- | sort_by(.created_at // "")
- | reverse
- | .[0] // empty'
- )"
-
- if [ -n "$artifact_json" ]; then
- current_artifact_name="$(printf '%s' "$artifact_json" | jq -r '.name')"
- current_artifact_id="$(printf '%s' "$artifact_json" | jq -r '.id')"
- current_output_dir="$BASELINE_OUTPUT_DIR/run-$candidate_run"
- mkdir -p "$current_output_dir"
- if "$GH_BIN" run download "$candidate_run" \
- --repo "$repo" \
- --name "$current_artifact_name" \
- --dir "$current_output_dir"; then
- if [ -z "$run_id" ]; then
- run_id="$candidate_run"
- artifact_name="$current_artifact_name"
- artifact_id="$current_artifact_id"
- fi
- jq -cn \
- --arg runId "$candidate_run" \
- --arg artifactName "$current_artifact_name" \
- --arg artifactId "$current_artifact_id" \
- --arg path "run-$candidate_run" \
- '{runId:$runId, artifactName:$artifactName, artifactId:$artifactId, path:$path}' \
- >>"$downloaded_runs_file"
- else
- echo "::notice::failed to download baseline artifact $current_artifact_name from run $candidate_run"
- fi
- fi
- done
-
- write_baseline_observation_counts
-
- if [ -z "$run_id" ] || [ -z "$artifact_name" ]; then
- echo "::notice::no successful baseline run found for $repo workflow=$workflow branch=$branch"
- exit 0
- fi
-
- jq -n \
- --slurpfile runs "$downloaded_runs_file" \
- --slurpfile seedRuns "$seed_runs_file" \
- --slurpfile observationCounts "$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" \
- --argjson schemaVersion 1 \
- --arg repository "$repo" \
- --arg workflow "$workflow" \
- --arg branch "$branch" \
- --arg runId "$run_id" \
- --arg artifactName "$artifact_name" \
- --arg artifactId "$artifact_id" \
- '{
- schemaVersion: $schemaVersion,
- source: "github-actions-artifact",
- repository: $repository,
- workflow: $workflow,
- branch: $branch,
- runId: $runId,
- artifactName: $artifactName,
- artifactId: $artifactId,
- seedRuns: ($seedRuns[0] // []),
- runs: $runs,
- observationCounts: ($observationCounts[0] // null)
- }' >"$BASELINE_OUTPUT_DIR/baseline-provenance.json"
-
- echo "Downloaded $(wc -l <"$downloaded_runs_file" | tr -d ' ') baseline artifact(s), latest $artifact_name from run $run_id into $BASELINE_OUTPUT_DIR"
-
- - name: 'Measure Nix closure: genie'
- shell: bash
- env:
- ARTIFACT_DIR: tmp/nix-closure-ci/current/genie_package
- RUNNER_CLASS: '${{ runner.os }}-${{ runner.arch }}'
- run: |
- set -euo pipefail
-
- mkdir -p "$ARTIFACT_DIR"
- installable='.#genie'
- target_id='genie_package'
- target_name='genie'
- target_label='Genie package'
- target_group='packages'
- target_description='the packaged Genie CLI closure'
- artifact_file="$ARTIFACT_DIR/measurements.json"
- target_system='x86_64-linux'
-
- out_path="$(nix build --no-update-lock-file --no-link --print-out-paths "$installable")"
- path_info="$ARTIFACT_DIR/nix-closure-path-info.json"
- paths_file="$ARTIFACT_DIR/nix-closure-paths.json"
-
- nix path-info --recursive --json "$out_path" >"$path_info"
- jq 'to_entries | map({ path: .key, narSize: (.value.narSize // 0) })' "$path_info" >"$paths_file"
-
- jq -n \
- --slurpfile paths "$paths_file" \
- --argjson schemaVersion 1 \
- --arg generatedAt "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
- --arg repository "${GITHUB_REPOSITORY:-unknown}" \
- --arg branchKind "${GITHUB_EVENT_NAME:-unknown}" \
- --arg ref "${CI_MEASUREMENT_SUBJECT_REF:-${GITHUB_REF:-unknown}}" \
- --arg headSha "${CI_MEASUREMENT_SUBJECT_SHA:-${GITHUB_SHA:-unknown}}" \
- --arg baseSha "${GITHUB_BASE_SHA:-}" \
- --arg runnerName "${RUNNER_NAME:-unknown}" \
- --arg runnerOs "${RUNNER_OS:-unknown}" \
- --arg runnerArch "${RUNNER_ARCH:-unknown}" \
- --arg runnerClass "${RUNNER_CLASS:-unknown}" \
- --arg githubRunId "${GITHUB_RUN_ID:-unknown}" \
- --arg githubRunAttempt "${GITHUB_RUN_ATTEMPT:-unknown}" \
- --arg githubJob "${GITHUB_JOB:-unknown}" \
- --arg taskId "${CROSSTASK_TASK_ID:-}" \
- --arg taskAttemptId "${CROSSTASK_ATTEMPT_ID:-}" \
- --arg traceId "${TRACE_ID:-}" \
- --arg targetName "$target_name" \
- --arg targetId "$target_id" \
- --arg targetLabel "$target_label" \
- --arg targetGroup "$target_group" \
- --arg targetDescription "$target_description" \
- --arg targetSystem "$target_system" \
- --arg outPath "$out_path" \
- --argjson buckets '[{"name":"node","label":"Node / pnpm","pathRegex":"node_modules|npm-deps|pnpm"},{"name":"nix-sources","label":"Nix sources","pathRegex":"-source$"},{"name":"rust","label":"Rust","pathRegex":"cargo|rust|rustc"}]' \
- --argjson targetPath '["nix","closures","packages","genie"]' \
- --argjson gatePolicy '{}' \
- '
- ($paths[0] // []) as $closurePaths
- | ($closurePaths | map(.narSize) | add // 0) as $totalNarSize
- | ($closurePaths | length) as $pathCount
- | ($buckets | map(
- . as $bucket
- | {
- name: "nix.closure.bucket.nar_size",
- id: "nix.closure.bucket.nar_size",
- label: (($bucket.label // $bucket.name) + " closure size"),
- group: "nix closure buckets",
- path: ($targetPath + ["buckets", $bucket.name]),
- description: ("NAR size contributed by closure paths matching " + $bucket.pathRegex),
- measurementKind: "deterministic",
- unit: "bytes",
- value: (
- $closurePaths
- | map(select(.path | test($bucket.pathRegex)) | .narSize)
- | add // 0
- ),
- policy: $gatePolicy,
- dimensions: { bucket: $bucket.name }
- }
- )) as $bucketObservations
- | {
- schemaVersion: $schemaVersion,
- generatedAt: $generatedAt,
- producer: { name: "effect-utils-ci-measurement", version: 1 },
- subject: {
- repo: $repository,
- branchKind: (if $branchKind == "" then "unknown" else $branchKind end),
- ref: $ref,
- headSha: $headSha,
- baseSha: $baseSha
- },
- execution: {
- provider: (if ($githubRunId != "" and $githubRunId != "unknown") then "github-actions" else "local" end),
- workflow: "CI",
- job: $githubJob,
- runId: $githubRunId,
- runAttempt: $githubRunAttempt,
- taskId: $taskId,
- attemptId: $taskAttemptId,
- traceId: $traceId,
- runner: { name: $runnerName, os: $runnerOs, arch: $runnerArch, class: $runnerClass }
- },
- target: { kind: "nix-closure", id: $targetId, name: $targetName, label: $targetLabel, group: $targetGroup, path: $targetPath, system: $targetSystem },
- observations: ([
- {
- id: "nix.closure.nar_size",
- label: "Total closure size",
- group: "nix closure",
- path: ($targetPath + ["total", "nar-size"]),
- description: ("Total NAR size for all paths in " + $targetDescription),
- name: "nix.closure.nar_size",
- measurementKind: "deterministic",
- unit: "bytes",
- value: $totalNarSize,
- policy: $gatePolicy,
- dimensions: { bucket: "total" }
- },
- {
- id: "nix.closure.path_count",
- label: "Total closure path count",
- group: "nix closure",
- path: ($targetPath + ["total", "path-count"]),
- description: ("Number of store paths in " + $targetDescription),
- name: "nix.closure.path_count",
- measurementKind: "deterministic",
- unit: "count",
- value: $pathCount,
- policy: $gatePolicy,
- dimensions: { bucket: "total" }
- }
- ] + $bucketObservations),
- artifacts: [
- { name: "nix-closure-path-info", path: "nix-closure-path-info.json", contentType: "application/json" },
- { name: "nix-closure-paths", path: "nix-closure-paths.json", contentType: "application/json" }
- ],
- details: {
- outPath: $outPath,
- topPaths: ($closurePaths | sort_by(.narSize) | reverse | .[:30])
- }
- }
- ' >"$artifact_file"
-
- cat "$artifact_file"
-
- - name: 'Measure Nix closure: megarepo'
- shell: bash
- env:
- ARTIFACT_DIR: tmp/nix-closure-ci/current/megarepo_package
- RUNNER_CLASS: '${{ runner.os }}-${{ runner.arch }}'
- run: |
- set -euo pipefail
-
- mkdir -p "$ARTIFACT_DIR"
- installable='.#megarepo'
- target_id='megarepo_package'
- target_name='megarepo'
- target_label='Megarepo package'
- target_group='packages'
- target_description='the packaged megarepo CLI closure'
- artifact_file="$ARTIFACT_DIR/measurements.json"
- target_system='x86_64-linux'
-
- out_path="$(nix build --no-update-lock-file --no-link --print-out-paths "$installable")"
- path_info="$ARTIFACT_DIR/nix-closure-path-info.json"
- paths_file="$ARTIFACT_DIR/nix-closure-paths.json"
-
- nix path-info --recursive --json "$out_path" >"$path_info"
- jq 'to_entries | map({ path: .key, narSize: (.value.narSize // 0) })' "$path_info" >"$paths_file"
-
- jq -n \
- --slurpfile paths "$paths_file" \
- --argjson schemaVersion 1 \
- --arg generatedAt "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
- --arg repository "${GITHUB_REPOSITORY:-unknown}" \
- --arg branchKind "${GITHUB_EVENT_NAME:-unknown}" \
- --arg ref "${CI_MEASUREMENT_SUBJECT_REF:-${GITHUB_REF:-unknown}}" \
- --arg headSha "${CI_MEASUREMENT_SUBJECT_SHA:-${GITHUB_SHA:-unknown}}" \
- --arg baseSha "${GITHUB_BASE_SHA:-}" \
- --arg runnerName "${RUNNER_NAME:-unknown}" \
- --arg runnerOs "${RUNNER_OS:-unknown}" \
- --arg runnerArch "${RUNNER_ARCH:-unknown}" \
- --arg runnerClass "${RUNNER_CLASS:-unknown}" \
- --arg githubRunId "${GITHUB_RUN_ID:-unknown}" \
- --arg githubRunAttempt "${GITHUB_RUN_ATTEMPT:-unknown}" \
- --arg githubJob "${GITHUB_JOB:-unknown}" \
- --arg taskId "${CROSSTASK_TASK_ID:-}" \
- --arg taskAttemptId "${CROSSTASK_ATTEMPT_ID:-}" \
- --arg traceId "${TRACE_ID:-}" \
- --arg targetName "$target_name" \
- --arg targetId "$target_id" \
- --arg targetLabel "$target_label" \
- --arg targetGroup "$target_group" \
- --arg targetDescription "$target_description" \
- --arg targetSystem "$target_system" \
- --arg outPath "$out_path" \
- --argjson buckets '[{"name":"node","label":"Node / pnpm","pathRegex":"node_modules|npm-deps|pnpm"},{"name":"nix-sources","label":"Nix sources","pathRegex":"-source$"},{"name":"rust","label":"Rust","pathRegex":"cargo|rust|rustc"}]' \
- --argjson targetPath '["nix","closures","packages","megarepo"]' \
- --argjson gatePolicy '{}' \
- '
- ($paths[0] // []) as $closurePaths
- | ($closurePaths | map(.narSize) | add // 0) as $totalNarSize
- | ($closurePaths | length) as $pathCount
- | ($buckets | map(
- . as $bucket
- | {
- name: "nix.closure.bucket.nar_size",
- id: "nix.closure.bucket.nar_size",
- label: (($bucket.label // $bucket.name) + " closure size"),
- group: "nix closure buckets",
- path: ($targetPath + ["buckets", $bucket.name]),
- description: ("NAR size contributed by closure paths matching " + $bucket.pathRegex),
- measurementKind: "deterministic",
- unit: "bytes",
- value: (
- $closurePaths
- | map(select(.path | test($bucket.pathRegex)) | .narSize)
- | add // 0
- ),
- policy: $gatePolicy,
- dimensions: { bucket: $bucket.name }
- }
- )) as $bucketObservations
- | {
- schemaVersion: $schemaVersion,
- generatedAt: $generatedAt,
- producer: { name: "effect-utils-ci-measurement", version: 1 },
- subject: {
- repo: $repository,
- branchKind: (if $branchKind == "" then "unknown" else $branchKind end),
- ref: $ref,
- headSha: $headSha,
- baseSha: $baseSha
- },
- execution: {
- provider: (if ($githubRunId != "" and $githubRunId != "unknown") then "github-actions" else "local" end),
- workflow: "CI",
- job: $githubJob,
- runId: $githubRunId,
- runAttempt: $githubRunAttempt,
- taskId: $taskId,
- attemptId: $taskAttemptId,
- traceId: $traceId,
- runner: { name: $runnerName, os: $runnerOs, arch: $runnerArch, class: $runnerClass }
- },
- target: { kind: "nix-closure", id: $targetId, name: $targetName, label: $targetLabel, group: $targetGroup, path: $targetPath, system: $targetSystem },
- observations: ([
- {
- id: "nix.closure.nar_size",
- label: "Total closure size",
- group: "nix closure",
- path: ($targetPath + ["total", "nar-size"]),
- description: ("Total NAR size for all paths in " + $targetDescription),
- name: "nix.closure.nar_size",
- measurementKind: "deterministic",
- unit: "bytes",
- value: $totalNarSize,
- policy: $gatePolicy,
- dimensions: { bucket: "total" }
- },
- {
- id: "nix.closure.path_count",
- label: "Total closure path count",
- group: "nix closure",
- path: ($targetPath + ["total", "path-count"]),
- description: ("Number of store paths in " + $targetDescription),
- name: "nix.closure.path_count",
- measurementKind: "deterministic",
- unit: "count",
- value: $pathCount,
- policy: $gatePolicy,
- dimensions: { bucket: "total" }
- }
- ] + $bucketObservations),
- artifacts: [
- { name: "nix-closure-path-info", path: "nix-closure-path-info.json", contentType: "application/json" },
- { name: "nix-closure-paths", path: "nix-closure-paths.json", contentType: "application/json" }
- ],
- details: {
- outPath: $outPath,
- topPaths: ($closurePaths | sort_by(.narSize) | reverse | .[:30])
- }
- }
- ' >"$artifact_file"
-
- cat "$artifact_file"
-
- - name: 'Measure Nix closure: oxlint-npm'
- shell: bash
- env:
- ARTIFACT_DIR: tmp/nix-closure-ci/current/oxlint_npm_package
- RUNNER_CLASS: '${{ runner.os }}-${{ runner.arch }}'
- run: |
- set -euo pipefail
-
- mkdir -p "$ARTIFACT_DIR"
- installable='.#oxlint-npm'
- target_id='oxlint_npm_package'
- target_name='oxlint-npm'
- target_label='oxlint npm package'
- target_group='packages'
- target_description='the packaged oxlint npm compatibility wrapper closure'
- artifact_file="$ARTIFACT_DIR/measurements.json"
- target_system='x86_64-linux'
-
- out_path="$(nix build --no-update-lock-file --no-link --print-out-paths "$installable")"
- path_info="$ARTIFACT_DIR/nix-closure-path-info.json"
- paths_file="$ARTIFACT_DIR/nix-closure-paths.json"
-
- nix path-info --recursive --json "$out_path" >"$path_info"
- jq 'to_entries | map({ path: .key, narSize: (.value.narSize // 0) })' "$path_info" >"$paths_file"
-
- jq -n \
- --slurpfile paths "$paths_file" \
- --argjson schemaVersion 1 \
- --arg generatedAt "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
- --arg repository "${GITHUB_REPOSITORY:-unknown}" \
- --arg branchKind "${GITHUB_EVENT_NAME:-unknown}" \
- --arg ref "${CI_MEASUREMENT_SUBJECT_REF:-${GITHUB_REF:-unknown}}" \
- --arg headSha "${CI_MEASUREMENT_SUBJECT_SHA:-${GITHUB_SHA:-unknown}}" \
- --arg baseSha "${GITHUB_BASE_SHA:-}" \
- --arg runnerName "${RUNNER_NAME:-unknown}" \
- --arg runnerOs "${RUNNER_OS:-unknown}" \
- --arg runnerArch "${RUNNER_ARCH:-unknown}" \
- --arg runnerClass "${RUNNER_CLASS:-unknown}" \
- --arg githubRunId "${GITHUB_RUN_ID:-unknown}" \
- --arg githubRunAttempt "${GITHUB_RUN_ATTEMPT:-unknown}" \
- --arg githubJob "${GITHUB_JOB:-unknown}" \
- --arg taskId "${CROSSTASK_TASK_ID:-}" \
- --arg taskAttemptId "${CROSSTASK_ATTEMPT_ID:-}" \
- --arg traceId "${TRACE_ID:-}" \
- --arg targetName "$target_name" \
- --arg targetId "$target_id" \
- --arg targetLabel "$target_label" \
- --arg targetGroup "$target_group" \
- --arg targetDescription "$target_description" \
- --arg targetSystem "$target_system" \
- --arg outPath "$out_path" \
- --argjson buckets '[{"name":"node","label":"Node / pnpm","pathRegex":"node_modules|npm-deps|pnpm"},{"name":"nix-sources","label":"Nix sources","pathRegex":"-source$"},{"name":"rust","label":"Rust","pathRegex":"cargo|rust|rustc"}]' \
- --argjson targetPath '["nix","closures","packages","oxlint-npm"]' \
- --argjson gatePolicy '{}' \
- '
- ($paths[0] // []) as $closurePaths
- | ($closurePaths | map(.narSize) | add // 0) as $totalNarSize
- | ($closurePaths | length) as $pathCount
- | ($buckets | map(
- . as $bucket
- | {
- name: "nix.closure.bucket.nar_size",
- id: "nix.closure.bucket.nar_size",
- label: (($bucket.label // $bucket.name) + " closure size"),
- group: "nix closure buckets",
- path: ($targetPath + ["buckets", $bucket.name]),
- description: ("NAR size contributed by closure paths matching " + $bucket.pathRegex),
- measurementKind: "deterministic",
- unit: "bytes",
- value: (
- $closurePaths
- | map(select(.path | test($bucket.pathRegex)) | .narSize)
- | add // 0
- ),
- policy: $gatePolicy,
- dimensions: { bucket: $bucket.name }
- }
- )) as $bucketObservations
- | {
- schemaVersion: $schemaVersion,
- generatedAt: $generatedAt,
- producer: { name: "effect-utils-ci-measurement", version: 1 },
- subject: {
- repo: $repository,
- branchKind: (if $branchKind == "" then "unknown" else $branchKind end),
- ref: $ref,
- headSha: $headSha,
- baseSha: $baseSha
- },
- execution: {
- provider: (if ($githubRunId != "" and $githubRunId != "unknown") then "github-actions" else "local" end),
- workflow: "CI",
- job: $githubJob,
- runId: $githubRunId,
- runAttempt: $githubRunAttempt,
- taskId: $taskId,
- attemptId: $taskAttemptId,
- traceId: $traceId,
- runner: { name: $runnerName, os: $runnerOs, arch: $runnerArch, class: $runnerClass }
- },
- target: { kind: "nix-closure", id: $targetId, name: $targetName, label: $targetLabel, group: $targetGroup, path: $targetPath, system: $targetSystem },
- observations: ([
- {
- id: "nix.closure.nar_size",
- label: "Total closure size",
- group: "nix closure",
- path: ($targetPath + ["total", "nar-size"]),
- description: ("Total NAR size for all paths in " + $targetDescription),
- name: "nix.closure.nar_size",
- measurementKind: "deterministic",
- unit: "bytes",
- value: $totalNarSize,
- policy: $gatePolicy,
- dimensions: { bucket: "total" }
- },
- {
- id: "nix.closure.path_count",
- label: "Total closure path count",
- group: "nix closure",
- path: ($targetPath + ["total", "path-count"]),
- description: ("Number of store paths in " + $targetDescription),
- name: "nix.closure.path_count",
- measurementKind: "deterministic",
- unit: "count",
- value: $pathCount,
- policy: $gatePolicy,
- dimensions: { bucket: "total" }
- }
- ] + $bucketObservations),
- artifacts: [
- { name: "nix-closure-path-info", path: "nix-closure-path-info.json", contentType: "application/json" },
- { name: "nix-closure-paths", path: "nix-closure-paths.json", contentType: "application/json" }
- ],
- details: {
- outPath: $outPath,
- topPaths: ($closurePaths | sort_by(.narSize) | reverse | .[:30])
- }
- }
- ' >"$artifact_file"
-
- cat "$artifact_file"
-
- - name: Compare CI measurements with baseline
- shell: bash
- env:
- CI_MEASUREMENT_CURRENT_DIR: tmp/nix-closure-ci/current
- CI_MEASUREMENT_BASELINE_DIR: tmp/nix-closure-ci/baseline
- CI_MEASUREMENT_COMPARISON_FILE: tmp/nix-closure-ci/measurement-comparison.json
- CI_MEASUREMENT_REGRESSION_MODE: warn
- CI_MEASUREMENT_PR_COMMENT_ENABLED: 'false'
- CI_MEASUREMENT_PR_COMMENT_TITLE: Nix Closure Measurements
- CI_MEASUREMENT_PR_COMMENT_MAX_ROWS: '8'
- CI_MEASUREMENT_PR_COMMENT_MAX_HISTORY: '20'
- CI_MEASUREMENT_PR_COMMENT_ASSET_BRANCH: ci-measurement-assets
- run: |
- set -euo pipefail
-
- export PATH="/run/current-system/sw/bin:/usr/bin:/bin:$PATH"
-
- current_dir="${CI_MEASUREMENT_CURRENT_DIR:?CI_MEASUREMENT_CURRENT_DIR not set}"
- baseline_dir="${CI_MEASUREMENT_BASELINE_DIR:?CI_MEASUREMENT_BASELINE_DIR not set}"
- comparison_file="${CI_MEASUREMENT_COMPARISON_FILE:?CI_MEASUREMENT_COMPARISON_FILE not set}"
- mode="${CI_MEASUREMENT_REGRESSION_MODE:-warn}"
- mkdir -p "$(dirname "$comparison_file")"
-
- if [ "$mode" = "off" ]; then
- jq -n --argjson schemaVersion 1 --arg status skipped --arg mode "$mode" \
- '{schemaVersion:$schemaVersion,status:$status,mode:$mode,comparisons:{}}' \
- >"$comparison_file"
- exit 0
- fi
-
- current_index="$(mktemp)"
- baseline_index="$(mktemp)"
- find "$current_dir" -name baseline -type d -prune -o -name measurements.json -type f -print | sort >"$current_index" || true
- {
- find "$baseline_dir" -name baseline -type d ! -path "$baseline_dir" -prune -o -name measurements.json -type f -print
- } | sort -u >"$baseline_index" || true
-
- if [ ! -s "$current_index" ]; then
- echo "::error::no current measurements.json files found under $current_dir"
- exit 1
- fi
-
- current_json="$comparison_file.current.json"
- baseline_json="$comparison_file.baseline.json"
- xargs -r jq -s '.' <"$current_index" >"$current_json"
- if [ -s "$baseline_index" ]; then
- xargs -r jq -s '.' <"$baseline_index" >"$baseline_json"
- else
- printf '[]\n' >"$baseline_json"
- fi
-
- jq -n \
- --slurpfile current "$current_json" \
- --slurpfile baseline "$baseline_json" \
- --argjson schemaVersion 1 \
- --arg mode "$mode" \
- --arg currentDir "$current_dir" \
- --arg baselineDir "$baseline_dir" \
- '
- def identity_dimensions:
- (.dimensions // {})
- | to_entries
- | map(select(.key as $key | ["devenvRev", "otelServiceName", "status", "probeLabel", "sampleCount", "measuredSampleCount"] | index($key) | not))
- | sort_by(.key)
- | map("\(.key)=\(.value|tostring)")
- | join(",");
-
- def observation_key($doc):
- [
- ($doc.target.kind // "unknown"),
- ($doc.target.id // $doc.target.name // "unknown"),
- ($doc.target.system // "unknown"),
- (.id // .name // "unknown"),
- (.unit // "unknown"),
- identity_dimensions
- ] | join("|");
-
- def median:
- sort as $sorted
- | ($sorted | length) as $count
- | if $count == 0 then null
- elif ($count % 2) == 1 then $sorted[($count / 2 | floor)]
- else (($sorted[($count / 2 - 1)] + $sorted[($count / 2)]) / 2)
- end;
-
- def percentile($p):
- sort as $sorted
- | ($sorted | length) as $count
- | if $count == 0 then null
- else $sorted[(($p * ($count - 1)) | floor)]
- end;
-
- def abs_value: if . < 0 then -. else . end;
-
- def observations_by_key($docs):
- reduce $docs[]? as $doc
- ({};
- reduce (($doc.observations // [])[]? | select(.value | type == "number")) as $obs
- (.;
- ($obs | observation_key($doc)) as $key
- | .[$key] = ((.[$key] // []) + [{
- target: $doc.target,
- observation: $obs,
- generatedAt: $doc.generatedAt
- }])
- )
- );
-
- def observation_stats($items):
- ($items | map(.observation.value)) as $values
- | ($items | map(.observation.comparison.baseline // empty)) as $pairedBaselineValues
- | ($items | map(.observation.statistics.pairedDeltaMedian // empty)) as $pairedDeltaMedianValues
- | ($items | map(.observation.statistics.pairedDeltaP25 // empty)) as $pairedDeltaP25Values
- | ($items | map(.observation.statistics.pairedDeltaP75 // empty)) as $pairedDeltaP75Values
- | ($items | map(.observation.statistics.pairedDeltaMad // empty)) as $pairedDeltaMadValues
- | ($items | map(.observation.statistics.pairedDeltaSamples // []) | add // []) as $pairedDeltaSampleValues
- | ($items | map(.observation.statistics.measuredSampleCount // .observation.statistics.sampleCount // 1) | add // ($items | length)) as $sampleCount
- | ($values | median) as $median
- | {
- target: ($items[0].target // {}),
- observation: ($items[-1].observation // {}),
- measurementKind: ($items[-1].observation.measurementKind // null),
- value: $median,
- min: ($values | min),
- max: ($values | max),
- p25: ($values | percentile(0.25)),
- p75: ($values | percentile(0.75)),
- p95: ($values | percentile(0.95)),
- mad: ($values | map(. - $median | if . < 0 then -. else . end) | median),
- sourceCount: ($items | length),
- sampleCount: $sampleCount,
- pairedSampleCount: ($items | map(.observation.statistics.pairedSampleCount // .observation.comparison.pairedSampleCount // 0) | add // 0),
- pairedBaselineValue: (if ($pairedBaselineValues | length) == 0 then null else ($pairedBaselineValues | median) end),
- pairedDeltaMedianValue: (if ($pairedDeltaMedianValues | length) == 0 then null else ($pairedDeltaMedianValues | median) end),
- pairedDeltaP25Value: (if ($pairedDeltaP25Values | length) == 0 then null else ($pairedDeltaP25Values | median) end),
- pairedDeltaP75Value: (if ($pairedDeltaP75Values | length) == 0 then null else ($pairedDeltaP75Values | median) end),
- pairedDeltaMadValue: (if ($pairedDeltaMadValues | length) == 0 then null else ($pairedDeltaMadValues | median) end),
- pairedDeltaSampleValues: $pairedDeltaSampleValues,
- generatedAt: ($items[-1].generatedAt // null)
- };
-
- def budget($metric; $unit):
- if $metric == "nix.closure.nar_size" then
- {warnRatio:1.05, failRatio:1.10, warnAbs:52428800, failAbs:209715200, statisticalToleranceRatio:0.02, statisticalToleranceAbs:10485760}
- elif $metric == "nix.closure.bucket.nar_size" then
- {warnRatio:1.10, failRatio:1.20, warnAbs:52428800, failAbs:209715200, statisticalToleranceRatio:0.05, statisticalToleranceAbs:10485760}
- elif $metric == "nix.closure.path_count" then
- {warnRatio:1.05, failRatio:1.10, warnAbs:100, failAbs:500, statisticalToleranceRatio:0.02, statisticalToleranceAbs:10}
- elif $unit == "seconds" then
- {warnRatio:1.10, failRatio:1.20, warnAbs:0.25, failAbs:1, statisticalToleranceRatio:0.10, statisticalToleranceAbs:0.25}
- else
- {warnRatio:1.25, failRatio:1.50, warnAbs:1, failAbs:3, statisticalToleranceRatio:0.10, statisticalToleranceAbs:1}
- end;
-
- def noise_floor($metric; $unit):
- if $metric == "nix.closure.nar_size" or $metric == "nix.closure.bucket.nar_size" then 10485760
- elif $metric == "nix.closure.path_count" then 10
- elif $unit == "seconds" then 0.1
- else 0
- end;
- def default_policy($metric; $unit):
- budget($metric; $unit) as $b
- | noise_floor($metric; $unit) as $noise
- | $b + {
- enabled:true,
- comparisonMode:(if $metric == "nix.closure.nar_size" or $metric == "nix.closure.bucket.nar_size" or $metric == "nix.closure.path_count" or $unit != "seconds" then "budget" else "historical" end),
- minBaselineSources:(if $metric == "nix.closure.nar_size" or $metric == "nix.closure.bucket.nar_size" or $metric == "nix.closure.path_count" or $unit != "seconds" then 1 else 10 end),
- minCurrentSamples:(if $unit == "seconds" then 3 else 1 end),
- minPairedSamples:(if $unit == "seconds" then 5 else 0 end),
- noiseFloor:$noise
- };
- def observation_policy($obs):
- default_policy($obs.name // "unknown"; $obs.unit // "unknown") + ($obs.policy // {});
- def policy_enabled($policy):
- if ($policy | has("enabled")) then $policy.enabled else true end;
-
- def classify($metric; $unit; $measurementKind; $policy; $current; $currentP25; $currentP75; $currentMad; $baseline; $baselineMin; $baselineMax; $baselineP25; $baselineP75; $baselineP95; $baselineMad; $currentSamples; $baselineSources; $pairedSamples; $pairedDeltaMedian; $pairedDeltaP25; $pairedDeltaP75; $pairedDeltaMad; $pairedDeltaValues):
- $policy as $b
- | ($policy.comparisonMode // (if $measurementKind == "deterministic" or $unit != "seconds" then "budget" elif $measurementKind == "diagnostic" then "diagnostic" else "historical" end)) as $comparisonMode
- | ($policy.noiseFloor // noise_floor($metric; $unit)) as $noise
- | ($current - $baseline) as $delta
- | (if $comparisonMode == "paired" and $pairedDeltaMedian != null then $pairedDeltaMedian else $delta end) as $evidenceDelta
- | (($policy.pairedEvidenceQuantile // 0.25) | tonumber) as $pairedEvidenceQuantile
- | (if $baseline > 0 then ($current / $baseline) else null end) as $ratio
- | (($baselineP75 // $baseline) - ($baselineP25 // $baseline)) as $iqr
- | (($currentP75 // $current) - ($currentP25 // $current)) as $currentIqr
- | (($pairedDeltaP75 // $evidenceDelta) - ($pairedDeltaP25 // $evidenceDelta)) as $pairedDeltaIqr
- | ([
- $noise,
- (($policy.statisticalToleranceAbs // 0) | tonumber),
- (if $baseline > 0 then ($baseline * (($policy.statisticalToleranceRatio // 0) | tonumber)) else 0 end),
- (($baselineMad // 0) * 3),
- (($iqr // 0) * 1.5)
- ] | max) as $robustTolerance
- | (if $currentSamples > 1 then ([
- $noise,
- (($policy.statisticalToleranceAbs // 0) | tonumber),
- (if $current > 0 then ($current * (($policy.statisticalToleranceRatio // 0) | tonumber)) else 0 end),
- (($currentMad // 0) * 3),
- (($currentIqr // 0) * 1.5)
- ] | max) else 0 end) as $currentRobustTolerance
- | ([
- $noise,
- (($policy.statisticalToleranceAbs // 0) | tonumber),
- (if $baseline > 0 then ($baseline * (($policy.statisticalToleranceRatio // 0) | tonumber)) else 0 end),
- (($pairedDeltaMad // 0) * 3),
- (($pairedDeltaIqr // 0) * 1.5)
- ] | max) as $pairedDeltaTolerance
- | ($baseline + $robustTolerance) as $robustUpper
- | ($baseline - $robustTolerance) as $robustLower
- | ($current + $currentRobustTolerance) as $currentRobustUpper
- | ($current - $currentRobustTolerance) as $currentRobustLower
- | (if $comparisonMode == "paired" and ($pairedDeltaValues | length) > 0 then ($pairedDeltaValues | percentile($pairedEvidenceQuantile)) else ($evidenceDelta - $pairedDeltaTolerance) end) as $evidenceDeltaLower
- | (if $comparisonMode == "paired" and ($pairedDeltaValues | length) > 0 then ($pairedDeltaValues | percentile(1 - $pairedEvidenceQuantile)) else ($evidenceDelta + $pairedDeltaTolerance) end) as $evidenceDeltaUpper
- | ([($b.warnAbs // 0), (if $baseline > 0 then ($baseline * (($b.warnRatio // 1) - 1)) else 0 end), $noise, 0.000000001] | max) as $warnBudget
- | ([($b.failAbs // 0), (if $baseline > 0 then ($baseline * (($b.failRatio // 1) - 1)) else 0 end), $noise, 0.000000001] | max) as $failBudget
- | ($comparisonMode != "paired") as $needsHistoricalBaselineCount
- | (
- ($current >= $robustLower and $current <= $robustUpper)
- or ($currentRobustTolerance > 0 and $currentRobustLower <= $robustUpper and $currentRobustUpper >= $robustLower)
- ) as $withinRobustBand
- | ($comparisonMode == "historical" and $measurementKind != "deterministic") as $canUseRobustBandSuppression
- | (
- $baselineMin != null
- and $baselineMax != null
- and $current >= $baselineMin
- and $current <= $baselineMax
- ) as $withinBaselineRange
- | (
- if $baseline <= 0 then "unknown"
- elif $comparisonMode == "paired" and $evidenceDeltaLower > $failBudget then "fail"
- elif $comparisonMode == "paired" and $evidenceDeltaLower > $warnBudget then "warn"
- elif $comparisonMode == "paired" then "pass"
- elif ($delta > $b.failAbs and $current > ($baseline * $b.failRatio)) then "fail"
- elif ($delta > $b.warnAbs and $current > ($baseline * $b.warnRatio)) then "warn"
- else "pass"
- end
- ) as $thresholdStatus
- | (
- policy_enabled($policy) == true
- and $baseline > 0
- and (if $needsHistoricalBaselineCount then $baselineSources >= ($policy.minBaselineSources // 1) else true end)
- and $currentSamples >= ($policy.minCurrentSamples // 1)
- and (if $comparisonMode == "paired" then $pairedSamples >= ($policy.minPairedSamples // 1) else true end)
- and (if $comparisonMode == "paired" then $pairedDeltaMedian != null else true end)
- ) as $gateable
- | (
- if (policy_enabled($policy) != true) then "disabled"
- elif $baseline <= 0 then "missing_baseline"
- elif $needsHistoricalBaselineCount and $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
- elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count"
- elif $comparisonMode == "paired" and $pairedSamples < ($policy.minPairedSamples // 1) then "low_paired_sample_count"
- elif $comparisonMode == "paired" and $pairedDeltaMedian == null then "missing_paired_delta"
- else "eligible"
- end
- ) as $gateReason
- | (
- if $baseline <= 0 then "unknown"
- elif (policy_enabled($policy) != true) then "diagnostic"
- elif ($delta | abs_value) <= $noise then "noise_floor"
- elif $needsHistoricalBaselineCount and $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
- elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count"
- elif $comparisonMode == "paired" and $pairedSamples < ($policy.minPairedSamples // 1) then "low_paired_sample_count"
- elif $comparisonMode == "paired" and $pairedDeltaMedian == null then "missing_paired_delta"
- elif $comparisonMode == "paired" and $thresholdStatus == "pass" and $evidenceDelta > $warnBudget then "paired_uncertain"
- elif ($canUseRobustBandSuppression and $thresholdStatus != "pass" and $withinRobustBand) then "within_robust_band"
- elif $thresholdStatus == "pass" then "within_budget"
- else "threshold_exceeded"
- end
- ) as $confidence
- | (
- if ($gateable and $confidence == "threshold_exceeded") then $thresholdStatus
- elif $thresholdStatus == "unknown" then "unknown"
- else "pass"
- end
- ) as $status
- | (
- if $baseline <= 0 then "unknown"
- elif $comparisonMode == "paired" and ($evidenceDelta | abs_value) <= $noise then "unchanged"
- elif $comparisonMode == "paired" and $evidenceDeltaLower <= 0 and $evidenceDeltaUpper >= 0 then "unchanged"
- elif $comparisonMode == "paired" and $evidenceDelta < 0 then "improved"
- elif $comparisonMode == "paired" then "regressed"
- elif ($delta | abs_value) <= $noise then "unchanged"
- elif $canUseRobustBandSuppression and $withinRobustBand then "unchanged"
- elif $delta < 0 then "improved"
- else "regressed"
- end
- ) as $direction
- | (
- if $baseline <= 0 then null
- elif (policy_enabled($policy) != true) then null
- elif $comparisonMode == "paired" and ($evidenceDeltaLower <= 0 and $evidenceDeltaUpper >= 0) then 0
- elif $comparisonMode == "paired" and ($evidenceDelta | abs_value) <= $noise then 0
- elif $comparisonMode == "paired" and $evidenceDelta > 0 then ([0, $evidenceDeltaLower] | max) / $warnBudget
- elif $comparisonMode == "paired" then -(([0, (-$evidenceDeltaUpper)] | max) / $warnBudget)
- elif $canUseRobustBandSuppression and $withinRobustBand then 0
- elif ($delta | abs_value) <= $noise then 0
- elif ($confidence == "threshold_exceeded" and $delta > 0) then ([0, ($currentRobustLower - $robustUpper), $delta] | max) / $warnBudget
- elif ($confidence == "threshold_exceeded" and $delta < 0) then -(([0, ($robustLower - $currentRobustUpper), (-$delta)] | max) / $warnBudget)
- elif $delta > 0 then ([0, ($currentRobustLower - $robustUpper)] | max) / $warnBudget
- else -(([0, ($robustLower - $currentRobustUpper)] | max) / $warnBudget)
- end
- ) as $semanticImpactScore
- | (
- if (policy_enabled($policy) != true) then "diagnostic"
- elif $semanticImpactScore == null then "unknown"
- elif $semanticImpactScore == 0 then "neutral"
- elif $semanticImpactScore >= ($failBudget / $warnBudget) then "fail_boundary"
- elif $semanticImpactScore >= 1 then "warn_boundary"
- elif $semanticImpactScore > 0 then "below_warn_boundary"
- else "improvement"
- end
- ) as $semanticImpactKind
- | {status:$status,current:$current,baseline:$baseline,delta:$delta,ratio:$ratio,budget:$b,gatePolicy:$policy,comparisonMode:$comparisonMode,gateable:$gateable,gateReason:$gateReason,confidence:$confidence,direction:$direction,semanticImpactScore:$semanticImpactScore,semanticImpactKind:$semanticImpactKind,semanticWarnBudget:$warnBudget,semanticFailBudget:$failBudget,baselineRobustLower:$robustLower,baselineRobustUpper:$robustUpper,baselineRobustTolerance:$robustTolerance,currentRobustLower:$currentRobustLower,currentRobustUpper:$currentRobustUpper,currentRobustTolerance:$currentRobustTolerance,withinBaselineRange:$withinBaselineRange,pairedSamples:$pairedSamples,evidenceDelta:$evidenceDelta,evidenceDeltaLower:$evidenceDeltaLower,evidenceDeltaUpper:$evidenceDeltaUpper,evidenceDeltaTolerance:$pairedDeltaTolerance,pairedEvidenceQuantile:$pairedEvidenceQuantile,pairedEvidenceProtocol:(if $comparisonMode == "paired" and ($pairedDeltaValues | length) > 0 then "paired-delta-quantile-v1" elif $comparisonMode == "paired" then "paired-summary-robust-band-v1" else null end)};
-
- (observations_by_key($current[0]) | with_entries(.value = observation_stats(.value))) as $currentObs
- | (observations_by_key($baseline[0]) | with_entries(.value = observation_stats(.value))) as $baselineObs
- | (
- $currentObs
- | to_entries
- | map(
- .key as $key
- | .value as $currentValue
- | ($baselineObs[$key] // null) as $baselineValue
- | ($currentValue.observation | observation_policy(.)) as $policy
- | ($policy.comparisonMode // (if ($currentValue.observation.measurementKind // $currentValue.measurementKind) == "deterministic" or ($currentValue.observation.unit // "") != "seconds" then "budget" elif ($currentValue.observation.measurementKind // $currentValue.measurementKind) == "diagnostic" then "diagnostic" else "historical" end)) as $comparisonMode
- | ($currentValue.pairedBaselineValue // null) as $pairedBaselineValue
- | (if $comparisonMode == "paired" and $pairedBaselineValue != null then {
- value: $pairedBaselineValue,
- min: $pairedBaselineValue,
- max: $pairedBaselineValue,
- p25: $pairedBaselineValue,
- p75: $pairedBaselineValue,
- p95: $pairedBaselineValue,
- mad: 0,
- sourceCount: $currentValue.pairedSampleCount
- } else $baselineValue end) as $effectiveBaselineValue
- | {
- key: $key,
- value: (
- if $effectiveBaselineValue == null then
- {
- status: "missing_baseline",
- target: $currentValue.target,
- observation: $currentValue.observation,
- current: $currentValue.value,
- currentSamples: $currentValue.sampleCount,
- baselineSources: 0,
- gatePolicy: $policy,
- comparisonMode: $comparisonMode,
- gateable: false,
- gateReason: "missing_baseline",
- confidence: "missing_baseline",
- direction: "unknown"
- }
- else
- classify(
- $currentValue.observation.name;
- $currentValue.observation.unit;
- ($currentValue.observation.measurementKind // $currentValue.measurementKind);
- $policy;
- $currentValue.value;
- $currentValue.p25;
- $currentValue.p75;
- $currentValue.mad;
- $effectiveBaselineValue.value;
- $effectiveBaselineValue.min;
- $effectiveBaselineValue.max;
- $effectiveBaselineValue.p25;
- $effectiveBaselineValue.p75;
- $effectiveBaselineValue.p95;
- $effectiveBaselineValue.mad;
- $currentValue.sampleCount;
- $effectiveBaselineValue.sourceCount;
- $currentValue.pairedSampleCount;
- $currentValue.pairedDeltaMedianValue;
- $currentValue.pairedDeltaP25Value;
- $currentValue.pairedDeltaP75Value;
- $currentValue.pairedDeltaMadValue;
- ($currentValue.pairedDeltaSampleValues // [])
- ) + {
- target: $currentValue.target,
- observation: $currentValue.observation,
- currentSamples: $currentValue.sampleCount,
- baselineSources: $effectiveBaselineValue.sourceCount,
- baselineMin: $effectiveBaselineValue.min,
- baselineMax: $effectiveBaselineValue.max,
- baselineP25: $effectiveBaselineValue.p25,
- baselineP75: $effectiveBaselineValue.p75,
- baselineP95: $effectiveBaselineValue.p95
- ,baselineMad: $effectiveBaselineValue.mad
- }
- end
- )
- }
- )
- | from_entries
- ) as $comparisons
- | (
- if any($comparisons[]?; .status == "fail") then "fail"
- elif any($comparisons[]?; .status == "warn") then "warn"
- elif any($comparisons[]?;
- (if (.gatePolicy | has("enabled")) then .gatePolicy.enabled else true end)
- and (.gateReason == "missing_baseline"
- or .gateReason == "low_baseline_count"
- or .gateReason == "low_current_sample_count"
- or .gateReason == "low_paired_sample_count"
- or .gateReason == "missing_paired_delta")
- ) then "partial"
- else "pass"
- end
- ) as $status
- | (
- [$comparisons[]?]
- | {
- enabledCount: (map(select((if (.gatePolicy | has("enabled")) then .gatePolicy.enabled else true end))) | length),
- gateableCount: (map(select(.gateable == true)) | length),
- missingBaselineCount: (map(select(.gateReason == "missing_baseline")) | length),
- lowBaselineCount: (map(select(.gateReason == "low_baseline_count")) | length),
- lowCurrentSampleCount: (map(select(.gateReason == "low_current_sample_count")) | length),
- lowPairedSampleCount: (map(select(.gateReason == "low_paired_sample_count")) | length),
- missingPairedDeltaCount: (map(select(.gateReason == "missing_paired_delta")) | length)
- }
- | . + {
- nonGateableCount: (.enabledCount - .gateableCount),
- enforceable: (.enabledCount == .gateableCount)
- }
- ) as $readiness
- | {
- schemaVersion:$schemaVersion,
- status:$status,
- mode:$mode,
- readiness:$readiness,
- currentDir:$currentDir,
- baselineDir:$baselineDir,
- comparisons:$comparisons
- }
- ' >"$comparison_file"
-
- baseline_provenance_file="$baseline_dir/baseline-provenance.json"
- if [ -f "$baseline_provenance_file" ]; then
- comparison_with_provenance="$(mktemp)"
- jq --slurpfile baselineProvenance "$baseline_provenance_file" \
- '. + {baselineProvenance: ($baselineProvenance[0] // null)}' \
- "$comparison_file" >"$comparison_with_provenance"
- mv "$comparison_with_provenance" "$comparison_file"
- fi
-
- status="$(jq -r '.status' "$comparison_file")"
- exit_code=0
- case "$status:$mode" in
- fail:fail)
- echo "::error::CI measurement regression detected"
- exit_code=1
- ;;
- fail:*|warn:*)
- echo "::warning::CI measurement regression threshold exceeded"
- ;;
- partial:*)
- echo "::notice::CI measurement comparison is partial because one or more enabled observations are not gateable"
- ;;
- esac
-
- if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then
- {
- echo "### ${CI_MEASUREMENT_PR_COMMENT_TITLE:-CI Measurements}"
- echo ""
- jq -r '"- Status: " + .status + "\n- Gate: " + (if .mode == "fail" then "enforced" elif .mode == "warn" then "advisory" elif .mode == "off" then "off" else (.mode // "unknown") end) + "\n- Baseline: " + .baselineDir' "$comparison_file"
- echo ""
- echo "| Status | Gate | Target | Observation | Current | Baseline | Delta | Ratio |"
- echo "| --- | --- | --- | --- | ---: | ---: | ---: | ---: |"
- jq -r '
- .comparisons
- | to_entries
- | sort_by(
- if .value.status == "fail" then 0
- elif .value.status == "warn" then 1
- elif .value.status == "missing_baseline" then 2
- else 3
- end
- )
- | .[:20]
- | .[]
- | .value as $v
- | [
- $v.status,
- (if ($v.gateable // false) then "yes" else ($v.gateReason // "no") end),
- (($v.target.kind // "unknown") + "/" + ($v.target.name // "unknown") + "/" + ($v.target.system // "unknown")),
- ($v.observation.name // "unknown"),
- (($v.current // $v.observation.value // 0) | tostring),
- (($v.baseline // "") | tostring),
- (($v.delta // "") | tostring),
- (if $v.ratio == null or $v.ratio == "" then "" else (($v.ratio * 100 | round / 100) | tostring) end)
- ]
- | "| " + (map(gsub("\\|"; "\\\\|")) | join(" | ")) + " |"
- ' "$comparison_file"
- } >>"$GITHUB_STEP_SUMMARY"
- fi
-
-
-
- if [ "$exit_code" -ne 0 ]; then
- exit "$exit_code"
- fi
-
-      # Publishes tmp/nix-closure-ci as the `nix-closure-measurements` artifact.
-      # Runs regardless of earlier step results (`always()`), leaves out the
-      # baseline subdirectory via the negated glob, and fails the step when no
-      # files match. The artifact is kept for 30 days.
-      - name: 'Upload CI measurements: nix-closure-measurements'
-        if: always()
-        uses: actions/upload-artifact@v4
-        with:
-          name: nix-closure-measurements
-          path: |
-            tmp/nix-closure-ci
-            !tmp/nix-closure-ci/baseline/**
-          if-no-files-found: error
-          retention-days: 30
-      # Saves the pnpm home and the per-job pnpm store to the Actions cache.
-      # Only runs when the job succeeded so far and the `restore-pnpm-state`
-      # step did not report an exact cache hit. The key is derived from runner
-      # OS/arch and the hash of every pnpm-lock.yaml in the repository.
-      - name: Save pnpm state
-        if: ${{ success() && steps.restore-pnpm-state.outputs.cache-hit != 'true' }}
-        uses: actions/cache/save@v4
-        with:
-          path: |
-            ${{ github.workspace }}/.pnpm-home
-            ${{ runner.temp }}/pnpm-store/${{ github.job }}
-          key: "pnpm-state-v1-${{ runner.os }}-${{ runner.arch }}-${{ hashFiles('**/pnpm-lock.yaml') }}"
-      # Failure-only step: appends a "Nix Store Diagnostics" section to the job
-      # summary and lists (at most 120) lines from the captured diagnostics that
-      # match known Nix store corruption signatures.
-      - name: Nix diagnostics summary
-        if: failure()
-        shell: bash
-        run: |
-          # Empty when NIX_STORE_DIAGNOSTICS_DIR is unset.
-          diag_dir="${NIX_STORE_DIAGNOSTICS_DIR:-}"
-
-          # Without a diagnostics directory there is nothing to scan: say so and stop.
-          if [[ -z "$diag_dir" || ! -d "$diag_dir" ]]; then
-            {
-              printf '%s\n' "## Nix Store Diagnostics"
-              printf '\n'
-              printf '%s\n' "No diagnostics directory found (validation may have failed before capture)."
-            } >> "$GITHUB_STEP_SUMMARY"
-            exit 0
-          fi
-
-          # Section header; printf repeats the format once per argument.
-          printf '%s\n' \
-            "## Nix Store Diagnostics" \
-            "" \
-            "Temporary instrumentation for #272; remove after root cause is confirmed and CI is stable." \
-            "" \
-            "- Diagnostics directory: \`$diag_dir\`" \
-            "- Tracking issue: https://github.com/overengineeringstudio/effect-utils/issues/272" \
-            >> "$GITHUB_STEP_SUMMARY"
-
-          # Collect signature matches; the markers file itself is excluded by name
-          # and a grep miss (non-zero exit) is tolerated.
-          markers_file="${RUNNER_TEMP:-/tmp}/nix-store-signature-markers.txt"
-          grep -R -n -E "config\\.cachix|cachix\\.package|error: path '/nix/store/.+ is not valid" --exclude="$(basename "$markers_file")" "$diag_dir" > "$markers_file" || true
-
-          if [ ! -s "$markers_file" ]; then
-            printf '%s\n' "" "- No signature markers found in captured diagnostics." >> "$GITHUB_STEP_SUMMARY"
-          else
-            {
-              printf '%s\n' "" "### Signature markers" '```text'
-              head -n 120 "$markers_file"
-              printf '%s\n' '```'
-            } >> "$GITHUB_STEP_SUMMARY"
-          fi
-      # Failure-only upload of the directory named by NIX_STORE_DIAGNOSTICS_DIR;
-      # skipped when that variable is empty. The artifact name embeds job, OS,
-      # run id and run attempt. A missing or empty directory is ignored rather
-      # than treated as an error, and the artifact is kept for 14 days.
-      - name: Upload Nix diagnostics artifact
-        if: failure() && env.NIX_STORE_DIAGNOSTICS_DIR != ''
-        uses: actions/upload-artifact@v4
-        with:
-          name: 'nix-store-diagnostics-${{ github.job }}-${{ runner.os }}-run-${{ github.run_id }}-attempt-${{ github.run_attempt }}'
-          path: ${{ env.NIX_STORE_DIAGNOSTICS_DIR }}
-          if-no-files-found: ignore
-          retention-days: 14
-      # Failure-only hint printed to the job log: tells the reader where to
-      # report suspected Namespace runner Nix store corruption (issue #201).
-      - name: Failure note
-        if: failure()
-        shell: bash
-        run: |
-          echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:"
-          echo "  https://github.com/overengineeringstudio/effect-utils/issues/201"
- concurrency:
- group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-nix-closure-sizes"
- cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
- source-shape:
- runs-on:
- [namespace-profile-linux-x86-64, 'namespace-features:github.run-id=${{ github.run_id }}']
- timeout-minutes: 30
- defaults:
- run:
- shell: bash
- permissions:
- actions: read
- contents: write
- issues: write
- pull-requests: write
- env:
- CI_MEASUREMENT_SUBJECT_REF: ${{ inputs.measurement_baseline_ref || github.event.pull_request.head.ref || github.ref }}
- CI_MEASUREMENT_SUBJECT_SHA: ${{ inputs.measurement_baseline_ref || github.event.pull_request.head.sha || github.sha }}
- CI_MEASUREMENT_SUBJECT_LABEL: ${{ inputs.measurement_baseline_label }}
- CI_MEASUREMENT_ALLOW_PROBE_FAILURES: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && '1' || '' }}
- steps:
- - uses: actions/checkout@v6
-      # Second checkout, only for manually dispatched runs that pass a
-      # `measurement_baseline_ref` input: checks out that ref instead of the
-      # ref checked out by the preceding step.
-      - name: Checkout CI measurement baseline ref
-        if: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' }}
-        uses: actions/checkout@v6
-        with:
-          ref: ${{ inputs.measurement_baseline_ref }}
- - name: 'Download previous artifact: source-shape'
- shell: bash
- env:
- GH_TOKEN: ${{ github.token }}
- BASELINE_ARTIFACT_NAME: source-shape
- BASELINE_OUTPUT_DIR: tmp/source-shape-ci/baseline
- BASELINE_WORKFLOW_NAME: ${{ github.workflow }}
- BASELINE_BRANCH: ${{ github.base_ref || github.ref_name }}
- BASELINE_SEED_RUNS_JSON: '[{"runId":"26085158592","label":"main baseline","sha":"ce7cf8f8ebfaa1da6c7e9122cd195a5f95ce2fca","source":"manual-backfill","artifacts":["source-shape"],"notes":"Backfilled with the current measurement workflow for the effect-utils #658 rollout."}]'
- BASELINE_MAX_RUNS: '20'
- BASELINE_MAX_CANDIDATE_RUNS: '60'
- BASELINE_REQUIRED_OBSERVATIONS_JSON: '[]'
- run: |
- set -euo pipefail
-
- mkdir -p "$BASELINE_OUTPUT_DIR"
-
-          # Locate the GitHub CLI: prefer a `gh` already on PATH, otherwise build
-          # nixpkgs#gh and use the binary from its output path. If Nix cannot
-          # provide it either, the step ends successfully without a baseline
-          # (exit 0), so the download is best-effort.
-          if command -v gh >/dev/null 2>&1; then
-            GH_BIN="$(command -v gh)"
-          else
-            echo "::notice::gh is not on PATH; resolving GitHub CLI through Nix"
-            # The assignment takes the exit status of the command substitution,
-            # so a failed `nix build` lands in the skip branch.
-            if ! GH_BIN="$(nix build --no-link --print-out-paths nixpkgs#gh 2>/dev/null)/bin/gh"; then
-              echo "::notice::unable to resolve GitHub CLI through Nix; skipping previous artifact download"
-              exit 0
-            fi
-          fi
-          echo "Using GitHub CLI: $GH_BIN"
-
- repo="${GITHUB_REPOSITORY:?GITHUB_REPOSITORY not set}"
- workflow="${BASELINE_WORKFLOW_NAME:-CI}"
- branch="${BASELINE_BRANCH:-${GITHUB_BASE_REF:-${GITHUB_REF_NAME:-main}}}"
- seed_runs_file="$BASELINE_OUTPUT_DIR/baseline-seed-runs.json"
- required_observations_file="$BASELINE_OUTPUT_DIR/baseline-required-observations.json"
- printf '%s' "${BASELINE_SEED_RUNS_JSON:-[]}" >"$seed_runs_file"
- printf '%s' "${BASELINE_REQUIRED_OBSERVATIONS_JSON:-[]}" >"$required_observations_file"
- if ! jq -e 'if type == "array" then all(.[]; type == "object" and (.runId | type == "string")) else false end' \
- "$seed_runs_file" >/dev/null; then
- echo "::error::BASELINE_SEED_RUNS_JSON must be an array of objects with string runId fields"
- exit 1
- fi
- if ! jq -e 'if type == "array" then all(.[]; type == "object" and (.id | type == "string") and (.minSources | type == "number")) else false end' \
- "$required_observations_file" >/dev/null; then
- echo "::error::BASELINE_REQUIRED_OBSERVATIONS_JSON must be an array of objects with string id and numeric minSources fields"
- exit 1
- fi
- seed_run_ids="$(jq -r '.[].runId' "$seed_runs_file")"
- required_observation_count="$(jq 'length' "$required_observations_file")"
- max_candidate_runs="${BASELINE_MAX_CANDIDATE_RUNS:-${BASELINE_MAX_RUNS:-5}}"
- if ! [[ "$max_candidate_runs" =~ ^[0-9]+$ ]] || [ "$max_candidate_runs" -lt 1 ]; then
- max_candidate_runs=1
- fi
-
- candidate_runs="$(
- "$GH_BIN" run list \
- --repo "$repo" \
- --workflow "$workflow" \
- --branch "$branch" \
- --event push \
- --status success \
- --json databaseId,headSha \
- --limit "$max_candidate_runs" \
- --jq '[.[] | select(.headSha != env.GITHUB_SHA) | .databaseId] | .[]'
- )"
-
- candidate_runs="$seed_run_ids
- $candidate_runs"
-
- max_runs="${BASELINE_MAX_RUNS:-5}"
- if ! [[ "$max_runs" =~ ^[0-9]+$ ]] || [ "$max_runs" -lt 1 ]; then
- max_runs=1
- fi
-
-          # Rebuilds baseline-observation-counts.json from the baseline artifacts
-          # downloaded so far.
-          #
-          # Reads:  $BASELINE_OUTPUT_DIR/<dir>/measurements.json (exactly one level
-          #         below the output directory) and $required_observations_file.
-          # Writes: baseline-measurement-files.txt (sorted list of the files found)
-          #         and baseline-observation-counts.json with
-          #         {counts: [{id, sources}], required: [row + {sources, satisfied}]}.
-          #
-          # NOTE(review): `sources` is the number of numeric observations carrying
-          # an id across all files, not the number of files. A single file holding
-          # several observations with the same id contributes more than one.
-          # Confirm this is the intended meaning of `minSources`.
-          write_baseline_observation_counts() {
-            local measurement_index="$BASELINE_OUTPUT_DIR/baseline-measurement-files.txt"
-            local counts_file="$BASELINE_OUTPUT_DIR/baseline-observation-counts.json"
-            # A failing find/sort pipeline is tolerated; an empty index is handled below.
-            find "$BASELINE_OUTPUT_DIR" \
-              -mindepth 2 \
-              -maxdepth 2 \
-              -name measurements.json \
-              -type f \
-              -print \
-              | sort >"$measurement_index" || true
-
-            # With at least one measurement file, slurp them all and count per id;
-            # otherwise emit empty counts and mark every requirement unsatisfied.
-            if [ -s "$measurement_index" ]; then
-              xargs -r jq -s \
-                --slurpfile required "$required_observations_file" \
-                '
-                  ([.[] | (.observations // [])[]? | select(.value | type == "number") | .id] | sort | group_by(.) | map({id: .[0], sources: length})) as $counts
-                  | ($required[0] // []) as $requiredRows
-                  | {
-                      counts: $counts,
-                      required: (
-                        $requiredRows
-                        | map(. as $requiredRow | ($counts | map(select(.id == $requiredRow.id)) | .[0].sources // 0) as $actual | $requiredRow + {sources:$actual, satisfied:($actual >= $requiredRow.minSources)})
-                      )
-                    }
-                ' <"$measurement_index" >"$counts_file"
-            else
-              jq -n --slurpfile required "$required_observations_file" \
-                '{counts: [], required: (($required[0] // []) | map(. + {sources:0, satisfied:false}))}' >"$counts_file"
-            fi
-          }
-
-          # Succeeds (exit 0) when every configured required observation has at
-          # least `minSources` baseline sources.
-          #
-          # The download loop only consults this once `max_runs` artifacts have
-          # been downloaded, to decide whether it may stop. With no required
-          # observations configured there is nothing left to wait for, so the
-          # empty requirement list counts as satisfied. Reporting "unsatisfied"
-          # here made the loop ignore BASELINE_MAX_RUNS and keep downloading up to
-          # BASELINE_MAX_CANDIDATE_RUNS while printing a misleading notice.
-          baseline_requirements_satisfied() {
-            if [ "$required_observation_count" -eq 0 ]; then
-              return 0
-            fi
-            # Refresh the counts file from what has been downloaded so far.
-            write_baseline_observation_counts
-            jq -e '.required | all(.satisfied == true)' "$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" >/dev/null
-          }
-
- run_id=""
- artifact_name=""
- artifact_id=""
- downloaded_runs_file="$BASELINE_OUTPUT_DIR/baseline-runs.jsonl"
- seen_runs_file="$BASELINE_OUTPUT_DIR/baseline-seen-runs.txt"
- : >"$downloaded_runs_file"
- : >"$seen_runs_file"
-          # Walk the candidate run ids (seed runs first, then run history) and
-          # download one matching, non-expired artifact per run into
-          # $BASELINE_OUTPUT_DIR/run-<id>. Every successful download is appended
-          # to $downloaded_runs_file as one JSON line; the first successful
-          # download is remembered in run_id/artifact_name/artifact_id.
-          for candidate_run in $candidate_runs; do
-            if [ -z "$candidate_run" ]; then
-              continue
-            fi
-            # Skip run ids already processed (a seed run may also appear in history).
-            if grep -qxF "$candidate_run" "$seen_runs_file"; then
-              continue
-            fi
-            # Once max_runs artifacts are downloaded, stop as soon as the required
-            # observation counts are met; otherwise keep going through history.
-            downloaded_count="$(wc -l <"$downloaded_runs_file" | tr -d ' ')"
-            if [ "$downloaded_count" -ge "$max_runs" ]; then
-              if baseline_requirements_satisfied; then
-                break
-              fi
-              echo "::notice::downloaded $downloaded_count baseline artifact(s), but required observation counts are not satisfied yet; continuing through bounded candidate history"
-            fi
-            # Hard bound on how many runs are inspected at all.
-            if [ "$(wc -l <"$seen_runs_file" | tr -d ' ')" -ge "$max_candidate_runs" ]; then
-              break
-            fi
-            printf '%s\n' "$candidate_run" >>"$seen_runs_file"
-
-            # Newest non-expired artifact whose name equals the baseline name or
-            # starts with "<name>-". Under `set -euo pipefail` a failing API call
-            # (deleted run, rate limit, network error) would otherwise abort the
-            # whole step; treat it like a failed download and move on.
-            if ! artifact_json="$(
-              "$GH_BIN" api "repos/$repo/actions/runs/$candidate_run/artifacts" \
-                | jq --arg artifactName "$BASELINE_ARTIFACT_NAME" '.artifacts
-                  | map(select(.expired == false))
-                  | map(select(.name == $artifactName or (.name | startswith($artifactName + "-"))))
-                  | sort_by(.created_at // "")
-                  | reverse
-                  | .[0] // empty'
-            )"; then
-              echo "::notice::failed to list artifacts for baseline run $candidate_run"
-              continue
-            fi
-
-            if [ -n "$artifact_json" ]; then
-              current_artifact_name="$(printf '%s' "$artifact_json" | jq -r '.name')"
-              current_artifact_id="$(printf '%s' "$artifact_json" | jq -r '.id')"
-              current_output_dir="$BASELINE_OUTPUT_DIR/run-$candidate_run"
-              mkdir -p "$current_output_dir"
-              if "$GH_BIN" run download "$candidate_run" \
-                --repo "$repo" \
-                --name "$current_artifact_name" \
-                --dir "$current_output_dir"; then
-                # Remember the first successful download for the provenance record.
-                if [ -z "$run_id" ]; then
-                  run_id="$candidate_run"
-                  artifact_name="$current_artifact_name"
-                  artifact_id="$current_artifact_id"
-                fi
-                jq -cn \
-                  --arg runId "$candidate_run" \
-                  --arg artifactName "$current_artifact_name" \
-                  --arg artifactId "$current_artifact_id" \
-                  --arg path "run-$candidate_run" \
-                  '{runId:$runId, artifactName:$artifactName, artifactId:$artifactId, path:$path}' \
-                  >>"$downloaded_runs_file"
-              else
-                echo "::notice::failed to download baseline artifact $current_artifact_name from run $candidate_run"
-              fi
-            fi
-          done
-
- write_baseline_observation_counts
-
- if [ -z "$run_id" ] || [ -z "$artifact_name" ]; then
- echo "::notice::no successful baseline run found for $repo workflow=$workflow branch=$branch"
- exit 0
- fi
-
- jq -n \
- --slurpfile runs "$downloaded_runs_file" \
- --slurpfile seedRuns "$seed_runs_file" \
- --slurpfile observationCounts "$BASELINE_OUTPUT_DIR/baseline-observation-counts.json" \
- --argjson schemaVersion 1 \
- --arg repository "$repo" \
- --arg workflow "$workflow" \
- --arg branch "$branch" \
- --arg runId "$run_id" \
- --arg artifactName "$artifact_name" \
- --arg artifactId "$artifact_id" \
- '{
- schemaVersion: $schemaVersion,
- source: "github-actions-artifact",
- repository: $repository,
- workflow: $workflow,
- branch: $branch,
- runId: $runId,
- artifactName: $artifactName,
- artifactId: $artifactId,
- seedRuns: ($seedRuns[0] // []),
- runs: $runs,
- observationCounts: ($observationCounts[0] // null)
- }' >"$BASELINE_OUTPUT_DIR/baseline-provenance.json"
-
- echo "Downloaded $(wc -l <"$downloaded_runs_file" | tr -d ' ') baseline artifact(s), latest $artifact_name from run $run_id into $BASELINE_OUTPUT_DIR"
-
- if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
- - name: 'Measure source shape: effect-utils'
- shell: bash
- env:
- ARTIFACT_DIR: tmp/source-shape-ci/current/effect-utils
- RUNNER_CLASS: '${{ runner.os }}-${{ runner.arch }}'
- run: |
- set -euo pipefail
-
- mkdir -p "$ARTIFACT_DIR"
- target_id='effect_utils'
- target_name='effect-utils'
- target_label='effect-utils repository'
- target_group='source'
- artifact_file="$ARTIFACT_DIR/measurements.json"
- target_system="${DEVENV_SYSTEM:-${RUNNER_OS:-unknown}}"
-
- SCOPES_JSON='[{"id":"genie_ci_workflow","label":"Genie CI workflow helpers","group":"source / ci","path":["source","effect-utils","genie","ci-workflow"],"includePaths":["genie/ci-workflow",".github/workflows/ci.yml.genie.ts"],"includeExtensions":[".ts"]},{"id":"genie_runtime","label":"Genie runtime","group":"source / genie","path":["source","effect-utils","packages","genie"],"includePaths":["packages/@overeng/genie/src"],"includeExtensions":[".ts",".tsx"]},{"id":"nix_workspace_tools","label":"Nix workspace tools","group":"source / nix","path":["source","effect-utils","nix","workspace-tools"],"includePaths":["nix/workspace-tools"],"includeExtensions":[".nix"]}]' \
- TARGET_PATH_JSON='["source","effect-utils"]' \
- TARGET_ID="$target_id" \
- TARGET_NAME="$target_name" \
- TARGET_LABEL="$target_label" \
- TARGET_GROUP="$target_group" \
- TARGET_SYSTEM="$target_system" \
- node <<'NODE' >"$artifact_file"
- const cp = require('node:child_process')
- const fs = require('node:fs')
- const path = require('node:path')
-
-                  // Canonicalises a repository-relative path: backslashes become
-                  // forward slashes, one leading "./" and any trailing slashes are
-                  // dropped, and a bare "." becomes the empty string.
-                  const normalize = (value) => {
-                    const forwardSlashed = value.replace(/\\/g, '/')
-                    const withoutDotPrefix = forwardSlashed.replace(/^\.\//, '')
-                    const trimmed = withoutDotPrefix.replace(/\/+$/, '')
-                    if (trimmed === '.') return ''
-                    return trimmed
-                  }
- const scopes = JSON.parse(process.env.SCOPES_JSON || '[]')
- const targetPath = JSON.parse(process.env.TARGET_PATH_JSON || '["source"]')
-                  // Every path tracked by git, NUL-separated so unusual file names
-                  // survive, then normalised to forward-slash relative paths.
-                  // execFileSync caps captured stdout at 1 MiB by default and throws
-                  // ENOBUFS beyond that; raise the cap so a large checkout does not
-                  // break the measurement.
-                  const gitFiles = cp
-                    .execFileSync('git', ['ls-files', '-z'], { encoding: 'buffer', maxBuffer: 64 * 1024 * 1024 })
-                    .toString('utf8')
-                    .split('\0')
-                    .filter(Boolean)
-                    .map(normalize)
-
-                  // True when `file` is covered by at least one include path. A missing
-                  // or empty candidate list places no restriction; a candidate that
-                  // normalises to '' covers everything; otherwise the file must equal
-                  // the candidate or live below it.
-                  const includesPath = (file, candidates) => {
-                    const unrestricted = !Array.isArray(candidates) || candidates.length === 0
-                    if (unrestricted) return true
-                    const covers = (prefix) => {
-                      if (prefix === '') return true
-                      return file === prefix || file.startsWith(prefix + '/')
-                    }
-                    return candidates.map(normalize).some(covers)
-                  }
-
-                  // True when `file` equals or lives below one of the exclude paths.
-                  // A non-array candidate list excludes nothing, and candidates that
-                  // normalise to '' are ignored.
-                  const excludesPath = (file, candidates) => {
-                    if (!Array.isArray(candidates)) return false
-                    return candidates.map(normalize).some((prefix) => {
-                      if (prefix === '') return false
-                      return file === prefix || file.startsWith(prefix + '/')
-                    })
-                  }
-
-                  // True when the file extension (compared case-insensitively) is one
-                  // of `extensions`. A missing or empty list accepts every file.
-                  const matchesExtension = (file, extensions) => {
-                    const unrestricted = !Array.isArray(extensions) || extensions.length === 0
-                    if (unrestricted) return true
-                    const fileExtension = path.extname(file).toLowerCase()
-                    const allowed = extensions.map((extension) => extension.toLowerCase())
-                    return allowed.includes(fileExtension)
-                  }
-
-                  // Counts the lines of a file. Returns undefined for files containing
-                  // a NUL byte (treated as binary) and 0 for empty files; a final line
-                  // without a trailing newline still counts as a line.
-                  const countLines = (file) => {
-                    const contents = fs.readFileSync(file)
-                    if (contents.includes(0)) return undefined
-                    if (contents.length === 0) return 0
-                    let newlineCount = 0
-                    // Hop from one LF byte (10) to the next.
-                    for (let at = contents.indexOf(10); at !== -1; at = contents.indexOf(10, at + 1)) {
-                      newlineCount += 1
-                    }
-                    const endsWithNewline = contents[contents.length - 1] === 10
-                    return endsWithNewline ? newlineCount : newlineCount + 1
-                  }
-
- const observations = []
- const scopeSummaries = []
-
-                  // For every configured scope: pick the tracked files it covers, count
-                  // their lines, and record two observations (lines and files) plus a
-                  // summary entry under `details.scopes`.
-                  for (const scope of scopes) {
-                    const root = normalize(scope.root || '.')
-                    // Without explicit include paths the scope covers its root.
-                    const includePaths = Array.isArray(scope.includePaths) && scope.includePaths.length > 0 ? scope.includePaths : [root]
-                    const files = gitFiles
-                      .filter((file) => includesPath(file, includePaths))
-                      .filter((file) => !excludesPath(file, scope.excludePaths))
-                      .filter((file) => matchesExtension(file, scope.includeExtensions))
-
-                    let lineCount = 0
-                    let measuredFileCount = 0
-                    for (const file of files) {
-                      const lines = countLines(file)
-                      // countLines yields undefined for binary files; they count
-                      // towards neither total.
-                      if (lines === undefined) continue
-                      lineCount += lines
-                      measuredFileCount += 1
-                    }
-
-                    const group = scope.group || 'source shape'
-                    const scopePath = Array.isArray(scope.path) ? scope.path : ['source', scope.id]
-                    // Gating is disabled unless the scope brings its own `gate` policy.
-                    const policy = scope.gate || { enabled: false, minBaselineSources: 3, minCurrentSamples: 1 }
-                    // Both observations use the same id for every scope; scopes are told
-                    // apart only by `dimensions.scope`.
-                    // NOTE(review): `measuredSampleCount` carries the number of measured
-                    // files here, while the comparison step sums that field as a sample
-                    // count. Confirm this is intended.
-                    observations.push(
-                      {
-                        id: 'source.lines',
-                        label: scope.label + ' lines',
-                        group,
-                        path: scopePath,
-                        description: 'Tracked non-binary source lines in the configured scope.',
-                        measurementKind: 'deterministic',
-                        name: 'source.lines',
-                        unit: 'lines',
-                        value: lineCount,
-                        dimensions: { scope: scope.id },
-                        policy,
-                        statistics: { sampleCount: 1, measuredSampleCount: measuredFileCount },
-                      },
-                      {
-                        id: 'source.files',
-                        label: scope.label + ' files',
-                        group,
-                        path: scopePath,
-                        description: 'Tracked non-binary source files in the configured scope.',
-                        measurementKind: 'deterministic',
-                        name: 'source.files',
-                        unit: 'count',
-                        value: measuredFileCount,
-                        dimensions: { scope: scope.id },
-                        policy,
-                        statistics: { sampleCount: 1, measuredSampleCount: measuredFileCount },
-                      },
-                    )
-                    scopeSummaries.push({
-                      id: scope.id,
-                      label: scope.label,
-                      root,
-                      includePaths,
-                      excludePaths: scope.excludePaths || [],
-                      includeExtensions: scope.includeExtensions || [],
-                      fileCount: measuredFileCount,
-                      lineCount,
-                    })
-                  }
-
-                  // The measurement document written to stdout: who produced it, which
-                  // commit it describes, where it ran, the measured target, and the
-                  // observations gathered above. Most fields come from environment
-                  // variables and fall back to 'unknown' or '' when unset.
-                  const artifact = {
-                    schemaVersion: 1,
-                    // ISO timestamp with the millisecond part removed.
-                    generatedAt: new Date().toISOString().replace(/\.\d{3}Z$/, 'Z'),
-                    producer: {
-                      name: 'effect-utils-ci-measurement',
-                      version: 1,
-                      measurementProtocol: 'source-shape-v1',
-                    },
-                    subject: {
-                      repo: process.env.GITHUB_REPOSITORY || 'unknown',
-                      branchKind: process.env.GITHUB_EVENT_NAME || 'unknown',
-                      // The CI_MEASUREMENT_SUBJECT_* overrides take precedence over the
-                      // ref/sha GitHub reports for the run.
-                      ref: process.env.CI_MEASUREMENT_SUBJECT_REF || process.env.GITHUB_REF || 'unknown',
-                      headSha: process.env.CI_MEASUREMENT_SUBJECT_SHA || process.env.GITHUB_SHA || 'unknown',
-                      baseSha: process.env.GITHUB_BASE_SHA || '',
-                    },
-                    execution: {
-                      // Reported as a local run when no usable GITHUB_RUN_ID is present.
-                      provider: process.env.GITHUB_RUN_ID && process.env.GITHUB_RUN_ID !== 'unknown' ? 'github-actions' : 'local',
-                      workflow: 'CI',
-                      job: process.env.GITHUB_JOB || 'unknown',
-                      runId: process.env.GITHUB_RUN_ID || 'unknown',
-                      runAttempt: process.env.GITHUB_RUN_ATTEMPT || 'unknown',
-                      taskId: process.env.CROSSTASK_TASK_ID || '',
-                      attemptId: process.env.CROSSTASK_ATTEMPT_ID || '',
-                      traceId: process.env.TRACE_ID || '',
-                      runner: {
-                        name: process.env.RUNNER_NAME || 'unknown',
-                        os: process.env.RUNNER_OS || 'unknown',
-                        arch: process.env.RUNNER_ARCH || 'unknown',
-                        class: process.env.RUNNER_CLASS || 'unknown',
-                      },
-                    },
-                    // TARGET_* have no fallback here; an unset variable leaves the field
-                    // undefined, and JSON.stringify omits undefined properties.
-                    target: {
-                      kind: 'source-shape',
-                      id: process.env.TARGET_ID,
-                      name: process.env.TARGET_NAME,
-                      label: process.env.TARGET_LABEL,
-                      group: process.env.TARGET_GROUP,
-                      path: targetPath,
-                      system: process.env.TARGET_SYSTEM,
-                    },
-                    observations,
-                    details: { scopes: scopeSummaries },
-                  }
-
- process.stdout.write(JSON.stringify(artifact, null, 2) + '\n')
- NODE
-
- cat "$artifact_file"
-
- - name: Compare CI measurements with baseline
- shell: bash
- env:
- CI_MEASUREMENT_CURRENT_DIR: tmp/source-shape-ci/current
- CI_MEASUREMENT_BASELINE_DIR: tmp/source-shape-ci/baseline
- CI_MEASUREMENT_COMPARISON_FILE: tmp/source-shape-ci/measurement-comparison.json
- CI_MEASUREMENT_REGRESSION_MODE: warn
- CI_MEASUREMENT_PR_COMMENT_ENABLED: 'false'
- CI_MEASUREMENT_PR_COMMENT_TITLE: Source Shape Measurements
- CI_MEASUREMENT_PR_COMMENT_MAX_ROWS: '12'
- CI_MEASUREMENT_PR_COMMENT_MAX_HISTORY: '20'
- CI_MEASUREMENT_PR_COMMENT_ASSET_BRANCH: ci-measurement-assets
- run: |
- set -euo pipefail
-
- export PATH="/run/current-system/sw/bin:/usr/bin:/bin:$PATH"
-
- current_dir="${CI_MEASUREMENT_CURRENT_DIR:?CI_MEASUREMENT_CURRENT_DIR not set}"
- baseline_dir="${CI_MEASUREMENT_BASELINE_DIR:?CI_MEASUREMENT_BASELINE_DIR not set}"
- comparison_file="${CI_MEASUREMENT_COMPARISON_FILE:?CI_MEASUREMENT_COMPARISON_FILE not set}"
- mode="${CI_MEASUREMENT_REGRESSION_MODE:-warn}"
- mkdir -p "$(dirname "$comparison_file")"
-
- if [ "$mode" = "off" ]; then
- jq -n --argjson schemaVersion 1 --arg status skipped --arg mode "$mode" \
- '{schemaVersion:$schemaVersion,status:$status,mode:$mode,comparisons:{}}' \
- >"$comparison_file"
- exit 0
- fi
-
- current_index="$(mktemp)"
- baseline_index="$(mktemp)"
- find "$current_dir" -name baseline -type d -prune -o -name measurements.json -type f -print | sort >"$current_index" || true
- {
- find "$baseline_dir" -name baseline -type d ! -path "$baseline_dir" -prune -o -name measurements.json -type f -print
- } | sort -u >"$baseline_index" || true
-
- if [ ! -s "$current_index" ]; then
- echo "::error::no current measurements.json files found under $current_dir"
- exit 1
- fi
-
- current_json="$comparison_file.current.json"
- baseline_json="$comparison_file.baseline.json"
- xargs -r jq -s '.' <"$current_index" >"$current_json"
- if [ -s "$baseline_index" ]; then
- xargs -r jq -s '.' <"$baseline_index" >"$baseline_json"
- else
- printf '[]\n' >"$baseline_json"
- fi
-
- jq -n \
- --slurpfile current "$current_json" \
- --slurpfile baseline "$baseline_json" \
- --argjson schemaVersion 1 \
- --arg mode "$mode" \
- --arg currentDir "$current_dir" \
- --arg baselineDir "$baseline_dir" \
- '
-            # Renders the dimensions of an observation that take part in its
-            # identity as a stable "key=value,key=value" string, sorted by key.
-            # The keys listed in $ignoredKeys are left out of the identity.
-            def identity_dimensions:
-              ["devenvRev", "otelServiceName", "status", "probeLabel", "sampleCount", "measuredSampleCount"] as $ignoredKeys
-              | [
-                  (.dimensions // {})
-                  | to_entries
-                  | sort_by(.key)
-                  | .[]
-                  | select(.key as $key | $ignoredKeys | index($key) | not)
-                  | "\(.key)=\(.value|tostring)"
-                ]
-              | join(",");
-
-            # Builds the comparison key for one observation (the input) inside
-            # measurement document $doc. The key joins target kind, target id
-            # (falling back to the target name), target system, observation id
-            # (falling back to its name), unit and the identity dimensions with
-            # "|". Missing parts become "unknown".
-            def observation_key($doc):
-              [
-                ($doc.target.kind // "unknown"),
-                ($doc.target.id // $doc.target.name // "unknown"),
-                ($doc.target.system // "unknown"),
-                (.id // .name // "unknown"),
-                (.unit // "unknown"),
-                identity_dimensions
-              ] | join("|");
-
-            # Median of an array of numbers; null for an empty array. For an even
-            # number of values the two middle values are averaged.
-            def median:
-              sort
-              | length as $n
-              | (($n / 2) | floor) as $mid
-              | if $n == 0 then null
-                elif ($n % 2) == 0 then ((.[$mid - 1] + .[$mid]) / 2)
-                else .[$mid]
-                end;
-
-            # Lower nearest-rank percentile of an array of numbers: sorts the
-            # values and picks the element at floor($p * (count - 1)). There is
-            # no interpolation. Yields null for an empty array.
-            def percentile($p):
-              sort
-              | if length == 0 then null
-                else .[(($p * (length - 1)) | floor)]
-                end;
-
-            # Absolute value of the input number.
-            def abs_value: if . >= 0 then . else -. end;
-
-            # Groups every numeric observation of the given measurement documents
-            # by its observation_key. The result is an object mapping each key to
-            # an array of {target, observation, generatedAt} entries, in document
-            # order. Observations whose value is not a number are skipped.
-            def observations_by_key($docs):
-              reduce $docs[]? as $doc
-                ({};
-                  reduce (($doc.observations // [])[]? | select(.value | type == "number")) as $obs
-                    (.;
-                      ($obs | observation_key($doc)) as $key
-                      | .[$key] = ((.[$key] // []) + [{
-                        target: $doc.target,
-                        observation: $obs,
-                        generatedAt: $doc.generatedAt
-                      }])
-                    )
-                );
-
-            # Summarises all entries collected for one observation key.
-            #
-            # $items is an array of {target, observation, generatedAt} entries as
-            # produced by observations_by_key. The result carries the median
-            # value, min/max, p25/p75/p95, the median absolute deviation (mad),
-            # the number of entries (sourceCount), a summed sample count, and the
-            # paired-comparison statistics aggregated across entries. Each
-            # paired-delta summary is the median of the per-entry values, or null
-            # when no entry provides one. `target` is taken from the first entry;
-            # `observation`, `measurementKind` and `generatedAt` from the last.
-            def observation_stats($items):
-              ($items | map(.observation.value)) as $values
-              | ($items | map(.observation.comparison.baseline // empty)) as $pairedBaselineValues
-              | ($items | map(.observation.statistics.pairedDeltaMedian // empty)) as $pairedDeltaMedianValues
-              | ($items | map(.observation.statistics.pairedDeltaP25 // empty)) as $pairedDeltaP25Values
-              | ($items | map(.observation.statistics.pairedDeltaP75 // empty)) as $pairedDeltaP75Values
-              | ($items | map(.observation.statistics.pairedDeltaMad // empty)) as $pairedDeltaMadValues
-              | ($items | map(.observation.statistics.pairedDeltaSamples // []) | add // []) as $pairedDeltaSampleValues
-              # Per entry: measuredSampleCount, else sampleCount, else 1; summed over entries.
-              | ($items | map(.observation.statistics.measuredSampleCount // .observation.statistics.sampleCount // 1) | add // ($items | length)) as $sampleCount
-              | ($values | median) as $median
-              | {
-                  target: ($items[0].target // {}),
-                  observation: ($items[-1].observation // {}),
-                  measurementKind: ($items[-1].observation.measurementKind // null),
-                  value: $median,
-                  min: ($values | min),
-                  max: ($values | max),
-                  p25: ($values | percentile(0.25)),
-                  p75: ($values | percentile(0.75)),
-                  p95: ($values | percentile(0.95)),
-                  mad: ($values | map(. - $median | if . < 0 then -. else . end) | median),
-                  sourceCount: ($items | length),
-                  sampleCount: $sampleCount,
-                  pairedSampleCount: ($items | map(.observation.statistics.pairedSampleCount // .observation.comparison.pairedSampleCount // 0) | add // 0),
-                  pairedBaselineValue: (if ($pairedBaselineValues | length) == 0 then null else ($pairedBaselineValues | median) end),
-                  pairedDeltaMedianValue: (if ($pairedDeltaMedianValues | length) == 0 then null else ($pairedDeltaMedianValues | median) end),
-                  pairedDeltaP25Value: (if ($pairedDeltaP25Values | length) == 0 then null else ($pairedDeltaP25Values | median) end),
-                  pairedDeltaP75Value: (if ($pairedDeltaP75Values | length) == 0 then null else ($pairedDeltaP75Values | median) end),
-                  pairedDeltaMadValue: (if ($pairedDeltaMadValues | length) == 0 then null else ($pairedDeltaMadValues | median) end),
-                  pairedDeltaSampleValues: $pairedDeltaSampleValues,
-                  generatedAt: ($items[-1].generatedAt // null)
-                };
-
-            # Default regression budget for a metric. Returns warn/fail thresholds
-            # as ratios and absolute deltas, plus the statistical tolerance (ratio
-            # and absolute). The three nix.closure.* metrics have dedicated
-            # budgets; any other metric measured in seconds shares one budget;
-            # everything else gets the generic budget of the last branch.
-            # The byte values are 50 MiB (52428800), 200 MiB (209715200) and
-            # 10 MiB (10485760).
-            def budget($metric; $unit):
-              if $metric == "nix.closure.nar_size" then
-                {warnRatio:1.05, failRatio:1.10, warnAbs:52428800, failAbs:209715200, statisticalToleranceRatio:0.02, statisticalToleranceAbs:10485760}
-              elif $metric == "nix.closure.bucket.nar_size" then
-                {warnRatio:1.10, failRatio:1.20, warnAbs:52428800, failAbs:209715200, statisticalToleranceRatio:0.05, statisticalToleranceAbs:10485760}
-              elif $metric == "nix.closure.path_count" then
-                {warnRatio:1.05, failRatio:1.10, warnAbs:100, failAbs:500, statisticalToleranceRatio:0.02, statisticalToleranceAbs:10}
-              elif $unit == "seconds" then
-                {warnRatio:1.10, failRatio:1.20, warnAbs:0.25, failAbs:1, statisticalToleranceRatio:0.10, statisticalToleranceAbs:0.25}
-              else
-                {warnRatio:1.25, failRatio:1.50, warnAbs:1, failAbs:3, statisticalToleranceRatio:0.10, statisticalToleranceAbs:1}
-              end;
-
-            # Default noise floor for a metric: 10 MiB for the two closure size
-            # metrics, 10 for the closure path count, 0.1 for any other metric
-            # measured in seconds, and 0 otherwise.
-            def noise_floor($metric; $unit):
-              (10 * 1024 * 1024) as $tenMiB
-              | if $metric == "nix.closure.path_count" then 10
-                elif $metric == "nix.closure.nar_size" then $tenMiB
-                elif $metric == "nix.closure.bucket.nar_size" then $tenMiB
-                elif $unit == "seconds" then 0.1
-                else 0
-                end;
-            # Default gate policy for a metric: its budget plus gate settings.
-            # Closure metrics and every metric not measured in seconds are
-            # compared against a budget and need one baseline source; the
-            # remaining (timing) metrics are compared historically and need ten.
-            # Metrics in seconds also need 3 current and 5 paired samples.
-            def default_policy($metric; $unit):
-              (
-                $metric == "nix.closure.nar_size"
-                or $metric == "nix.closure.bucket.nar_size"
-                or $metric == "nix.closure.path_count"
-                or $unit != "seconds"
-              ) as $budgetCompared
-              | ($unit == "seconds") as $timed
-              | budget($metric; $unit) + {
-                  enabled: true,
-                  comparisonMode: (if $budgetCompared then "budget" else "historical" end),
-                  minBaselineSources: (if $budgetCompared then 1 else 10 end),
-                  minCurrentSamples: (if $timed then 3 else 1 end),
-                  minPairedSamples: (if $timed then 5 else 0 end),
-                  noiseFloor: noise_floor($metric; $unit)
-                };
-            # Effective gate policy of an observation: the default policy for its
-            # name and unit, with the fields of the observation-level `policy`
-            # object (if any) taking precedence.
-            def observation_policy($obs):
-              default_policy($obs.name // "unknown"; $obs.unit // "unknown") + ($obs.policy // {});
-            # Whether a policy is enabled. A policy without an `enabled` key
-            # counts as enabled.
-            def policy_enabled($policy):
-              $policy | if has("enabled") then .enabled else true end;
-
- def classify($metric; $unit; $measurementKind; $policy; $current; $currentP25; $currentP75; $currentMad; $baseline; $baselineMin; $baselineMax; $baselineP25; $baselineP75; $baselineP95; $baselineMad; $currentSamples; $baselineSources; $pairedSamples; $pairedDeltaMedian; $pairedDeltaP25; $pairedDeltaP75; $pairedDeltaMad; $pairedDeltaValues):
- $policy as $b
- | ($policy.comparisonMode // (if $measurementKind == "deterministic" or $unit != "seconds" then "budget" elif $measurementKind == "diagnostic" then "diagnostic" else "historical" end)) as $comparisonMode
- | ($policy.noiseFloor // noise_floor($metric; $unit)) as $noise
- | ($current - $baseline) as $delta
- | (if $comparisonMode == "paired" and $pairedDeltaMedian != null then $pairedDeltaMedian else $delta end) as $evidenceDelta
- | (($policy.pairedEvidenceQuantile // 0.25) | tonumber) as $pairedEvidenceQuantile
- | (if $baseline > 0 then ($current / $baseline) else null end) as $ratio
- | (($baselineP75 // $baseline) - ($baselineP25 // $baseline)) as $iqr
- | (($currentP75 // $current) - ($currentP25 // $current)) as $currentIqr
- | (($pairedDeltaP75 // $evidenceDelta) - ($pairedDeltaP25 // $evidenceDelta)) as $pairedDeltaIqr
- | ([
- $noise,
- (($policy.statisticalToleranceAbs // 0) | tonumber),
- (if $baseline > 0 then ($baseline * (($policy.statisticalToleranceRatio // 0) | tonumber)) else 0 end),
- (($baselineMad // 0) * 3),
- (($iqr // 0) * 1.5)
- ] | max) as $robustTolerance
- | (if $currentSamples > 1 then ([
- $noise,
- (($policy.statisticalToleranceAbs // 0) | tonumber),
- (if $current > 0 then ($current * (($policy.statisticalToleranceRatio // 0) | tonumber)) else 0 end),
- (($currentMad // 0) * 3),
- (($currentIqr // 0) * 1.5)
- ] | max) else 0 end) as $currentRobustTolerance
- | ([
- $noise,
- (($policy.statisticalToleranceAbs // 0) | tonumber),
- (if $baseline > 0 then ($baseline * (($policy.statisticalToleranceRatio // 0) | tonumber)) else 0 end),
- (($pairedDeltaMad // 0) * 3),
- (($pairedDeltaIqr // 0) * 1.5)
- ] | max) as $pairedDeltaTolerance
- | ($baseline + $robustTolerance) as $robustUpper
- | ($baseline - $robustTolerance) as $robustLower
- | ($current + $currentRobustTolerance) as $currentRobustUpper
- | ($current - $currentRobustTolerance) as $currentRobustLower
- | (if $comparisonMode == "paired" and ($pairedDeltaValues | length) > 0 then ($pairedDeltaValues | percentile($pairedEvidenceQuantile)) else ($evidenceDelta - $pairedDeltaTolerance) end) as $evidenceDeltaLower
- | (if $comparisonMode == "paired" and ($pairedDeltaValues | length) > 0 then ($pairedDeltaValues | percentile(1 - $pairedEvidenceQuantile)) else ($evidenceDelta + $pairedDeltaTolerance) end) as $evidenceDeltaUpper
- | ([($b.warnAbs // 0), (if $baseline > 0 then ($baseline * (($b.warnRatio // 1) - 1)) else 0 end), $noise, 0.000000001] | max) as $warnBudget
- | ([($b.failAbs // 0), (if $baseline > 0 then ($baseline * (($b.failRatio // 1) - 1)) else 0 end), $noise, 0.000000001] | max) as $failBudget
- | ($comparisonMode != "paired") as $needsHistoricalBaselineCount
- | (
- ($current >= $robustLower and $current <= $robustUpper)
- or ($currentRobustTolerance > 0 and $currentRobustLower <= $robustUpper and $currentRobustUpper >= $robustLower)
- ) as $withinRobustBand
- | ($comparisonMode == "historical" and $measurementKind != "deterministic") as $canUseRobustBandSuppression
- | (
- $baselineMin != null
- and $baselineMax != null
- and $current >= $baselineMin
- and $current <= $baselineMax
- ) as $withinBaselineRange
- | (
- if $baseline <= 0 then "unknown"
- elif $comparisonMode == "paired" and $evidenceDeltaLower > $failBudget then "fail"
- elif $comparisonMode == "paired" and $evidenceDeltaLower > $warnBudget then "warn"
- elif $comparisonMode == "paired" then "pass"
- elif ($delta > $b.failAbs and $current > ($baseline * $b.failRatio)) then "fail"
- elif ($delta > $b.warnAbs and $current > ($baseline * $b.warnRatio)) then "warn"
- else "pass"
- end
- ) as $thresholdStatus
- | (
- policy_enabled($policy) == true
- and $baseline > 0
- and (if $needsHistoricalBaselineCount then $baselineSources >= ($policy.minBaselineSources // 1) else true end)
- and $currentSamples >= ($policy.minCurrentSamples // 1)
- and (if $comparisonMode == "paired" then $pairedSamples >= ($policy.minPairedSamples // 1) else true end)
- and (if $comparisonMode == "paired" then $pairedDeltaMedian != null else true end)
- ) as $gateable
- | (
- if (policy_enabled($policy) != true) then "disabled"
- elif $baseline <= 0 then "missing_baseline"
- elif $needsHistoricalBaselineCount and $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
- elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count"
- elif $comparisonMode == "paired" and $pairedSamples < ($policy.minPairedSamples // 1) then "low_paired_sample_count"
- elif $comparisonMode == "paired" and $pairedDeltaMedian == null then "missing_paired_delta"
- else "eligible"
- end
- ) as $gateReason
- | (
- if $baseline <= 0 then "unknown"
- elif (policy_enabled($policy) != true) then "diagnostic"
- elif ($delta | abs_value) <= $noise then "noise_floor"
- elif $needsHistoricalBaselineCount and $baselineSources < ($policy.minBaselineSources // 1) then "low_baseline_count"
- elif $currentSamples < ($policy.minCurrentSamples // 1) then "low_current_sample_count"
- elif $comparisonMode == "paired" and $pairedSamples < ($policy.minPairedSamples // 1) then "low_paired_sample_count"
- elif $comparisonMode == "paired" and $pairedDeltaMedian == null then "missing_paired_delta"
- elif $comparisonMode == "paired" and $thresholdStatus == "pass" and $evidenceDelta > $warnBudget then "paired_uncertain"
- elif ($canUseRobustBandSuppression and $thresholdStatus != "pass" and $withinRobustBand) then "within_robust_band"
- elif $thresholdStatus == "pass" then "within_budget"
- else "threshold_exceeded"
- end
- ) as $confidence
- | (
- if ($gateable and $confidence == "threshold_exceeded") then $thresholdStatus
- elif $thresholdStatus == "unknown" then "unknown"
- else "pass"
- end
- ) as $status
- | (
- if $baseline <= 0 then "unknown"
- elif $comparisonMode == "paired" and ($evidenceDelta | abs_value) <= $noise then "unchanged"
- elif $comparisonMode == "paired" and $evidenceDeltaLower <= 0 and $evidenceDeltaUpper >= 0 then "unchanged"
- elif $comparisonMode == "paired" and $evidenceDelta < 0 then "improved"
- elif $comparisonMode == "paired" then "regressed"
- elif ($delta | abs_value) <= $noise then "unchanged"
- elif $canUseRobustBandSuppression and $withinRobustBand then "unchanged"
- elif $delta < 0 then "improved"
- else "regressed"
- end
- ) as $direction
- | (
- if $baseline <= 0 then null
- elif (policy_enabled($policy) != true) then null
- elif $comparisonMode == "paired" and ($evidenceDeltaLower <= 0 and $evidenceDeltaUpper >= 0) then 0
- elif $comparisonMode == "paired" and ($evidenceDelta | abs_value) <= $noise then 0
- elif $comparisonMode == "paired" and $evidenceDelta > 0 then ([0, $evidenceDeltaLower] | max) / $warnBudget
- elif $comparisonMode == "paired" then -(([0, (-$evidenceDeltaUpper)] | max) / $warnBudget)
- elif $canUseRobustBandSuppression and $withinRobustBand then 0
- elif ($delta | abs_value) <= $noise then 0
- elif ($confidence == "threshold_exceeded" and $delta > 0) then ([0, ($currentRobustLower - $robustUpper), $delta] | max) / $warnBudget
- elif ($confidence == "threshold_exceeded" and $delta < 0) then -(([0, ($robustLower - $currentRobustUpper), (-$delta)] | max) / $warnBudget)
- elif $delta > 0 then ([0, ($currentRobustLower - $robustUpper)] | max) / $warnBudget
- else -(([0, ($robustLower - $currentRobustUpper)] | max) / $warnBudget)
- end
- ) as $semanticImpactScore
- | (
- if (policy_enabled($policy) != true) then "diagnostic"
- elif $semanticImpactScore == null then "unknown"
- elif $semanticImpactScore == 0 then "neutral"
- elif $semanticImpactScore >= ($failBudget / $warnBudget) then "fail_boundary"
- elif $semanticImpactScore >= 1 then "warn_boundary"
- elif $semanticImpactScore > 0 then "below_warn_boundary"
- else "improvement"
- end
- ) as $semanticImpactKind
- | {status:$status,current:$current,baseline:$baseline,delta:$delta,ratio:$ratio,budget:$b,gatePolicy:$policy,comparisonMode:$comparisonMode,gateable:$gateable,gateReason:$gateReason,confidence:$confidence,direction:$direction,semanticImpactScore:$semanticImpactScore,semanticImpactKind:$semanticImpactKind,semanticWarnBudget:$warnBudget,semanticFailBudget:$failBudget,baselineRobustLower:$robustLower,baselineRobustUpper:$robustUpper,baselineRobustTolerance:$robustTolerance,currentRobustLower:$currentRobustLower,currentRobustUpper:$currentRobustUpper,currentRobustTolerance:$currentRobustTolerance,withinBaselineRange:$withinBaselineRange,pairedSamples:$pairedSamples,evidenceDelta:$evidenceDelta,evidenceDeltaLower:$evidenceDeltaLower,evidenceDeltaUpper:$evidenceDeltaUpper,evidenceDeltaTolerance:$pairedDeltaTolerance,pairedEvidenceQuantile:$pairedEvidenceQuantile,pairedEvidenceProtocol:(if $comparisonMode == "paired" and ($pairedDeltaValues | length) > 0 then "paired-delta-quantile-v1" elif $comparisonMode == "paired" then "paired-summary-robust-band-v1" else null end)};
-
- (observations_by_key($current[0]) | with_entries(.value = observation_stats(.value))) as $currentObs
- | (observations_by_key($baseline[0]) | with_entries(.value = observation_stats(.value))) as $baselineObs
- | (
- $currentObs
- | to_entries
- | map(
- .key as $key
- | .value as $currentValue
- | ($baselineObs[$key] // null) as $baselineValue
- | ($currentValue.observation | observation_policy(.)) as $policy
- | ($policy.comparisonMode // (if ($currentValue.observation.measurementKind // $currentValue.measurementKind) == "deterministic" or ($currentValue.observation.unit // "") != "seconds" then "budget" elif ($currentValue.observation.measurementKind // $currentValue.measurementKind) == "diagnostic" then "diagnostic" else "historical" end)) as $comparisonMode
- | ($currentValue.pairedBaselineValue // null) as $pairedBaselineValue
- | (if $comparisonMode == "paired" and $pairedBaselineValue != null then {
- value: $pairedBaselineValue,
- min: $pairedBaselineValue,
- max: $pairedBaselineValue,
- p25: $pairedBaselineValue,
- p75: $pairedBaselineValue,
- p95: $pairedBaselineValue,
- mad: 0,
- sourceCount: $currentValue.pairedSampleCount
- } else $baselineValue end) as $effectiveBaselineValue
- | {
- key: $key,
- value: (
- if $effectiveBaselineValue == null then
- {
- status: "missing_baseline",
- target: $currentValue.target,
- observation: $currentValue.observation,
- current: $currentValue.value,
- currentSamples: $currentValue.sampleCount,
- baselineSources: 0,
- gatePolicy: $policy,
- comparisonMode: $comparisonMode,
- gateable: false,
- gateReason: "missing_baseline",
- confidence: "missing_baseline",
- direction: "unknown"
- }
- else
- classify(
- $currentValue.observation.name;
- $currentValue.observation.unit;
- ($currentValue.observation.measurementKind // $currentValue.measurementKind);
- $policy;
- $currentValue.value;
- $currentValue.p25;
- $currentValue.p75;
- $currentValue.mad;
- $effectiveBaselineValue.value;
- $effectiveBaselineValue.min;
- $effectiveBaselineValue.max;
- $effectiveBaselineValue.p25;
- $effectiveBaselineValue.p75;
- $effectiveBaselineValue.p95;
- $effectiveBaselineValue.mad;
- $currentValue.sampleCount;
- $effectiveBaselineValue.sourceCount;
- $currentValue.pairedSampleCount;
- $currentValue.pairedDeltaMedianValue;
- $currentValue.pairedDeltaP25Value;
- $currentValue.pairedDeltaP75Value;
- $currentValue.pairedDeltaMadValue;
- ($currentValue.pairedDeltaSampleValues // [])
- ) + {
- target: $currentValue.target,
- observation: $currentValue.observation,
- currentSamples: $currentValue.sampleCount,
- baselineSources: $effectiveBaselineValue.sourceCount,
- baselineMin: $effectiveBaselineValue.min,
- baselineMax: $effectiveBaselineValue.max,
- baselineP25: $effectiveBaselineValue.p25,
- baselineP75: $effectiveBaselineValue.p75,
- baselineP95: $effectiveBaselineValue.p95
- ,baselineMad: $effectiveBaselineValue.mad
- }
- end
- )
- }
- )
- | from_entries
- ) as $comparisons
- | (
- if any($comparisons[]?; .status == "fail") then "fail"
- elif any($comparisons[]?; .status == "warn") then "warn"
- elif any($comparisons[]?;
- (if (.gatePolicy | has("enabled")) then .gatePolicy.enabled else true end)
- and (.gateReason == "missing_baseline"
- or .gateReason == "low_baseline_count"
- or .gateReason == "low_current_sample_count"
- or .gateReason == "low_paired_sample_count"
- or .gateReason == "missing_paired_delta")
- ) then "partial"
- else "pass"
- end
- ) as $status
- | (
- [$comparisons[]?]
+ jq -n \
+ --slurpfile paths "$paths_file" \
+ --argjson schemaVersion 1 \
+ --arg generatedAt "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
+ --arg repository "${GITHUB_REPOSITORY:-unknown}" \
+ --arg branchKind "${GITHUB_EVENT_NAME:-unknown}" \
+ --arg ref "${CI_MEASUREMENT_SUBJECT_REF:-${GITHUB_REF:-unknown}}" \
+ --arg headSha "${CI_MEASUREMENT_SUBJECT_SHA:-${GITHUB_SHA:-unknown}}" \
+ --arg baseSha "${GITHUB_BASE_SHA:-}" \
+ --arg runnerName "${RUNNER_NAME:-unknown}" \
+ --arg runnerOs "${RUNNER_OS:-unknown}" \
+ --arg runnerArch "${RUNNER_ARCH:-unknown}" \
+ --arg runnerClass "${RUNNER_CLASS:-unknown}" \
+ --arg githubRunId "${GITHUB_RUN_ID:-unknown}" \
+ --arg githubRunAttempt "${GITHUB_RUN_ATTEMPT:-unknown}" \
+ --arg githubJob "${GITHUB_JOB:-unknown}" \
+ --arg taskId "${CROSSTASK_TASK_ID:-}" \
+ --arg taskAttemptId "${CROSSTASK_ATTEMPT_ID:-}" \
+ --arg traceId "${TRACE_ID:-}" \
+ --arg targetName "$target_name" \
+ --arg targetId "$target_id" \
+ --arg targetLabel "$target_label" \
+ --arg targetGroup "$target_group" \
+ --arg targetDescription "$target_description" \
+ --arg targetSystem "$target_system" \
+ --arg outPath "$out_path" \
+ --argjson buckets '[{"name":"node","label":"Node / pnpm","pathRegex":"node_modules|npm-deps|pnpm"},{"name":"nix-sources","label":"Nix sources","pathRegex":"-source$"},{"name":"rust","label":"Rust","pathRegex":"cargo|rust|rustc"}]' \
+ --argjson targetPath '["nix","closures","packages","megarepo"]' \
+ --argjson gatePolicy '{}' \
+ '
+ ($paths[0] // []) as $closurePaths
+ | ($closurePaths | map(.narSize) | add // 0) as $totalNarSize
+ | ($closurePaths | length) as $pathCount
+ | ($buckets | map(
+ . as $bucket
| {
- enabledCount: (map(select((if (.gatePolicy | has("enabled")) then .gatePolicy.enabled else true end))) | length),
- gateableCount: (map(select(.gateable == true)) | length),
- missingBaselineCount: (map(select(.gateReason == "missing_baseline")) | length),
- lowBaselineCount: (map(select(.gateReason == "low_baseline_count")) | length),
- lowCurrentSampleCount: (map(select(.gateReason == "low_current_sample_count")) | length),
- lowPairedSampleCount: (map(select(.gateReason == "low_paired_sample_count")) | length),
- missingPairedDeltaCount: (map(select(.gateReason == "missing_paired_delta")) | length)
+ name: "nix.closure.bucket.nar_size",
+ id: "nix.closure.bucket.nar_size",
+ label: (($bucket.label // $bucket.name) + " closure size"),
+ group: "nix closure buckets",
+ path: ($targetPath + ["buckets", $bucket.name]),
+ description: ("NAR size contributed by closure paths matching " + $bucket.pathRegex),
+ measurementKind: "deterministic",
+ unit: "bytes",
+ value: (
+ $closurePaths
+ | map(select(.path | test($bucket.pathRegex)) | .narSize)
+ | add // 0
+ ),
+ policy: $gatePolicy,
+ dimensions: { bucket: $bucket.name }
+ }
+ )) as $bucketObservations
+ | {
+ schemaVersion: $schemaVersion,
+ generatedAt: $generatedAt,
+ producer: { name: "effect-utils-ci-measurement", version: 1 },
+ subject: {
+ repo: $repository,
+ branchKind: (if $branchKind == "" then "unknown" else $branchKind end),
+ ref: $ref,
+ headSha: $headSha,
+ baseSha: $baseSha
+ },
+ execution: {
+ provider: (if ($githubRunId != "" and $githubRunId != "unknown") then "github-actions" else "local" end),
+ workflow: "CI",
+ job: $githubJob,
+ runId: $githubRunId,
+ runAttempt: $githubRunAttempt,
+ taskId: $taskId,
+ attemptId: $taskAttemptId,
+ traceId: $traceId,
+ runner: { name: $runnerName, os: $runnerOs, arch: $runnerArch, class: $runnerClass }
+ },
+ target: { kind: "nix-closure", id: $targetId, name: $targetName, label: $targetLabel, group: $targetGroup, path: $targetPath, system: $targetSystem },
+ observations: ([
+ {
+ id: "nix.closure.nar_size",
+ label: "Total closure size",
+ group: "nix closure",
+ path: ($targetPath + ["total", "nar-size"]),
+ description: ("Total NAR size for all paths in " + $targetDescription),
+ name: "nix.closure.nar_size",
+ measurementKind: "deterministic",
+ unit: "bytes",
+ value: $totalNarSize,
+ policy: $gatePolicy,
+ dimensions: { bucket: "total" }
+ },
+ {
+ id: "nix.closure.path_count",
+ label: "Total closure path count",
+ group: "nix closure",
+ path: ($targetPath + ["total", "path-count"]),
+ description: ("Number of store paths in " + $targetDescription),
+ name: "nix.closure.path_count",
+ measurementKind: "deterministic",
+ unit: "count",
+ value: $pathCount,
+ policy: $gatePolicy,
+ dimensions: { bucket: "total" }
}
- | . + {
- nonGateableCount: (.enabledCount - .gateableCount),
- enforceable: (.enabledCount == .gateableCount)
+ ] + $bucketObservations),
+ artifacts: [
+ { name: "nix-closure-path-info", path: "nix-closure-path-info.json", contentType: "application/json" },
+ { name: "nix-closure-paths", path: "nix-closure-paths.json", contentType: "application/json" }
+ ],
+ details: {
+ outPath: $outPath,
+ topPaths: ($closurePaths | sort_by(.narSize) | reverse | .[:30])
+ }
+ }
+ ' >"$artifact_file"
+
+ cat "$artifact_file"
+
+ - name: 'Measure Nix closure: oxlint-npm'
+ shell: bash
+ env:
+ ARTIFACT_DIR: tmp/nix-closure-ci/current/oxlint_npm_package
+ RUNNER_CLASS: '${{ runner.os }}-${{ runner.arch }}'
+ run: |
+ set -euo pipefail
+
+ mkdir -p "$ARTIFACT_DIR"
+ installable='.#oxlint-npm'
+ target_id='oxlint_npm_package'
+ target_name='oxlint-npm'
+ target_label='oxlint npm package'
+ target_group='packages'
+ target_description='the packaged oxlint npm compatibility wrapper closure'
+ artifact_file="$ARTIFACT_DIR/measurements.json"
+ target_system='x86_64-linux'
+
+ out_path="$(nix build --no-update-lock-file --no-link --print-out-paths "$installable")" # build the installable and capture the printed output path
+ path_info="$ARTIFACT_DIR/nix-closure-path-info.json"
+ paths_file="$ARTIFACT_DIR/nix-closure-paths.json"
+
+ nix path-info --recursive --json "$out_path" >"$path_info" # raw metadata for every store path in the closure
+ jq 'if type == "array" then map({ path: .path, narSize: (.narSize // 0) }) else to_entries | map({ path: .key, narSize: (.value.narSize // 0) }) end' "$path_info" >"$paths_file" # reduce to [{path, narSize}]; accepts both the path-keyed object and the older list-of-objects JSON layout
+
+ jq -n \
+ --slurpfile paths "$paths_file" \
+ --argjson schemaVersion 1 \
+ --arg generatedAt "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
+ --arg repository "${GITHUB_REPOSITORY:-unknown}" \
+ --arg branchKind "${GITHUB_EVENT_NAME:-unknown}" \
+ --arg ref "${CI_MEASUREMENT_SUBJECT_REF:-${GITHUB_REF:-unknown}}" \
+ --arg headSha "${CI_MEASUREMENT_SUBJECT_SHA:-${GITHUB_SHA:-unknown}}" \
+ --arg baseSha "${GITHUB_BASE_SHA:-}" \
+ --arg runnerName "${RUNNER_NAME:-unknown}" \
+ --arg runnerOs "${RUNNER_OS:-unknown}" \
+ --arg runnerArch "${RUNNER_ARCH:-unknown}" \
+ --arg runnerClass "${RUNNER_CLASS:-unknown}" \
+ --arg githubRunId "${GITHUB_RUN_ID:-unknown}" \
+ --arg githubRunAttempt "${GITHUB_RUN_ATTEMPT:-unknown}" \
+ --arg githubJob "${GITHUB_JOB:-unknown}" \
+ --arg taskId "${CROSSTASK_TASK_ID:-}" \
+ --arg taskAttemptId "${CROSSTASK_ATTEMPT_ID:-}" \
+ --arg traceId "${TRACE_ID:-}" \
+ --arg targetName "$target_name" \
+ --arg targetId "$target_id" \
+ --arg targetLabel "$target_label" \
+ --arg targetGroup "$target_group" \
+ --arg targetDescription "$target_description" \
+ --arg targetSystem "$target_system" \
+ --arg outPath "$out_path" \
+ --argjson buckets '[{"name":"node","label":"Node / pnpm","pathRegex":"node_modules|npm-deps|pnpm"},{"name":"nix-sources","label":"Nix sources","pathRegex":"-source$"},{"name":"rust","label":"Rust","pathRegex":"cargo|rust|rustc"}]' \
+ --argjson targetPath '["nix","closures","packages","oxlint-npm"]' \
+ --argjson gatePolicy '{}' \
+ '
+ ($paths[0] // []) as $closurePaths
+ | ($closurePaths | map(.narSize) | add // 0) as $totalNarSize
+ | ($closurePaths | length) as $pathCount
+ | ($buckets | map(
+ . as $bucket
+ | {
+ name: "nix.closure.bucket.nar_size",
+ id: "nix.closure.bucket.nar_size",
+ label: (($bucket.label // $bucket.name) + " closure size"),
+ group: "nix closure buckets",
+ path: ($targetPath + ["buckets", $bucket.name]),
+ description: ("NAR size contributed by closure paths matching " + $bucket.pathRegex),
+ measurementKind: "deterministic",
+ unit: "bytes",
+ value: (
+ $closurePaths
+ | map(select(.path | test($bucket.pathRegex)) | .narSize)
+ | add // 0
+ ),
+ policy: $gatePolicy,
+ dimensions: { bucket: $bucket.name }
}
- ) as $readiness
+ )) as $bucketObservations
| {
- schemaVersion:$schemaVersion,
- status:$status,
- mode:$mode,
- readiness:$readiness,
- currentDir:$currentDir,
- baselineDir:$baselineDir,
- comparisons:$comparisons
+ schemaVersion: $schemaVersion,
+ generatedAt: $generatedAt,
+ producer: { name: "effect-utils-ci-measurement", version: 1 },
+ subject: {
+ repo: $repository,
+ branchKind: (if $branchKind == "" then "unknown" else $branchKind end),
+ ref: $ref,
+ headSha: $headSha,
+ baseSha: $baseSha
+ },
+ execution: {
+ provider: (if ($githubRunId != "" and $githubRunId != "unknown") then "github-actions" else "local" end),
+ workflow: "CI",
+ job: $githubJob,
+ runId: $githubRunId,
+ runAttempt: $githubRunAttempt,
+ taskId: $taskId,
+ attemptId: $taskAttemptId,
+ traceId: $traceId,
+ runner: { name: $runnerName, os: $runnerOs, arch: $runnerArch, class: $runnerClass }
+ },
+ target: { kind: "nix-closure", id: $targetId, name: $targetName, label: $targetLabel, group: $targetGroup, path: $targetPath, system: $targetSystem },
+ observations: ([
+ {
+ id: "nix.closure.nar_size",
+ label: "Total closure size",
+ group: "nix closure",
+ path: ($targetPath + ["total", "nar-size"]),
+ description: ("Total NAR size for all paths in " + $targetDescription),
+ name: "nix.closure.nar_size",
+ measurementKind: "deterministic",
+ unit: "bytes",
+ value: $totalNarSize,
+ policy: $gatePolicy,
+ dimensions: { bucket: "total" }
+ },
+ {
+ id: "nix.closure.path_count",
+ label: "Total closure path count",
+ group: "nix closure",
+ path: ($targetPath + ["total", "path-count"]),
+ description: ("Number of store paths in " + $targetDescription),
+ name: "nix.closure.path_count",
+ measurementKind: "deterministic",
+ unit: "count",
+ value: $pathCount,
+ policy: $gatePolicy,
+ dimensions: { bucket: "total" }
+ }
+ ] + $bucketObservations),
+ artifacts: [
+ { name: "nix-closure-path-info", path: "nix-closure-path-info.json", contentType: "application/json" },
+ { name: "nix-closure-paths", path: "nix-closure-paths.json", contentType: "application/json" }
+ ],
+ details: {
+ outPath: $outPath,
+ topPaths: ($closurePaths | sort_by(.narSize) | reverse | .[:30])
+ }
}
- ' >"$comparison_file"
+ ' >"$artifact_file"
+
+ cat "$artifact_file"
+
+ - name: 'Upload CI measurements: nix-closure-measurements'
+ if: always() # upload regardless of how the earlier steps ended
+ uses: actions/upload-artifact@v4
+ with: # the "!" entry in path excludes everything under tmp/nix-closure-ci/baseline
+ name: nix-closure-measurements
+ path: |
+ tmp/nix-closure-ci
+ !tmp/nix-closure-ci/baseline/**
+ if-no-files-found: error # an empty match fails the step instead of passing silently
+ retention-days: 30
+ - name: Save pnpm state
+ if: ${{ success() && steps.restore-pnpm-state.outputs.cache-hit != 'true' }} # save only after a green run whose restore step did not report an exact cache hit
+ uses: actions/cache/save@v4
+ with:
+ path: |
+ ${{ github.workspace }}/.pnpm-home
+ ${{ runner.temp }}/pnpm-store/${{ github.job }}
+ key: "pnpm-state-v1-${{ runner.os }}-${{ runner.arch }}-${{ hashFiles('**/pnpm-lock.yaml') }}" # keyed by OS, architecture and the hash of every pnpm-lock.yaml
+ - name: Nix diagnostics summary
+ if: failure() # runs only after an earlier step in the job has failed
+ shell: bash
+ run: |
+ diag_dir="${NIX_STORE_DIAGNOSTICS_DIR:-}"
+ if [ -z "$diag_dir" ] || [ ! -d "$diag_dir" ]; then
+ echo "## Nix Store Diagnostics" >> "$GITHUB_STEP_SUMMARY"
+ echo "" >> "$GITHUB_STEP_SUMMARY"
+ echo "No diagnostics directory found (validation may have failed before capture)." >> "$GITHUB_STEP_SUMMARY"
+ exit 0 # nothing to summarize; the missing directory is reported, not treated as an error
+ fi
+
+ {
+ echo "## Nix Store Diagnostics"
+ echo ""
+ echo "Temporary instrumentation for #272; remove after root cause is confirmed and CI is stable."
+ echo ""
+ echo "- Diagnostics directory: \`$diag_dir\`"
+ echo "- Tracking issue: https://github.com/overengineeringstudio/effect-utils/issues/272"
+ } >> "$GITHUB_STEP_SUMMARY"
+
+ markers_file="${RUNNER_TEMP:-/tmp}/nix-store-signature-markers.txt"
+ grep -R -n -E "config\\.cachix|cachix\\.package|error: path '/nix/store/.+ is not valid" --exclude="$(basename "$markers_file")" "$diag_dir" > "$markers_file" || true # grep exits non-zero when nothing matches; that must not fail the step
+
+ if [ -s "$markers_file" ]; then
+ {
+ echo ""
+ echo "### Signature markers"
+ echo '```text'
+ head -n 120 "$markers_file" # cap the excerpt written to the summary at 120 lines
+ echo '```'
+ } >> "$GITHUB_STEP_SUMMARY"
+ else
+ echo "" >> "$GITHUB_STEP_SUMMARY"
+ echo "- No signature markers found in captured diagnostics." >> "$GITHUB_STEP_SUMMARY"
+ fi
+ - name: Upload Nix diagnostics artifact
+ if: failure() && env.NIX_STORE_DIAGNOSTICS_DIR != '' # only after a failure, and only when a diagnostics directory variable is set
+ uses: actions/upload-artifact@v4
+ with:
+ name: 'nix-store-diagnostics-${{ github.job }}-${{ runner.os }}-run-${{ github.run_id }}-attempt-${{ github.run_attempt }}' # job, OS, run id and attempt are all part of the artifact name
+ path: ${{ env.NIX_STORE_DIAGNOSTICS_DIR }}
+ if-no-files-found: ignore # an empty diagnostics directory is not an error
+ retention-days: 14
+ - name: Failure note
+ if: failure() # printed only when an earlier step in the job has failed
+ shell: bash
+ run: |
+ echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:"
+ echo " https://github.com/overengineeringstudio/effect-utils/issues/201"
+ concurrency: # group key = workflow, event name, ref, a run-variant segment, then the fixed "-nix-closure-sizes" suffix
+ group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-nix-closure-sizes" # variant segment: measurement-baseline-<ref>, manual-run-<run id>, label-<label name>, or "code"
+ cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }} # false for baseline-ref dispatches and for pull_request labeled/unlabeled events; true otherwise
+ source-shape:
+ runs-on:
+ [namespace-profile-linux-x86-64, 'namespace-features:github.run-id=${{ github.run_id }}']
+ timeout-minutes: 30
+ defaults:
+ run:
+ shell: bash
+ permissions:
+ actions: read
+ contents: write
+ issues: write
+ pull-requests: write
+ env:
+ CI_MEASUREMENT_SUBJECT_REF: ${{ inputs.measurement_baseline_ref || github.event.pull_request.head.ref || github.ref }}
+ CI_MEASUREMENT_SUBJECT_SHA: ${{ inputs.measurement_baseline_ref || github.event.pull_request.head.sha || github.sha }}
+ CI_MEASUREMENT_SUBJECT_LABEL: ${{ inputs.measurement_baseline_label }}
+ CI_MEASUREMENT_ALLOW_PROBE_FAILURES: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && '1' || '' }}
+ steps:
+ - uses: actions/checkout@v6
+ - name: Checkout CI measurement baseline ref
+ if: ${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' }}
+ uses: actions/checkout@v6
+ with:
+ ref: ${{ inputs.measurement_baseline_ref }}
+ - name: 'Measure source shape: effect-utils'
+ shell: bash
+ env:
+ ARTIFACT_DIR: tmp/source-shape-ci/current/effect-utils
+ RUNNER_CLASS: '${{ runner.os }}-${{ runner.arch }}'
+ run: |
+ set -euo pipefail
- baseline_provenance_file="$baseline_dir/baseline-provenance.json"
- if [ -f "$baseline_provenance_file" ]; then
- comparison_with_provenance="$(mktemp)"
- jq --slurpfile baselineProvenance "$baseline_provenance_file" \
- '. + {baselineProvenance: ($baselineProvenance[0] // null)}' \
- "$comparison_file" >"$comparison_with_provenance"
- mv "$comparison_with_provenance" "$comparison_file"
- fi
+ mkdir -p "$ARTIFACT_DIR"
+ target_id='effect_utils'
+ target_name='effect-utils'
+ target_label='effect-utils repository'
+ target_group='source'
+ artifact_file="$ARTIFACT_DIR/measurements.json"
+ target_system="${DEVENV_SYSTEM:-${RUNNER_OS:-unknown}}"
- status="$(jq -r '.status' "$comparison_file")"
- exit_code=0
- case "$status:$mode" in
- fail:fail)
- echo "::error::CI measurement regression detected"
- exit_code=1
- ;;
- fail:*|warn:*)
- echo "::warning::CI measurement regression threshold exceeded"
- ;;
- partial:*)
- echo "::notice::CI measurement comparison is partial because one or more enabled observations are not gateable"
- ;;
- esac
+ SCOPES_JSON='[{"id":"genie_ci_workflow","label":"Genie CI workflow helpers","group":"source / ci","path":["source","effect-utils","genie","ci-workflow"],"includePaths":["genie/ci-workflow",".github/workflows/ci.yml.genie.ts"],"includeExtensions":[".ts"]},{"id":"genie_runtime","label":"Genie runtime","group":"source / genie","path":["source","effect-utils","packages","genie"],"includePaths":["packages/@overeng/genie/src"],"includeExtensions":[".ts",".tsx"]},{"id":"nix_workspace_tools","label":"Nix workspace tools","group":"source / nix","path":["source","effect-utils","nix","workspace-tools"],"includePaths":["nix/workspace-tools"],"includeExtensions":[".nix"]}]' \
+ TARGET_PATH_JSON='["source","effect-utils"]' \
+ TARGET_ID="$target_id" \
+ TARGET_NAME="$target_name" \
+ TARGET_LABEL="$target_label" \
+ TARGET_GROUP="$target_group" \
+ TARGET_SYSTEM="$target_system" \
+ node <<'NODE' >"$artifact_file"
+ const cp = require('node:child_process')
+ const fs = require('node:fs')
+ const path = require('node:path')
- if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then
- {
- echo "### ${CI_MEASUREMENT_PR_COMMENT_TITLE:-CI Measurements}"
- echo ""
- jq -r '"- Status: " + .status + "\n- Gate: " + (if .mode == "fail" then "enforced" elif .mode == "warn" then "advisory" elif .mode == "off" then "off" else (.mode // "unknown") end) + "\n- Baseline: " + .baselineDir' "$comparison_file"
- echo ""
- echo "| Status | Gate | Target | Observation | Current | Baseline | Delta | Ratio |"
- echo "| --- | --- | --- | --- | ---: | ---: | ---: | ---: |"
- jq -r '
- .comparisons
- | to_entries
- | sort_by(
- if .value.status == "fail" then 0
- elif .value.status == "warn" then 1
- elif .value.status == "missing_baseline" then 2
- else 3
- end
- )
- | .[:20]
- | .[]
- | .value as $v
- | [
- $v.status,
- (if ($v.gateable // false) then "yes" else ($v.gateReason // "no") end),
- (($v.target.kind // "unknown") + "/" + ($v.target.name // "unknown") + "/" + ($v.target.system // "unknown")),
- ($v.observation.name // "unknown"),
- (($v.current // $v.observation.value // 0) | tostring),
- (($v.baseline // "") | tostring),
- (($v.delta // "") | tostring),
- (if $v.ratio == null or $v.ratio == "" then "" else (($v.ratio * 100 | round / 100) | tostring) end)
- ]
- | "| " + (map(gsub("\\|"; "\\\\|")) | join(" | ")) + " |"
- ' "$comparison_file"
- } >>"$GITHUB_STEP_SUMMARY"
- fi
+ const normalize = (value) => { // Canonicalize a path string: forward slashes only, no leading "./", no trailing "/".
+ const normalized = value.replace(/\\/g, '/').replace(/^\.\//, '').replace(/\/+$/, '')
+ return normalized === '.' ? '' : normalized // a bare "." is mapped to the empty string
+ }
+ const scopes = JSON.parse(process.env.SCOPES_JSON || '[]')
+ const targetPath = JSON.parse(process.env.TARGET_PATH_JSON || '["source"]')
+ const gitFiles = cp // Normalized list of the paths printed by `git ls-files`.
+ .execFileSync('git', ['ls-files', '-z'], { encoding: 'buffer' }) // -z: entries are NUL-terminated
+ .toString('utf8')
+ .split('\0')
+ .filter(Boolean) // drop empty strings, such as the one after the final NUL
+ .map(normalize)
+ const includesPath = (file, candidates) => { // True when file equals one of the candidates or lies beneath one of them.
+ if (!Array.isArray(candidates) || candidates.length === 0) return true // no include list: every file is accepted
+ return candidates.map(normalize).some((candidate) => candidate === '' || file === candidate || file.startsWith(candidate + '/')) // an empty (normalized ".") candidate matches everything; the "/" suffix keeps "a/b" from matching "a/bc"
+ }
+ const excludesPath = (file, candidates) => // True when file equals an exclude candidate or lies beneath one.
+ Array.isArray(candidates) && // a missing or non-array list excludes nothing
+ candidates.map(normalize).some((candidate) => candidate !== '' && (file === candidate || file.startsWith(candidate + '/'))) // an empty candidate is ignored here, so it can never exclude every file
- if [ "$exit_code" -ne 0 ]; then
- exit "$exit_code"
- fi
+ const matchesExtension = (file, extensions) => { // Case-insensitive test of the file's extension against an allow-list.
+ if (!Array.isArray(extensions) || extensions.length === 0) return true // no allow-list: every extension passes
+ const ext = path.extname(file).toLowerCase() // extname keeps the leading dot, e.g. ".ts"
+ return extensions.map((extension) => extension.toLowerCase()).some((extension) => ext === extension)
+ }
+
+ const countLines = (file) => { // Number of lines in the file, or undefined when it contains a NUL byte.
+ const buffer = fs.readFileSync(file)
+ if (buffer.includes(0)) return undefined // a NUL byte is taken as the sign of a binary file; callers skip it
+ if (buffer.length === 0) return 0
+ let lines = 0
+ for (const byte of buffer) {
+ if (byte === 10) lines += 1 // 10 is the line-feed byte
+ }
+ return buffer[buffer.length - 1] === 10 ? lines : lines + 1 // also count a last line that has no trailing newline
+ }
+
+ const observations = []
+ const scopeSummaries = []
+
+ for (const scope of scopes) {
+ const root = normalize(scope.root || '.')
+ const includePaths = Array.isArray(scope.includePaths) && scope.includePaths.length > 0 ? scope.includePaths : [root]
+ const files = gitFiles
+ .filter((file) => includesPath(file, includePaths))
+ .filter((file) => !excludesPath(file, scope.excludePaths))
+ .filter((file) => matchesExtension(file, scope.includeExtensions))
+
+ let lineCount = 0
+ let measuredFileCount = 0
+ for (const file of files) {
+ const lines = countLines(file)
+ if (lines === undefined) continue
+ lineCount += lines
+ measuredFileCount += 1
+ }
+
+ const group = scope.group || 'source shape'
+ const scopePath = Array.isArray(scope.path) ? scope.path : ['source', scope.id]
+ const policy = scope.gate || { enabled: false, minBaselineSources: 3, minCurrentSamples: 1 }
+ observations.push(
+ {
+ id: 'source.lines',
+ label: scope.label + ' lines',
+ group,
+ path: scopePath,
+ description: 'Tracked non-binary source lines in the configured scope.',
+ measurementKind: 'deterministic',
+ name: 'source.lines',
+ unit: 'lines',
+ value: lineCount,
+ dimensions: { scope: scope.id },
+ policy,
+ statistics: { sampleCount: 1, measuredSampleCount: measuredFileCount },
+ },
+ {
+ id: 'source.files',
+ label: scope.label + ' files',
+ group,
+ path: scopePath,
+ description: 'Tracked non-binary source files in the configured scope.',
+ measurementKind: 'deterministic',
+ name: 'source.files',
+ unit: 'count',
+ value: measuredFileCount,
+ dimensions: { scope: scope.id },
+ policy,
+ statistics: { sampleCount: 1, measuredSampleCount: measuredFileCount },
+ },
+ )
+ scopeSummaries.push({
+ id: scope.id,
+ label: scope.label,
+ root,
+ includePaths,
+ excludePaths: scope.excludePaths || [],
+ includeExtensions: scope.includeExtensions || [],
+ fileCount: measuredFileCount,
+ lineCount,
+ })
+ }
+
+ const artifact = {
+ schemaVersion: 1,
+ generatedAt: new Date().toISOString().replace(/\.\d{3}Z$/, 'Z'),
+ producer: {
+ name: 'effect-utils-ci-measurement',
+ version: 1,
+ measurementProtocol: 'source-shape-v1',
+ },
+ subject: {
+ repo: process.env.GITHUB_REPOSITORY || 'unknown',
+ branchKind: process.env.GITHUB_EVENT_NAME || 'unknown',
+ ref: process.env.CI_MEASUREMENT_SUBJECT_REF || process.env.GITHUB_REF || 'unknown',
+ headSha: process.env.CI_MEASUREMENT_SUBJECT_SHA || process.env.GITHUB_SHA || 'unknown',
+ baseSha: process.env.GITHUB_BASE_SHA || '',
+ },
+ execution: {
+ provider: process.env.GITHUB_RUN_ID && process.env.GITHUB_RUN_ID !== 'unknown' ? 'github-actions' : 'local',
+ workflow: 'CI',
+ job: process.env.GITHUB_JOB || 'unknown',
+ runId: process.env.GITHUB_RUN_ID || 'unknown',
+ runAttempt: process.env.GITHUB_RUN_ATTEMPT || 'unknown',
+ taskId: process.env.CROSSTASK_TASK_ID || '',
+ attemptId: process.env.CROSSTASK_ATTEMPT_ID || '',
+ traceId: process.env.TRACE_ID || '',
+ runner: {
+ name: process.env.RUNNER_NAME || 'unknown',
+ os: process.env.RUNNER_OS || 'unknown',
+ arch: process.env.RUNNER_ARCH || 'unknown',
+ class: process.env.RUNNER_CLASS || 'unknown',
+ },
+ },
+ target: {
+ kind: 'source-shape',
+ id: process.env.TARGET_ID,
+ name: process.env.TARGET_NAME,
+ label: process.env.TARGET_LABEL,
+ group: process.env.TARGET_GROUP,
+ path: targetPath,
+ system: process.env.TARGET_SYSTEM,
+ },
+ observations,
+ details: { scopes: scopeSummaries },
+ }
+
+ process.stdout.write(JSON.stringify(artifact, null, 2) + '\n')
+ NODE
+
+ cat "$artifact_file"
- if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
- name: 'Upload CI measurements: source-shape'
if: always()
uses: actions/upload-artifact@v4
diff --git a/.github/workflows/ci.yml.genie.ts b/.github/workflows/ci.yml.genie.ts
index 016fa0b20..a65f5c5e1 100644
--- a/.github/workflows/ci.yml.genie.ts
+++ b/.github/workflows/ci.yml.genie.ts
@@ -456,6 +456,7 @@ const extraJobs: Record = {
},
],
permissions: ciMeasurementsCommentPermissions,
+ compare: false,
prComment: {
enabled: false,
title: 'Devenv Performance',
@@ -483,6 +484,7 @@ const extraJobs: Record = {
baselineMaxRuns: 20,
targets: nixClosureMeasurementTargets,
buckets: defaultNixClosureMeasurementBuckets,
+ compare: false,
regressionMode: 'warn',
prComment: {
enabled: false,
@@ -509,25 +511,6 @@ const extraJobs: Record = {
steps: [
checkoutStep(),
ciMeasurementBaselineCheckoutStep,
- {
- ...downloadPreviousGitHubArtifactStep({
- artifactName: 'source-shape',
- outputDir: `${sourceShapeMeasurementsDir}/baseline`,
- seedRuns: [
- {
- runId: '26085158592',
- label: 'main baseline',
- sha: 'ce7cf8f8ebfaa1da6c7e9122cd195a5f95ce2fca',
- source: 'manual-backfill',
- artifacts: ['source-shape'],
- notes:
- 'Backfilled with the current measurement workflow for the effect-utils #658 rollout.',
- },
- ],
- maxRuns: 20,
- }),
- if: normalCiIf,
- },
sourceShapeMeasurementStep({
artifactDir: `${sourceShapeMeasurementsDir}/current/effect-utils`,
targetId: 'effect_utils',
@@ -562,21 +545,6 @@ const extraJobs: Record = {
},
],
}),
- {
- ...compareCiMeasurementsStep({
- currentDir: `${sourceShapeMeasurementsDir}/current`,
- baselineDir: `${sourceShapeMeasurementsDir}/baseline`,
- outputFile: `${sourceShapeMeasurementsDir}/measurement-comparison.json`,
- regressionMode: 'warn',
- prComment: {
- enabled: false,
- title: 'Source Shape Measurements',
- maxRows: 12,
- maxHistory: 20,
- },
- }),
- if: normalCiIf,
- },
ciMeasurementsArtifactStep({
artifactName: 'source-shape',
path: sourceShapeMeasurementsDir,
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index 2e6816d9a..729e5eb11 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -184,6 +184,7 @@ export type NixClosureMeasurementsStepsOptions = {
readonly targets: readonly [NixClosureMeasurementTarget, ...NixClosureMeasurementTarget[]]
readonly buckets?: readonly NixClosureMeasurementBucket[]
readonly retentionDays?: number
+ readonly compare?: boolean
readonly regressionMode?: 'off' | 'warn' | 'fail'
readonly prComment?: CiMeasurementsComparisonStepOptions['prComment']
}
@@ -356,6 +357,7 @@ export type DevenvPerfJobOptions = {
readonly taskProbes?: readonly DevenvPerfTaskProbe[]
readonly probes?: readonly DevenvPerfProbe[]
readonly retentionDays?: number
+ readonly compare?: boolean
readonly regressionMode?: 'off' | 'warn' | 'fail'
readonly prComment?: CiMeasurementsComparisonStepOptions['prComment']
readonly permissions?: GitHubWorkflowArgs['jobs'][string]['permissions']
@@ -1574,16 +1576,21 @@ export const nixClosureMeasurementSteps = (opts: NixClosureMeasurementsStepsOpti
const artifactDir = opts.artifactDir ?? 'tmp/nix-closure-measurements'
const baselineArtifactName = opts.baselineArtifactName ?? opts.artifactName
const buckets = opts.buckets ?? defaultNixClosureMeasurementBuckets
+ const compare = opts.compare ?? true
return [
- downloadPreviousGitHubArtifactStep({
- artifactName: baselineArtifactName,
- outputDir: `${artifactDir}/baseline`,
- seedRuns: opts.baselineSeedRuns,
- seedRunIds: opts.baselineSeedRunIds,
- maxRuns: opts.baselineMaxRuns,
- maxCandidateRuns: opts.baselineMaxCandidateRuns,
- }),
+ ...(compare
+ ? [
+ downloadPreviousGitHubArtifactStep({
+ artifactName: baselineArtifactName,
+ outputDir: `${artifactDir}/baseline`,
+ seedRuns: opts.baselineSeedRuns,
+ seedRunIds: opts.baselineSeedRunIds,
+ maxRuns: opts.baselineMaxRuns,
+ maxCandidateRuns: opts.baselineMaxCandidateRuns,
+ }),
+ ]
+ : []),
...opts.targets.map((target) =>
nixClosureMeasurementStep({
installable: target.installable,
@@ -1599,13 +1606,17 @@ export const nixClosureMeasurementSteps = (opts: NixClosureMeasurementsStepsOpti
gate: target.gate,
}),
),
- compareCiMeasurementsStep({
- currentDir: `${artifactDir}/current`,
- baselineDir: `${artifactDir}/baseline`,
- outputFile: `${artifactDir}/measurement-comparison.json`,
- regressionMode: opts.regressionMode ?? 'warn',
- prComment: opts.prComment,
- }),
+ ...(compare
+ ? [
+ compareCiMeasurementsStep({
+ currentDir: `${artifactDir}/current`,
+ baselineDir: `${artifactDir}/baseline`,
+ outputFile: `${artifactDir}/measurement-comparison.json`,
+ regressionMode: opts.regressionMode ?? 'warn',
+ prComment: opts.prComment,
+ }),
+ ]
+ : []),
ciMeasurementsArtifactStep({
artifactName: opts.artifactName,
path: artifactDir,
@@ -3416,6 +3427,7 @@ export const devenvPerfJob = (opts?: DevenvPerfJobOptions) => {
opts?.artifactName ??
'devenv-perf-${{ github.job }}-${{ github.run_id }}-attempt-${{ github.run_attempt }}'
const baselineArtifactName = opts?.baselineArtifactName ?? opts?.artifactName
+ const compare = opts?.compare ?? true
const probes = devenvPerfProbes({
taskProbes: opts?.taskProbes ?? [],
probes: opts?.probes ?? [],
@@ -3439,9 +3451,8 @@ export const devenvPerfJob = (opts?: DevenvPerfJobOptions) => {
preparePinnedDevenvStep,
validateNixStoreStep,
]),
- ...(baselineArtifactName === undefined
- ? []
- : [
+ ...(compare && baselineArtifactName !== undefined
+ ? [
downloadPreviousGitHubArtifactStep({
artifactName: baselineArtifactName,
outputDir: `${artifactDir}/baseline`,
@@ -3451,18 +3462,23 @@ export const devenvPerfJob = (opts?: DevenvPerfJobOptions) => {
maxCandidateRuns: opts?.baselineMaxCandidateRuns,
requiredObservations: devenvPerfRequiredBaselineObservations(probes),
}),
- ]),
+ ]
+ : []),
devenvPerfBenchmarkStep({
taskProbes: opts?.taskProbes,
probes: opts?.probes,
}),
- compareCiMeasurementsStep({
- currentDir: artifactDir,
- baselineDir: `${artifactDir}/baseline`,
- outputFile: `${artifactDir}/measurement-comparison.json`,
- regressionMode: opts?.regressionMode ?? 'warn',
- prComment: opts?.prComment,
- }),
+ ...(compare
+ ? [
+ compareCiMeasurementsStep({
+ currentDir: artifactDir,
+ baselineDir: `${artifactDir}/baseline`,
+ outputFile: `${artifactDir}/measurement-comparison.json`,
+ regressionMode: opts?.regressionMode ?? 'warn',
+ prComment: opts?.prComment,
+ }),
+ ]
+ : []),
devenvPerfArtifactStep({
artifactDir,
artifactName,
From 332c3b59607d5956a7053949d67354035d720068 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Wed, 20 May 2026 11:44:35 +0200
Subject: [PATCH 70/81] Bootstrap CI measurement producer tools
---
.github/workflows/ci.yml | 55 +++++++++++++++++++++++++++++++
genie/ci-workflow/measurements.ts | 33 +++++++++++++++++++
2 files changed, 88 insertions(+)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 2dfeb2475..2809189cb 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2434,6 +2434,34 @@ jobs:
run: |
set -euo pipefail
+ ensure_ci_measurement_tool() {
+ tool_name="$1"
+ nix_attr="$2"
+ if command -v "$tool_name" >/dev/null 2>&1; then
+ return 0
+ fi
+ if ! command -v nix >/dev/null 2>&1; then
+ return 1
+ fi
+ if tool_out="$(nix build --no-link --print-out-paths "nixpkgs#$nix_attr" 2>/dev/null)"; then
+ export PATH="$tool_out/bin:$PATH"
+ fi
+ command -v "$tool_name" >/dev/null 2>&1
+ }
+
+ require_ci_measurement_tool() {
+ tool_name="$1"
+ nix_attr="$2"
+ if ensure_ci_measurement_tool "$tool_name" "$nix_attr"; then
+ return 0
+ fi
+ echo "::error::$tool_name is not available; unable to produce CI measurement artifact"
+ exit 1
+ }
+
+ require_ci_measurement_tool awk gawk
+ require_ci_measurement_tool jq jq
+
ARTIFACT_DIR="$(mkdir -p "$ARTIFACT_DIR" && cd "$ARTIFACT_DIR" && pwd -P)"
CI_MEASUREMENT_HEAD_DIR="${CI_MEASUREMENT_HEAD_DIR:-$PWD}"
CI_MEASUREMENT_BASE_DIR="${CI_MEASUREMENT_BASE_DIR:-${RUNNER_TEMP:-/tmp}/ci-measurement-base}"
@@ -3758,6 +3786,33 @@ jobs:
run: |
set -euo pipefail
+ ensure_ci_measurement_tool() {
+ tool_name="$1"
+ nix_attr="$2"
+ if command -v "$tool_name" >/dev/null 2>&1; then
+ return 0
+ fi
+ if ! command -v nix >/dev/null 2>&1; then
+ return 1
+ fi
+ if tool_out="$(nix build --no-link --print-out-paths "nixpkgs#$nix_attr" 2>/dev/null)"; then
+ export PATH="$tool_out/bin:$PATH"
+ fi
+ command -v "$tool_name" >/dev/null 2>&1
+ }
+
+ require_ci_measurement_tool() {
+ tool_name="$1"
+ nix_attr="$2"
+ if ensure_ci_measurement_tool "$tool_name" "$nix_attr"; then
+ return 0
+ fi
+ echo "::error::$tool_name is not available; unable to produce CI measurement artifact"
+ exit 1
+ }
+
+ require_ci_measurement_tool node nodejs
+
mkdir -p "$ARTIFACT_DIR"
target_id='effect_utils'
target_name='effect-utils'
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index 729e5eb11..65554699a 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -539,6 +539,32 @@ const devenvPerfRequiredBaselineObservations = (
.filter((probe) => probe.enabled)
.map(({ id, minSources }) => ({ id, minSources }))
+const ciMeasurementToolBootstrapScript = String.raw`ensure_ci_measurement_tool() {
+ tool_name="$1"
+ nix_attr="$2"
+ if command -v "$tool_name" >/dev/null 2>&1; then
+ return 0
+ fi
+ if ! command -v nix >/dev/null 2>&1; then
+ return 1
+ fi
+ if tool_out="$(nix build --no-link --print-out-paths "nixpkgs#$nix_attr" 2>/dev/null)"; then
+ export PATH="$tool_out/bin:$PATH"
+ fi
+ command -v "$tool_name" >/dev/null 2>&1
+}
+
+require_ci_measurement_tool() {
+ tool_name="$1"
+ nix_attr="$2"
+ if ensure_ci_measurement_tool "$tool_name" "$nix_attr"; then
+ return 0
+ fi
+ echo "::error::$tool_name is not available; unable to produce CI measurement artifact"
+ exit 1
+}
+`
+
const renderDevenvPerfScript = (
opts: Required>,
) => {
@@ -546,6 +572,10 @@ const renderDevenvPerfScript = (
return String.raw`set -euo pipefail
+${ciMeasurementToolBootstrapScript}
+require_ci_measurement_tool awk gawk
+require_ci_measurement_tool jq jq
+
ARTIFACT_DIR="$(mkdir -p "$ARTIFACT_DIR" && cd "$ARTIFACT_DIR" && pwd -P)"
CI_MEASUREMENT_HEAD_DIR="${dollar}{CI_MEASUREMENT_HEAD_DIR:-$PWD}"
CI_MEASUREMENT_BASE_DIR="${dollar}{CI_MEASUREMENT_BASE_DIR:-${dollar}{RUNNER_TEMP:-/tmp}/ci-measurement-base}"
@@ -1668,6 +1698,9 @@ export const sourceShapeMeasurementStep = (opts: SourceShapeMeasurementStepOptio
},
run: String.raw`set -euo pipefail
+${ciMeasurementToolBootstrapScript}
+require_ci_measurement_tool node nodejs
+
mkdir -p "$ARTIFACT_DIR"
target_id=${shellSingleQuote(targetId)}
target_name=${shellSingleQuote(targetName)}
From fc8703b4709398a7720019e441e3a74f49b4b4a0 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Wed, 20 May 2026 12:06:25 +0200
Subject: [PATCH 71/81] Fix CI workflow helper expectations
Align the genie workflow helper test with the generated split measurement workflow: the PR comment now uses the default CI Measurements title, the task probes are emitted as producer measurements, and the seeded baseline is the current main backfill run.
Merge-Queue-Schema: mq.commit.v1
Merge-Queue-Mode: agent-escalated
Merge-Queue-PR: overengineeringstudio/effect-utils#658
Merge-Queue-Attempt-ID: 84c9d5d6-e498-4163-ac30-eda2a7871bad
Merge-Queue-Agent-Session-ID: 0292d490-bca7-5f95-ba14-a23c5c652ed5
---
.../ci-workflow-helpers.unit.test.ts | 14 +++++---------
1 file changed, 5 insertions(+), 9 deletions(-)
diff --git a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
index 70073acf9..f1e4e7c4d 100644
--- a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
+++ b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
@@ -465,15 +465,13 @@ describe('ci workflow devenv perf helpers', () => {
expect(generatedCiWorkflowYamlSource).toContain('devenv-perf-warm-median-v2')
expect(generatedCiWorkflowYamlSource).toContain("CI_MEASUREMENT_PR_COMMENT_ENABLED: 'true'")
expect(generatedCiWorkflowYamlSource).toContain(
- 'CI_MEASUREMENT_PR_COMMENT_TITLE: Devenv Performance',
+ 'CI_MEASUREMENT_PR_COMMENT_TITLE: CI Measurements',
)
expect(generatedCiWorkflowYamlSource).toContain('BASELINE_SEED_RUNS_JSON:')
expect(generatedCiWorkflowYamlSource).toContain('BASELINE_REQUIRED_OBSERVATIONS_JSON:')
expect(generatedCiWorkflowYamlSource).toContain('BASELINE_MAX_CANDIDATE_RUNS:')
- expect(generatedCiWorkflowYamlSource).toContain('"id":"devenv.task_check_quick_warm.duration"')
- expect(generatedCiWorkflowYamlSource).toContain(
- '"id":"devenv.task_check_quick_forced.duration"',
- )
+ expect(generatedCiWorkflowYamlSource).toContain("measure 'task_check_quick_warm'")
+ expect(generatedCiWorkflowYamlSource).toContain("measure 'task_check_quick_forced'")
expect(generatedCiWorkflowYamlSource).not.toContain('"id":"devenv.task_check_quick.duration"')
expect(ciWorkflowSource).toContain(
'requiredObservations?: readonly CiMeasurementRequiredBaselineObservation[]',
@@ -481,10 +479,8 @@ describe('ci workflow devenv perf helpers', () => {
expect(ciWorkflowSource).toContain('baselineMaxCandidateRuns?: number')
expect(ciWorkflowSource).toContain('baseline_requirements_satisfied')
expect(ciWorkflowSource).toContain('observationCounts: ($observationCounts[0] // null)')
- expect(generatedCiWorkflowYamlSource).toContain('"runId":"25959801150"')
- expect(generatedCiWorkflowYamlSource).toContain('"runId":"25959814835"')
- expect(generatedCiWorkflowYamlSource).toContain('"label":"PR #655"')
- expect(generatedCiWorkflowYamlSource).toContain('"label":"PR #632"')
+ expect(generatedCiWorkflowYamlSource).toContain('"runId":"26085158592"')
+ expect(generatedCiWorkflowYamlSource).toContain('"label":"main baseline"')
expect(generatedCiWorkflowYamlSource).toContain('Upload devenv perf artifacts')
expect(generatedCiWorkflowYamlSource).toContain('retention-days: 30')
expect(ciWorkflowSource).toContain("contents: 'write'")
From 327d39306bb855851259ddbbbccc822ef5dda575 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Wed, 20 May 2026 12:15:51 +0200
Subject: [PATCH 72/81] Trigger clean CI run
From 5119eb0d0109293327e2c2675bffa27dd2eed5af Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Wed, 20 May 2026 12:24:52 +0200
Subject: [PATCH 73/81] Fix matrix-safe CI job concurrency
---
.github/workflows/ci.yml | 6 +++---
genie/ci-workflow/shared.ts | 12 +++++++++---
.../github-workflow/ci-workflow-helpers.unit.test.ts | 7 +++++++
3 files changed, 19 insertions(+), 6 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 2809189cb..220db4e5a 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -1203,7 +1203,7 @@ jobs:
echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:"
echo " https://github.com/overengineeringstudio/effect-utils/issues/201"
concurrency:
- group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-test"
+ group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-test-${{ strategy.job-index }}"
cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
nix-check:
if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
@@ -1557,7 +1557,7 @@ jobs:
echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:"
echo " https://github.com/overengineeringstudio/effect-utils/issues/201"
concurrency:
- group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-nix-check"
+ group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-nix-check-${{ strategy.job-index }}"
cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
nix-fod-check:
if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
@@ -1723,7 +1723,7 @@ jobs:
echo "If this looks like Namespace runner Nix store corruption (e.g. \"... is not valid\", \"config.cachix\", \"cachix.package\"), add the run link + full nix-store output to:"
echo " https://github.com/overengineeringstudio/effect-utils/issues/201"
concurrency:
- group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-nix-fod-check"
+ group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}-nix-fod-check-${{ strategy.job-index }}"
cancel-in-progress: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}
pnpm-builder-contract:
if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') }}
diff --git a/genie/ci-workflow/shared.ts b/genie/ci-workflow/shared.ts
index 5c8c736cd..b2652a10d 100644
--- a/genie/ci-workflow/shared.ts
+++ b/genie/ci-workflow/shared.ts
@@ -72,20 +72,26 @@ export const standardCIEnv = {
* allowed to materialize full PR CI. Other label events do not change the
* commit under test and must not cancel an already-running validation run.
*/
-export const ciJobConcurrency = (jobId: string) =>
+export const ciJobConcurrency = (jobId: string, opts?: { readonly matrix?: boolean }) =>
({
group:
"${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '' && format('measurement-baseline-{0}', inputs.measurement_baseline_ref) || (github.event_name == 'workflow_dispatch' && format('manual-run-{0}', github.run_id) || (github.event_name == 'pull_request' && (github.event.action == 'labeled' || github.event.action == 'unlabeled') && format('label-{0}', github.event.label.name) || 'code')) }}" +
- `-${jobId}`,
+ `-${jobId}` +
+ (opts?.matrix === true ? '-${{ strategy.job-index }}' : ''),
'cancel-in-progress':
"${{ !(github.event_name == 'workflow_dispatch' && inputs.measurement_baseline_ref != '') && (github.event_name != 'pull_request' || (github.event.action != 'labeled' && github.event.action != 'unlabeled')) }}",
}) as const
+const isMatrixJob = (job: GitHubWorkflowArgs['jobs'][string]) =>
+ typeof job.strategy === 'object' && job.strategy !== null && 'matrix' in job.strategy
+
const withDefaultJobConcurrency = (jobs: GitHubWorkflowArgs['jobs']): GitHubWorkflowArgs['jobs'] =>
Object.fromEntries(
Object.entries(jobs).map(([jobId, job]) => [
jobId,
- job.concurrency === undefined ? { ...job, concurrency: ciJobConcurrency(jobId) } : job,
+ job.concurrency === undefined
+ ? { ...job, concurrency: ciJobConcurrency(jobId, { matrix: isMatrixJob(job) }) }
+ : job,
]),
)
diff --git a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
index f1e4e7c4d..1433c2993 100644
--- a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
+++ b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
@@ -515,6 +515,13 @@ describe('ci workflow devenv perf helpers', () => {
expect(generatedCiWorkflowYamlSource).not.toMatch(/^concurrency:/m)
expect(generatedCiWorkflowYamlSource).toContain('concurrency:\n group:')
expect(generatedCiWorkflowYamlSource).toContain('}}-typecheck')
+ expect(ciWorkflowSource).toContain('export const ciJobConcurrency = (jobId: string, opts?:')
+ expect(ciWorkflowSource).toContain("opts?.matrix === true ? '-${{ strategy.job-index }}' : ''")
+ expect(ciWorkflowSource).toContain('const isMatrixJob = (job: GitHubWorkflowArgs')
+ expect(generatedCiWorkflowYamlSource).toContain('}}-test-${{ strategy.job-index }}')
+ expect(generatedCiWorkflowYamlSource).toContain(
+ '}}-nix-check-${{ strategy.job-index }}',
+ )
expect(generatedCiWorkflowYamlSource).toContain("format('measurement-baseline-{0}'")
expect(generatedCiWorkflowYamlSource).not.toContain("format('measurement-pr-{0}-run-{1}'")
expect(generatedCiWorkflowYamlSource).not.toContain('inputs.measurement_pr_number')
From c9b03ce1aad8ed5dc6bae8cae986b4c8d61523bd Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Wed, 20 May 2026 12:30:40 +0200
Subject: [PATCH 74/81] Format matrix concurrency test
---
.../runtime/github-workflow/ci-workflow-helpers.unit.test.ts | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
index 1433c2993..d7e153031 100644
--- a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
+++ b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
@@ -519,9 +519,7 @@ describe('ci workflow devenv perf helpers', () => {
expect(ciWorkflowSource).toContain("opts?.matrix === true ? '-${{ strategy.job-index }}' : ''")
expect(ciWorkflowSource).toContain('const isMatrixJob = (job: GitHubWorkflowArgs')
expect(generatedCiWorkflowYamlSource).toContain('}}-test-${{ strategy.job-index }}')
- expect(generatedCiWorkflowYamlSource).toContain(
- '}}-nix-check-${{ strategy.job-index }}',
- )
+ expect(generatedCiWorkflowYamlSource).toContain('}}-nix-check-${{ strategy.job-index }}')
expect(generatedCiWorkflowYamlSource).toContain("format('measurement-baseline-{0}'")
expect(generatedCiWorkflowYamlSource).not.toContain("format('measurement-pr-{0}-run-{1}'")
expect(generatedCiWorkflowYamlSource).not.toContain('inputs.measurement_pr_number')
From 0ad1844092192bcf321cce26b00d5037b90b06c8 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Wed, 20 May 2026 14:03:35 +0200
Subject: [PATCH 75/81] Run merge queue gates on control-plane runners
---
genie/ci-workflow/merge-queue.ts | 6 ++++--
.../github-workflow/ci-workflow-helpers.unit.test.ts | 1 +
2 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/genie/ci-workflow/merge-queue.ts b/genie/ci-workflow/merge-queue.ts
index c2abf56aa..3e7947990 100644
--- a/genie/ci-workflow/merge-queue.ts
+++ b/genie/ci-workflow/merge-queue.ts
@@ -59,6 +59,8 @@ const defaultMergeQueuePermissions = {
'pull-requests': 'read',
} as const
+const defaultMergeQueueControlRunner = 'ubuntu-latest' as const
+
const mergeRequiredAdmissionPermissions = (
permissions: WorkflowPermissions | undefined,
): WorkflowPermissions => {
@@ -211,7 +213,7 @@ export type MergeQueueAdmissionGateJobOptions = MergeQueueAdmissionStepOptions &
}
export const mergeQueueAdmissionGateJob = ({
- runsOn = ['nix'],
+ runsOn = defaultMergeQueueControlRunner,
timeoutMinutes = 5,
...stepOptions
}: MergeQueueAdmissionGateJobOptions = {}): WorkflowJob => ({
@@ -246,7 +248,7 @@ export const mergeQueueAdmissionDeferredLines = (name: string) =>
export const mergeQueueSemanticGateJob = ({
name,
needs,
- runsOn = ['nix'],
+ runsOn = defaultMergeQueueControlRunner,
timeoutMinutes = 20,
tokenExpression = '${{ secrets.GITHUB_TOKEN }}',
}: MergeQueueSemanticGateJobOptions): WorkflowJob => ({
diff --git a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
index d7e153031..2a0d37f40 100644
--- a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
+++ b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
@@ -259,6 +259,7 @@ describe('ci workflow merge queue helpers', () => {
expect(ciWorkflowSource).toContain('export const mergeQueueAdmittedJob')
expect(ciWorkflowSource).toContain('export const mergeQueueSemanticGateJob')
expect(ciWorkflowSource).toContain('export const mergeQueueSemanticGateJobs')
+ expect(ciWorkflowSource).toContain("const defaultMergeQueueControlRunner = 'ubuntu-latest'")
expect(ciWorkflowSource).toContain('trustNeedsAdmission: true')
expect(ciWorkflowSource).toContain('requiredGateCheckName(name)')
})
From 3ca45dc69d04339cb0d986d1d1446451a1165166 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Wed, 20 May 2026 14:10:29 +0200
Subject: [PATCH 76/81] Keep merge queue gates on available runners
---
genie/ci-workflow/merge-queue.ts | 6 ++----
.../github-workflow/ci-workflow-helpers.unit.test.ts | 1 -
2 files changed, 2 insertions(+), 5 deletions(-)
diff --git a/genie/ci-workflow/merge-queue.ts b/genie/ci-workflow/merge-queue.ts
index 3e7947990..c2abf56aa 100644
--- a/genie/ci-workflow/merge-queue.ts
+++ b/genie/ci-workflow/merge-queue.ts
@@ -59,8 +59,6 @@ const defaultMergeQueuePermissions = {
'pull-requests': 'read',
} as const
-const defaultMergeQueueControlRunner = 'ubuntu-latest' as const
-
const mergeRequiredAdmissionPermissions = (
permissions: WorkflowPermissions | undefined,
): WorkflowPermissions => {
@@ -213,7 +211,7 @@ export type MergeQueueAdmissionGateJobOptions = MergeQueueAdmissionStepOptions &
}
export const mergeQueueAdmissionGateJob = ({
- runsOn = defaultMergeQueueControlRunner,
+ runsOn = ['nix'],
timeoutMinutes = 5,
...stepOptions
}: MergeQueueAdmissionGateJobOptions = {}): WorkflowJob => ({
@@ -248,7 +246,7 @@ export const mergeQueueAdmissionDeferredLines = (name: string) =>
export const mergeQueueSemanticGateJob = ({
name,
needs,
- runsOn = defaultMergeQueueControlRunner,
+ runsOn = ['nix'],
timeoutMinutes = 20,
tokenExpression = '${{ secrets.GITHUB_TOKEN }}',
}: MergeQueueSemanticGateJobOptions): WorkflowJob => ({
diff --git a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
index 2a0d37f40..d7e153031 100644
--- a/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
+++ b/packages/@overeng/genie/src/runtime/github-workflow/ci-workflow-helpers.unit.test.ts
@@ -259,7 +259,6 @@ describe('ci workflow merge queue helpers', () => {
expect(ciWorkflowSource).toContain('export const mergeQueueAdmittedJob')
expect(ciWorkflowSource).toContain('export const mergeQueueSemanticGateJob')
expect(ciWorkflowSource).toContain('export const mergeQueueSemanticGateJobs')
- expect(ciWorkflowSource).toContain("const defaultMergeQueueControlRunner = 'ubuntu-latest'")
expect(ciWorkflowSource).toContain('trustNeedsAdmission: true')
expect(ciWorkflowSource).toContain('requiredGateCheckName(name)')
})
From 820c16f31f06bcfcffbd3d95feaf9ad331676a0a Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Wed, 20 May 2026 16:02:45 +0200
Subject: [PATCH 77/81] Retry transient Nix source fetch failures
---
.github/workflows/ci.yml | 160 ++++++++++++---------
genie/ci-scripts/nix-gc-race-retry.sh | 20 +--
genie/ci-scripts/nix-gc-race-retry.test.sh | 25 +++-
genie/ci-workflow/shared.ts | 20 +--
4 files changed, 143 insertions(+), 82 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 220db4e5a..d17ce149e 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -220,7 +220,7 @@ jobs:
local max="${NIX_GC_RACE_MAX_RETRIES:-10}"
local heartbeat="${CI_PROGRESS_HEARTBEAT_SECONDS:-60}"
local attempt=1
- local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature had_errexit
+ local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature saw_fetch_signature had_errexit
start="$(date +%s)"
@@ -268,7 +268,7 @@ jobs:
if [ "$rc" -eq 0 ]; then
echo "::notice::[ci] completed $task in $elapsed s"
if [ "$attempt" -gt 1 ]; then
- write_summary success "Recovered from Nix GC race after retry"
+ write_summary success "Recovered from transient Nix failure after retry"
else
write_summary success
fi
@@ -284,18 +284,22 @@ jobs:
tr -d '[:space:]' || true)
saw_invalid_path=false
saw_cachix_signature=false
+ saw_fetch_signature=false
[ -n "$path" ] && saw_invalid_path=true
printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*Failed to convert config\.cachix to JSON' && saw_cachix_signature=true || true
printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*while evaluating the option.*cachix\.package' && saw_cachix_signature=true || true
+ printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*cannot read file from tarball:[[:space:]]*Truncated tar archive detected while reading data' && saw_fetch_signature=true || true
rm -f "$log"
- if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ]; then
- echo "::warning::[ci] $task failed after $elapsed s without a detected Nix store validity race"
- write_summary failure "No Nix GC race signature detected"
+ if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ] && [ "$saw_fetch_signature" != true ]; then
+ echo "::warning::[ci] $task failed after $elapsed s without a detected transient Nix failure"
+ write_summary failure "No transient Nix failure signature detected"
return "$rc"
fi
- if [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then
+ if [ "$saw_fetch_signature" = true ]; then
+ echo "::warning::Nix source fetch corruption detected for $task (attempt $attempt/$max); retrying with a refreshed eval cache"
+ elif [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then
echo "::warning::Nix store validity race detected for $task via cachix eval wrapper (attempt $attempt/$max): $path"
elif [ "$saw_cachix_signature" = true ]; then
echo "::warning::Nix store validity race detected for $task via cachix eval wrapper without extracted store path (attempt $attempt/$max)"
@@ -310,8 +314,8 @@ jobs:
now=$(date +%s)
elapsed=$((now - start))
- echo "::error::Nix GC race retry exhausted for $task ($max attempts)"
- write_summary failure "Nix GC race retry exhausted"
+ echo "::error::Transient Nix retry exhausted for $task ($max attempts)"
+ write_summary failure "Transient Nix retry exhausted"
return 1
}
EOF
@@ -341,7 +345,7 @@ jobs:
local max="${NIX_GC_RACE_MAX_RETRIES:-10}"
local heartbeat="${CI_PROGRESS_HEARTBEAT_SECONDS:-60}"
local attempt=1
- local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature had_errexit
+ local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature saw_fetch_signature had_errexit
start="$(date +%s)"
@@ -389,7 +393,7 @@ jobs:
if [ "$rc" -eq 0 ]; then
echo "::notice::[ci] completed $task in $elapsed s"
if [ "$attempt" -gt 1 ]; then
- write_summary success "Recovered from Nix GC race after retry"
+ write_summary success "Recovered from transient Nix failure after retry"
else
write_summary success
fi
@@ -405,18 +409,22 @@ jobs:
tr -d '[:space:]' || true)
saw_invalid_path=false
saw_cachix_signature=false
+ saw_fetch_signature=false
[ -n "$path" ] && saw_invalid_path=true
printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*Failed to convert config\.cachix to JSON' && saw_cachix_signature=true || true
printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*while evaluating the option.*cachix\.package' && saw_cachix_signature=true || true
+ printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*cannot read file from tarball:[[:space:]]*Truncated tar archive detected while reading data' && saw_fetch_signature=true || true
rm -f "$log"
- if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ]; then
- echo "::warning::[ci] $task failed after $elapsed s without a detected Nix store validity race"
- write_summary failure "No Nix GC race signature detected"
+ if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ] && [ "$saw_fetch_signature" != true ]; then
+ echo "::warning::[ci] $task failed after $elapsed s without a detected transient Nix failure"
+ write_summary failure "No transient Nix failure signature detected"
return "$rc"
fi
- if [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then
+ if [ "$saw_fetch_signature" = true ]; then
+ echo "::warning::Nix source fetch corruption detected for $task (attempt $attempt/$max); retrying with a refreshed eval cache"
+ elif [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then
echo "::warning::Nix store validity race detected for $task via cachix eval wrapper (attempt $attempt/$max): $path"
elif [ "$saw_cachix_signature" = true ]; then
echo "::warning::Nix store validity race detected for $task via cachix eval wrapper without extracted store path (attempt $attempt/$max)"
@@ -431,8 +439,8 @@ jobs:
now=$(date +%s)
elapsed=$((now - start))
- echo "::error::Nix GC race retry exhausted for $task ($max attempts)"
- write_summary failure "Nix GC race retry exhausted"
+ echo "::error::Transient Nix retry exhausted for $task ($max attempts)"
+ write_summary failure "Transient Nix retry exhausted"
return 1
}
EOF
@@ -692,7 +700,7 @@ jobs:
local max="${NIX_GC_RACE_MAX_RETRIES:-10}"
local heartbeat="${CI_PROGRESS_HEARTBEAT_SECONDS:-60}"
local attempt=1
- local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature had_errexit
+ local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature saw_fetch_signature had_errexit
start="$(date +%s)"
@@ -740,7 +748,7 @@ jobs:
if [ "$rc" -eq 0 ]; then
echo "::notice::[ci] completed $task in $elapsed s"
if [ "$attempt" -gt 1 ]; then
- write_summary success "Recovered from Nix GC race after retry"
+ write_summary success "Recovered from transient Nix failure after retry"
else
write_summary success
fi
@@ -756,18 +764,22 @@ jobs:
tr -d '[:space:]' || true)
saw_invalid_path=false
saw_cachix_signature=false
+ saw_fetch_signature=false
[ -n "$path" ] && saw_invalid_path=true
printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*Failed to convert config\.cachix to JSON' && saw_cachix_signature=true || true
printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*while evaluating the option.*cachix\.package' && saw_cachix_signature=true || true
+ printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*cannot read file from tarball:[[:space:]]*Truncated tar archive detected while reading data' && saw_fetch_signature=true || true
rm -f "$log"
- if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ]; then
- echo "::warning::[ci] $task failed after $elapsed s without a detected Nix store validity race"
- write_summary failure "No Nix GC race signature detected"
+ if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ] && [ "$saw_fetch_signature" != true ]; then
+ echo "::warning::[ci] $task failed after $elapsed s without a detected transient Nix failure"
+ write_summary failure "No transient Nix failure signature detected"
return "$rc"
fi
- if [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then
+ if [ "$saw_fetch_signature" = true ]; then
+ echo "::warning::Nix source fetch corruption detected for $task (attempt $attempt/$max); retrying with a refreshed eval cache"
+ elif [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then
echo "::warning::Nix store validity race detected for $task via cachix eval wrapper (attempt $attempt/$max): $path"
elif [ "$saw_cachix_signature" = true ]; then
echo "::warning::Nix store validity race detected for $task via cachix eval wrapper without extracted store path (attempt $attempt/$max)"
@@ -782,8 +794,8 @@ jobs:
now=$(date +%s)
elapsed=$((now - start))
- echo "::error::Nix GC race retry exhausted for $task ($max attempts)"
- write_summary failure "Nix GC race retry exhausted"
+ echo "::error::Transient Nix retry exhausted for $task ($max attempts)"
+ write_summary failure "Transient Nix retry exhausted"
return 1
}
EOF
@@ -1046,7 +1058,7 @@ jobs:
local max="${NIX_GC_RACE_MAX_RETRIES:-10}"
local heartbeat="${CI_PROGRESS_HEARTBEAT_SECONDS:-60}"
local attempt=1
- local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature had_errexit
+ local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature saw_fetch_signature had_errexit
start="$(date +%s)"
@@ -1094,7 +1106,7 @@ jobs:
if [ "$rc" -eq 0 ]; then
echo "::notice::[ci] completed $task in $elapsed s"
if [ "$attempt" -gt 1 ]; then
- write_summary success "Recovered from Nix GC race after retry"
+ write_summary success "Recovered from transient Nix failure after retry"
else
write_summary success
fi
@@ -1110,18 +1122,22 @@ jobs:
tr -d '[:space:]' || true)
saw_invalid_path=false
saw_cachix_signature=false
+ saw_fetch_signature=false
[ -n "$path" ] && saw_invalid_path=true
printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*Failed to convert config\.cachix to JSON' && saw_cachix_signature=true || true
printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*while evaluating the option.*cachix\.package' && saw_cachix_signature=true || true
+ printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*cannot read file from tarball:[[:space:]]*Truncated tar archive detected while reading data' && saw_fetch_signature=true || true
rm -f "$log"
- if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ]; then
- echo "::warning::[ci] $task failed after $elapsed s without a detected Nix store validity race"
- write_summary failure "No Nix GC race signature detected"
+ if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ] && [ "$saw_fetch_signature" != true ]; then
+ echo "::warning::[ci] $task failed after $elapsed s without a detected transient Nix failure"
+ write_summary failure "No transient Nix failure signature detected"
return "$rc"
fi
- if [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then
+ if [ "$saw_fetch_signature" = true ]; then
+ echo "::warning::Nix source fetch corruption detected for $task (attempt $attempt/$max); retrying with a refreshed eval cache"
+ elif [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then
echo "::warning::Nix store validity race detected for $task via cachix eval wrapper (attempt $attempt/$max): $path"
elif [ "$saw_cachix_signature" = true ]; then
echo "::warning::Nix store validity race detected for $task via cachix eval wrapper without extracted store path (attempt $attempt/$max)"
@@ -1136,8 +1152,8 @@ jobs:
now=$(date +%s)
elapsed=$((now - start))
- echo "::error::Nix GC race retry exhausted for $task ($max attempts)"
- write_summary failure "Nix GC race retry exhausted"
+ echo "::error::Transient Nix retry exhausted for $task ($max attempts)"
+ write_summary failure "Transient Nix retry exhausted"
return 1
}
EOF
@@ -1400,7 +1416,7 @@ jobs:
local max="${NIX_GC_RACE_MAX_RETRIES:-10}"
local heartbeat="${CI_PROGRESS_HEARTBEAT_SECONDS:-60}"
local attempt=1
- local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature had_errexit
+ local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature saw_fetch_signature had_errexit
start="$(date +%s)"
@@ -1448,7 +1464,7 @@ jobs:
if [ "$rc" -eq 0 ]; then
echo "::notice::[ci] completed $task in $elapsed s"
if [ "$attempt" -gt 1 ]; then
- write_summary success "Recovered from Nix GC race after retry"
+ write_summary success "Recovered from transient Nix failure after retry"
else
write_summary success
fi
@@ -1464,18 +1480,22 @@ jobs:
tr -d '[:space:]' || true)
saw_invalid_path=false
saw_cachix_signature=false
+ saw_fetch_signature=false
[ -n "$path" ] && saw_invalid_path=true
printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*Failed to convert config\.cachix to JSON' && saw_cachix_signature=true || true
printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*while evaluating the option.*cachix\.package' && saw_cachix_signature=true || true
+ printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*cannot read file from tarball:[[:space:]]*Truncated tar archive detected while reading data' && saw_fetch_signature=true || true
rm -f "$log"
- if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ]; then
- echo "::warning::[ci] $task failed after $elapsed s without a detected Nix store validity race"
- write_summary failure "No Nix GC race signature detected"
+ if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ] && [ "$saw_fetch_signature" != true ]; then
+ echo "::warning::[ci] $task failed after $elapsed s without a detected transient Nix failure"
+ write_summary failure "No transient Nix failure signature detected"
return "$rc"
fi
- if [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then
+ if [ "$saw_fetch_signature" = true ]; then
+ echo "::warning::Nix source fetch corruption detected for $task (attempt $attempt/$max); retrying with a refreshed eval cache"
+ elif [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then
echo "::warning::Nix store validity race detected for $task via cachix eval wrapper (attempt $attempt/$max): $path"
elif [ "$saw_cachix_signature" = true ]; then
echo "::warning::Nix store validity race detected for $task via cachix eval wrapper without extracted store path (attempt $attempt/$max)"
@@ -1490,8 +1510,8 @@ jobs:
now=$(date +%s)
elapsed=$((now - start))
- echo "::error::Nix GC race retry exhausted for $task ($max attempts)"
- write_summary failure "Nix GC race retry exhausted"
+ echo "::error::Transient Nix retry exhausted for $task ($max attempts)"
+ write_summary failure "Transient Nix retry exhausted"
return 1
}
EOF
@@ -6448,7 +6468,7 @@ jobs:
local max="${NIX_GC_RACE_MAX_RETRIES:-10}"
local heartbeat="${CI_PROGRESS_HEARTBEAT_SECONDS:-60}"
local attempt=1
- local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature had_errexit
+ local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature saw_fetch_signature had_errexit
start="$(date +%s)"
@@ -6496,7 +6516,7 @@ jobs:
if [ "$rc" -eq 0 ]; then
echo "::notice::[ci] completed $task in $elapsed s"
if [ "$attempt" -gt 1 ]; then
- write_summary success "Recovered from Nix GC race after retry"
+ write_summary success "Recovered from transient Nix failure after retry"
else
write_summary success
fi
@@ -6512,18 +6532,22 @@ jobs:
tr -d '[:space:]' || true)
saw_invalid_path=false
saw_cachix_signature=false
+ saw_fetch_signature=false
[ -n "$path" ] && saw_invalid_path=true
printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*Failed to convert config\.cachix to JSON' && saw_cachix_signature=true || true
printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*while evaluating the option.*cachix\.package' && saw_cachix_signature=true || true
+ printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*cannot read file from tarball:[[:space:]]*Truncated tar archive detected while reading data' && saw_fetch_signature=true || true
rm -f "$log"
- if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ]; then
- echo "::warning::[ci] $task failed after $elapsed s without a detected Nix store validity race"
- write_summary failure "No Nix GC race signature detected"
+ if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ] && [ "$saw_fetch_signature" != true ]; then
+ echo "::warning::[ci] $task failed after $elapsed s without a detected transient Nix failure"
+ write_summary failure "No transient Nix failure signature detected"
return "$rc"
fi
- if [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then
+ if [ "$saw_fetch_signature" = true ]; then
+ echo "::warning::Nix source fetch corruption detected for $task (attempt $attempt/$max); retrying with a refreshed eval cache"
+ elif [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then
echo "::warning::Nix store validity race detected for $task via cachix eval wrapper (attempt $attempt/$max): $path"
elif [ "$saw_cachix_signature" = true ]; then
echo "::warning::Nix store validity race detected for $task via cachix eval wrapper without extracted store path (attempt $attempt/$max)"
@@ -6538,8 +6562,8 @@ jobs:
now=$(date +%s)
elapsed=$((now - start))
- echo "::error::Nix GC race retry exhausted for $task ($max attempts)"
- write_summary failure "Nix GC race retry exhausted"
+ echo "::error::Transient Nix retry exhausted for $task ($max attempts)"
+ write_summary failure "Transient Nix retry exhausted"
return 1
}
EOF
@@ -6812,7 +6836,7 @@ jobs:
local max="${NIX_GC_RACE_MAX_RETRIES:-10}"
local heartbeat="${CI_PROGRESS_HEARTBEAT_SECONDS:-60}"
local attempt=1
- local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature had_errexit
+ local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature saw_fetch_signature had_errexit
start="$(date +%s)"
@@ -6860,7 +6884,7 @@ jobs:
if [ "$rc" -eq 0 ]; then
echo "::notice::[ci] completed $task in $elapsed s"
if [ "$attempt" -gt 1 ]; then
- write_summary success "Recovered from Nix GC race after retry"
+ write_summary success "Recovered from transient Nix failure after retry"
else
write_summary success
fi
@@ -6876,18 +6900,22 @@ jobs:
tr -d '[:space:]' || true)
saw_invalid_path=false
saw_cachix_signature=false
+ saw_fetch_signature=false
[ -n "$path" ] && saw_invalid_path=true
printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*Failed to convert config\.cachix to JSON' && saw_cachix_signature=true || true
printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*while evaluating the option.*cachix\.package' && saw_cachix_signature=true || true
+ printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*cannot read file from tarball:[[:space:]]*Truncated tar archive detected while reading data' && saw_fetch_signature=true || true
rm -f "$log"
- if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ]; then
- echo "::warning::[ci] $task failed after $elapsed s without a detected Nix store validity race"
- write_summary failure "No Nix GC race signature detected"
+ if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ] && [ "$saw_fetch_signature" != true ]; then
+ echo "::warning::[ci] $task failed after $elapsed s without a detected transient Nix failure"
+ write_summary failure "No transient Nix failure signature detected"
return "$rc"
fi
- if [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then
+ if [ "$saw_fetch_signature" = true ]; then
+ echo "::warning::Nix source fetch corruption detected for $task (attempt $attempt/$max); retrying with a refreshed eval cache"
+ elif [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then
echo "::warning::Nix store validity race detected for $task via cachix eval wrapper (attempt $attempt/$max): $path"
elif [ "$saw_cachix_signature" = true ]; then
echo "::warning::Nix store validity race detected for $task via cachix eval wrapper without extracted store path (attempt $attempt/$max)"
@@ -6902,8 +6930,8 @@ jobs:
now=$(date +%s)
elapsed=$((now - start))
- echo "::error::Nix GC race retry exhausted for $task ($max attempts)"
- write_summary failure "Nix GC race retry exhausted"
+ echo "::error::Transient Nix retry exhausted for $task ($max attempts)"
+ write_summary failure "Transient Nix retry exhausted"
return 1
}
EOF
@@ -6921,7 +6949,7 @@ jobs:
local max="${NIX_GC_RACE_MAX_RETRIES:-10}"
local heartbeat="${CI_PROGRESS_HEARTBEAT_SECONDS:-60}"
local attempt=1
- local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature had_errexit
+ local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature saw_fetch_signature had_errexit
start="$(date +%s)"
@@ -6969,7 +6997,7 @@ jobs:
if [ "$rc" -eq 0 ]; then
echo "::notice::[ci] completed $task in $elapsed s"
if [ "$attempt" -gt 1 ]; then
- write_summary success "Recovered from Nix GC race after retry"
+ write_summary success "Recovered from transient Nix failure after retry"
else
write_summary success
fi
@@ -6985,18 +7013,22 @@ jobs:
tr -d '[:space:]' || true)
saw_invalid_path=false
saw_cachix_signature=false
+ saw_fetch_signature=false
[ -n "$path" ] && saw_invalid_path=true
printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*Failed to convert config\.cachix to JSON' && saw_cachix_signature=true || true
printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*while evaluating the option.*cachix\.package' && saw_cachix_signature=true || true
+ printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*cannot read file from tarball:[[:space:]]*Truncated tar archive detected while reading data' && saw_fetch_signature=true || true
rm -f "$log"
- if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ]; then
- echo "::warning::[ci] $task failed after $elapsed s without a detected Nix store validity race"
- write_summary failure "No Nix GC race signature detected"
+ if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ] && [ "$saw_fetch_signature" != true ]; then
+ echo "::warning::[ci] $task failed after $elapsed s without a detected transient Nix failure"
+ write_summary failure "No transient Nix failure signature detected"
return "$rc"
fi
- if [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then
+ if [ "$saw_fetch_signature" = true ]; then
+ echo "::warning::Nix source fetch corruption detected for $task (attempt $attempt/$max); retrying with a refreshed eval cache"
+ elif [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then
echo "::warning::Nix store validity race detected for $task via cachix eval wrapper (attempt $attempt/$max): $path"
elif [ "$saw_cachix_signature" = true ]; then
echo "::warning::Nix store validity race detected for $task via cachix eval wrapper without extracted store path (attempt $attempt/$max)"
@@ -7011,8 +7043,8 @@ jobs:
now=$(date +%s)
elapsed=$((now - start))
- echo "::error::Nix GC race retry exhausted for $task ($max attempts)"
- write_summary failure "Nix GC race retry exhausted"
+ echo "::error::Transient Nix retry exhausted for $task ($max attempts)"
+ write_summary failure "Transient Nix retry exhausted"
return 1
}
EOF
diff --git a/genie/ci-scripts/nix-gc-race-retry.sh b/genie/ci-scripts/nix-gc-race-retry.sh
index 3f2bce50a..e7d3d056f 100644
--- a/genie/ci-scripts/nix-gc-race-retry.sh
+++ b/genie/ci-scripts/nix-gc-race-retry.sh
@@ -6,7 +6,7 @@ run_nix_gc_race_retry() {
local max="${NIX_GC_RACE_MAX_RETRIES:-10}"
local heartbeat="${CI_PROGRESS_HEARTBEAT_SECONDS:-60}"
local attempt=1
- local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature had_errexit
+ local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature saw_fetch_signature had_errexit
start="$(date +%s)"
@@ -54,7 +54,7 @@ run_nix_gc_race_retry() {
if [ "$rc" -eq 0 ]; then
echo "::notice::[ci] completed $task in $elapsed s"
if [ "$attempt" -gt 1 ]; then
- write_summary success "Recovered from Nix GC race after retry"
+ write_summary success "Recovered from transient Nix failure after retry"
else
write_summary success
fi
@@ -70,18 +70,22 @@ run_nix_gc_race_retry() {
tr -d '[:space:]' || true)
saw_invalid_path=false
saw_cachix_signature=false
+ saw_fetch_signature=false
[ -n "$path" ] && saw_invalid_path=true
printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*Failed to convert config\.cachix to JSON' && saw_cachix_signature=true || true
printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*while evaluating the option.*cachix\.package' && saw_cachix_signature=true || true
+ printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*cannot read file from tarball:[[:space:]]*Truncated tar archive detected while reading data' && saw_fetch_signature=true || true
rm -f "$log"
- if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ]; then
- echo "::warning::[ci] $task failed after $elapsed s without a detected Nix store validity race"
- write_summary failure "No Nix GC race signature detected"
+ if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ] && [ "$saw_fetch_signature" != true ]; then
+ echo "::warning::[ci] $task failed after $elapsed s without a detected transient Nix failure"
+ write_summary failure "No transient Nix failure signature detected"
return "$rc"
fi
- if [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then
+ if [ "$saw_fetch_signature" = true ]; then
+ echo "::warning::Nix source fetch corruption detected for $task (attempt $attempt/$max); retrying with a refreshed eval cache"
+ elif [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then
echo "::warning::Nix store validity race detected for $task via cachix eval wrapper (attempt $attempt/$max): $path"
elif [ "$saw_cachix_signature" = true ]; then
echo "::warning::Nix store validity race detected for $task via cachix eval wrapper without extracted store path (attempt $attempt/$max)"
@@ -96,7 +100,7 @@ run_nix_gc_race_retry() {
now=$(date +%s)
elapsed=$((now - start))
- echo "::error::Nix GC race retry exhausted for $task ($max attempts)"
- write_summary failure "Nix GC race retry exhausted"
+ echo "::error::Transient Nix retry exhausted for $task ($max attempts)"
+ write_summary failure "Transient Nix retry exhausted"
return 1
}
diff --git a/genie/ci-scripts/nix-gc-race-retry.test.sh b/genie/ci-scripts/nix-gc-race-retry.test.sh
index 7c85a8fc9..300589dc0 100644
--- a/genie/ci-scripts/nix-gc-race-retry.test.sh
+++ b/genie/ci-scripts/nix-gc-race-retry.test.sh
@@ -80,7 +80,28 @@ chmod +x "$cachix_fixture"
CI_PROGRESS_HEARTBEAT_SECONDS=1 NIX_GC_RACE_MAX_RETRIES=2 run_nix_gc_race_retry "cachix-fixture" "$cachix_fixture" >/dev/null
assert_eq "2" "$(cat "$test_dir/cachix-attempt")" "cachix wrapper retry count"
-echo "Test 3: does not retry when literal signature strings appear outside Nix error context"
+echo "Test 3: retries truncated Nix input tarball failures"
+fetch_fixture="$test_dir/fetch-fixture.sh"
+cat > "$fetch_fixture" < "\$attempt_file"
+ echo "error: cannot read file from tarball: Truncated tar archive detected while reading data" >&2
+ exit 1
+fi
+echo "fetch recovered"
+EOF
+chmod +x "$fetch_fixture"
+CI_PROGRESS_HEARTBEAT_SECONDS=1 NIX_GC_RACE_MAX_RETRIES=2 run_nix_gc_race_retry "fetch-fixture" "$fetch_fixture" >/dev/null
+assert_eq "2" "$(cat "$test_dir/fetch-attempt")" "truncated tarball retry count"
+
+echo "Test 4: does not retry when literal signature strings appear outside Nix error context"
false_positive_fixture="$test_dir/false-positive-fixture.sh"
cat > "$false_positive_fixture" <<'EOF'
#!/usr/bin/env bash
@@ -96,7 +117,7 @@ exit_code=$?
set -e
assert_exit_code 9 "$exit_code" "non-error-context strings do not trigger retries"
-echo "Test 4: preserves the original exit code when no GC-race signature is present"
+echo "Test 5: preserves the original exit code when no retry signature is present"
non_retry_fixture="$test_dir/non-retry-fixture.sh"
cat > "$non_retry_fixture" <<'EOF'
#!/usr/bin/env bash
diff --git a/genie/ci-workflow/shared.ts b/genie/ci-workflow/shared.ts
index b2652a10d..45a68bb50 100644
--- a/genie/ci-workflow/shared.ts
+++ b/genie/ci-workflow/shared.ts
@@ -273,7 +273,7 @@ run_nix_gc_race_retry() {
local max="${dollar}{NIX_GC_RACE_MAX_RETRIES:-10}"
local heartbeat="${dollar}{CI_PROGRESS_HEARTBEAT_SECONDS:-60}"
local attempt=1
- local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature had_errexit
+ local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature saw_fetch_signature had_errexit
start="$(date +%s)"
@@ -321,7 +321,7 @@ run_nix_gc_race_retry() {
if [ "$rc" -eq 0 ]; then
echo "::notice::[ci] completed $task in $elapsed s"
if [ "$attempt" -gt 1 ]; then
- write_summary success "Recovered from Nix GC race after retry"
+ write_summary success "Recovered from transient Nix failure after retry"
else
write_summary success
fi
@@ -337,18 +337,22 @@ run_nix_gc_race_retry() {
tr -d '[:space:]' || true)
saw_invalid_path=false
saw_cachix_signature=false
+ saw_fetch_signature=false
[ -n "$path" ] && saw_invalid_path=true
printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*Failed to convert config\.cachix to JSON' && saw_cachix_signature=true || true
printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*while evaluating the option.*cachix\.package' && saw_cachix_signature=true || true
+ printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*cannot read file from tarball:[[:space:]]*Truncated tar archive detected while reading data' && saw_fetch_signature=true || true
rm -f "$log"
- if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ]; then
- echo "::warning::[ci] $task failed after $elapsed s without a detected Nix store validity race"
- write_summary failure "No Nix GC race signature detected"
+ if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ] && [ "$saw_fetch_signature" != true ]; then
+ echo "::warning::[ci] $task failed after $elapsed s without a detected transient Nix failure"
+ write_summary failure "No transient Nix failure signature detected"
return "$rc"
fi
- if [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then
+ if [ "$saw_fetch_signature" = true ]; then
+ echo "::warning::Nix source fetch corruption detected for $task (attempt $attempt/$max); retrying with a refreshed eval cache"
+ elif [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then
echo "::warning::Nix store validity race detected for $task via cachix eval wrapper (attempt $attempt/$max): $path"
elif [ "$saw_cachix_signature" = true ]; then
echo "::warning::Nix store validity race detected for $task via cachix eval wrapper without extracted store path (attempt $attempt/$max)"
@@ -363,8 +367,8 @@ run_nix_gc_race_retry() {
now=$(date +%s)
elapsed=$((now - start))
- echo "::error::Nix GC race retry exhausted for $task ($max attempts)"
- write_summary failure "Nix GC race retry exhausted"
+ echo "::error::Transient Nix retry exhausted for $task ($max attempts)"
+ write_summary failure "Transient Nix retry exhausted"
return 1
}`
From 54169bcb331dfdf85f47f4bed1fae8fe1c520f04 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Wed, 20 May 2026 16:03:12 +0200
Subject: [PATCH 78/81] Refresh effect megarepo lock
---
megarepo.lock | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/megarepo.lock b/megarepo.lock
index 98f82c06e..dc5cc40f0 100644
--- a/megarepo.lock
+++ b/megarepo.lock
@@ -4,9 +4,9 @@
"effect": {
"url": "https://github.com/effect-ts/effect",
"ref": "main",
- "commit": "1a63ec87cd295972b05b51c9b4ad2db9567dc994",
+ "commit": "3585f25110fca7af6aeec3f59c3fc05b20c8d316",
"pinned": false,
- "lockedAt": "2026-05-12T02:00:43.137Z"
+ "lockedAt": "2026-05-20T14:02:25.587Z"
}
}
}
From 61aee5a5f960ebab65805a8ef8d42a9d946fb16e Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Wed, 20 May 2026 16:10:14 +0200
Subject: [PATCH 79/81] Declare baseline ref input for shared concurrency
---
genie/ci-workflow/shared.ts | 28 +++++++++++++++++++++++++++-
1 file changed, 27 insertions(+), 1 deletion(-)
diff --git a/genie/ci-workflow/shared.ts b/genie/ci-workflow/shared.ts
index 45a68bb50..a4ba6ec2a 100644
--- a/genie/ci-workflow/shared.ts
+++ b/genie/ci-workflow/shared.ts
@@ -85,6 +85,31 @@ export const ciJobConcurrency = (jobId: string, opts?: { readonly matrix?: boole
const isMatrixJob = (job: GitHubWorkflowArgs['jobs'][string]) =>
typeof job.strategy === 'object' && job.strategy !== null && 'matrix' in job.strategy
+const workflowDispatchBaselineRefInput = {
+ description:
+ 'Optional ref/SHA to checkout before running CI measurement jobs. Used to backfill comparable baseline artifacts.',
+ required: false,
+ default: '',
+ type: 'string',
+} as const
+
+const withJobConcurrencyDispatchInputs = (on: GitHubWorkflowArgs['on']): GitHubWorkflowArgs['on'] => {
+ if (typeof on !== 'object' || on === null || !('workflow_dispatch' in on) || on.workflow_dispatch === null) {
+ return on
+ }
+
+ return {
+ ...on,
+ workflow_dispatch: {
+ ...on.workflow_dispatch,
+ inputs: {
+ measurement_baseline_ref: workflowDispatchBaselineRefInput,
+ ...on.workflow_dispatch.inputs,
+ },
+ },
+ }
+}
+
const withDefaultJobConcurrency = (jobs: GitHubWorkflowArgs['jobs']): GitHubWorkflowArgs['jobs'] =>
Object.fromEntries(
Object.entries(jobs).map(([jobId, job]) => [
@@ -110,9 +135,10 @@ export const ciWorkflowConcurrency = {
* field, and individual jobs can opt out or provide their own `concurrency`.
*/
export const ciWorkflow = (args: GitHubWorkflowArgs) =>
- (({ concurrency, actionlint, jobs, ...rest }) =>
+ (({ concurrency, actionlint, jobs, on, ...rest }) =>
githubWorkflow({
...rest,
+ on: concurrency === undefined ? withJobConcurrencyDispatchInputs(on) : on,
...(concurrency === undefined ? {} : { concurrency }),
actionlint: actionlint ?? defaultActionlintConfig,
jobs: concurrency === undefined ? withDefaultJobConcurrency(jobs) : jobs,
From 79156f1d9c1a0f8b6cee071b7fcb8e4a612dd5d1 Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Wed, 20 May 2026 17:34:29 +0200
Subject: [PATCH 80/81] Harden CI measurement setup
---
.github/workflows/ci.yml | 147 ++++++++++++++++++++++++++++--
genie/ci-workflow/measurements.ts | 15 ++-
genie/ci-workflow/setup.ts | 5 +-
3 files changed, 154 insertions(+), 13 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index d17ce149e..5f1f7a467 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -1682,16 +1682,127 @@ jobs:
- name: Cold pnpm deps validation
shell: bash
run: |
- set -euo pipefail
- for attr in '.#genie-pnpm-deps' '.#megarepo-pnpm-deps' '.#oxc-config-plugin-pnpm-deps'; do
+ __nix_gc_retry_helper=$(mktemp)
+ cat > "$__nix_gc_retry_helper" <<'EOF'
+ #!/usr/bin/env bash
+
+ run_nix_gc_race_retry() {
+ local task="$1"
+ local command="$2"
+ local max="${NIX_GC_RACE_MAX_RETRIES:-10}"
+ local heartbeat="${CI_PROGRESS_HEARTBEAT_SECONDS:-60}"
+ local attempt=1
+ local log rc path start now elapsed hb_pid flattened saw_invalid_path saw_cachix_signature saw_fetch_signature had_errexit
+
+ start="$(date +%s)"
+
+ write_summary() {
+ [ -n "${GITHUB_STEP_SUMMARY:-}" ] || return 0
+ {
+ echo "### CI Task"
+ echo "- Task: $task"
+ echo "- Status: $1"
+ echo "- Duration: $elapsed s"
+ echo "- Attempts: $attempt/$max"
+ [ -z "${2:-}" ] || echo "- Note: $2"
+ } >> "$GITHUB_STEP_SUMMARY"
+ }
+
+ while [ "$attempt" -le "$max" ]; do
+ echo "::notice::[ci] starting $task (attempt $attempt/$max)"
+ (
+ while sleep "$heartbeat"; do
+ now=$(date +%s)
+ elapsed=$((now - start))
+ echo "::notice::[ci] $task still running after $elapsed s (attempt $attempt/$max)"
+ done
+ ) &
+ hb_pid=$!
+
+ log=$(mktemp)
+ had_errexit=false
+ case $- in
+ *e*) had_errexit=true ;;
+ esac
+ set +e
+ eval "$command" > >(tee -a "$log") 2> >(tee -a "$log" >&2)
+ rc=$?
+ if [ "$had_errexit" = true ]; then
+ set -e
+ fi
+
+ kill "$hb_pid" 2>/dev/null || true
+ wait "$hb_pid" 2>/dev/null || true
+
+ now=$(date +%s)
+ elapsed=$((now - start))
+
+ if [ "$rc" -eq 0 ]; then
+ echo "::notice::[ci] completed $task in $elapsed s"
+ if [ "$attempt" -gt 1 ]; then
+ write_summary success "Recovered from transient Nix failure after retry"
+ else
+ write_summary success
+ fi
+ rm -f "$log"
+ return 0
+ fi
+
+ flattened=$(tr '\r\n' ' ' < "$log" | sed -E $'s/\x1B\[[0-9;]*m//g')
+ path=$(printf '%s' "$flattened" |
+ grep -o "error:[[:space:]]*path '/nix/store/[^']*'[[:space:]]*is not valid" |
+ head -1 |
+ grep -o "/nix/store/[^']*" |
+ tr -d '[:space:]' || true)
+ saw_invalid_path=false
+ saw_cachix_signature=false
+ saw_fetch_signature=false
+ [ -n "$path" ] && saw_invalid_path=true
+ printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*Failed to convert config\.cachix to JSON' && saw_cachix_signature=true || true
+ printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*.*while evaluating the option.*cachix\.package' && saw_cachix_signature=true || true
+ printf '%s' "$flattened" | grep -Eq 'error:[[:space:]]*cannot read file from tarball:[[:space:]]*Truncated tar archive detected while reading data' && saw_fetch_signature=true || true
+ rm -f "$log"
+
+ if [ "$saw_invalid_path" != true ] && [ "$saw_cachix_signature" != true ] && [ "$saw_fetch_signature" != true ]; then
+ echo "::warning::[ci] $task failed after $elapsed s without a detected transient Nix failure"
+ write_summary failure "No transient Nix failure signature detected"
+ return "$rc"
+ fi
+
+ if [ "$saw_fetch_signature" = true ]; then
+ echo "::warning::Nix source fetch corruption detected for $task (attempt $attempt/$max); retrying with a refreshed eval cache"
+ elif [ "$saw_cachix_signature" = true ] && [ -n "$path" ]; then
+ echo "::warning::Nix store validity race detected for $task via cachix eval wrapper (attempt $attempt/$max): $path"
+ elif [ "$saw_cachix_signature" = true ]; then
+ echo "::warning::Nix store validity race detected for $task via cachix eval wrapper without extracted store path (attempt $attempt/$max)"
+ else
+ echo "::warning::Nix store validity race detected for $task (attempt $attempt/$max): $path"
+ fi
+
+ [ -z "$path" ] || nix-store --realise "$path" 2>/dev/null || true
+ rm -rf ~/.cache/nix/eval-cache-*
+ attempt=$((attempt + 1))
+ done
+
+ now=$(date +%s)
+ elapsed=$((now - start))
+ echo "::error::Transient Nix retry exhausted for $task ($max attempts)"
+ write_summary failure "Transient Nix retry exhausted"
+ return 1
+ }
+ EOF
+ . "$__nix_gc_retry_helper"
+ rm -f "$__nix_gc_retry_helper"
+ run_nix_gc_race_retry 'Cold pnpm deps validation' 'set -euo pipefail
+ for attr in '"'"'.#genie-pnpm-deps'"'"' '"'"'.#megarepo-pnpm-deps'"'"' '"'"'.#oxc-config-plugin-pnpm-deps'"'"'; do
echo "::group::rebuild-check $attr"
# Step 1: Realize once (may substitute) so rebuild has a trusted output to compare against.
- nix build --no-link "$attr" --option substituters 'https://cache.nixos.org'
+ nix build --no-link "$attr" --option substituters '"'"'https://cache.nixos.org'"'"'
# Step 2: Rebuild and compare locally. This fails on stale fixed-output hashes without
# relying on whether a shared daemon store made the prior out path disappear.
- nix build --no-link --rebuild "$attr" --option substituters 'https://cache.nixos.org'
+ nix build --no-link --rebuild "$attr" --option substituters '"'"'https://cache.nixos.org'"'"'
echo "::endgroup::"
- done
+ done'
- name: Nix diagnostics summary
if: failure()
shell: bash
@@ -2464,7 +2575,16 @@ jobs:
return 1
fi
if tool_out="$(nix build --no-link --print-out-paths "nixpkgs#$nix_attr" 2>/dev/null)"; then
- export PATH="$tool_out/bin:$PATH"
+ while IFS= read -r tool_path; do
+ [ -n "$tool_path" ] || continue
+ [ -d "$tool_path/bin" ] || continue
+ export PATH="$tool_path/bin:$PATH"
+ if command -v "$tool_name" >/dev/null 2>&1; then
+ return 0
+ fi
+ done <<EOF
+ $tool_out
+ EOF
fi
command -v "$tool_name" >/dev/null 2>&1
}
@@ -2479,8 +2599,8 @@ jobs:
exit 1
}
- require_ci_measurement_tool awk gawk
- require_ci_measurement_tool jq jq
+ require_ci_measurement_tool awk gawk.out
+ require_ci_measurement_tool jq jq.bin
ARTIFACT_DIR="$(mkdir -p "$ARTIFACT_DIR" && cd "$ARTIFACT_DIR" && pwd -P)"
CI_MEASUREMENT_HEAD_DIR="${CI_MEASUREMENT_HEAD_DIR:-$PWD}"
@@ -3816,7 +3936,16 @@ jobs:
return 1
fi
if tool_out="$(nix build --no-link --print-out-paths "nixpkgs#$nix_attr" 2>/dev/null)"; then
- export PATH="$tool_out/bin:$PATH"
+ while IFS= read -r tool_path; do
+ [ -n "$tool_path" ] || continue
+ [ -d "$tool_path/bin" ] || continue
+ export PATH="$tool_path/bin:$PATH"
+ if command -v "$tool_name" >/dev/null 2>&1; then
+ return 0
+ fi
+ done <<EOF
+ $tool_out
+ EOF
fi
command -v "$tool_name" >/dev/null 2>&1
}
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index 65554699a..e23323506 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -549,7 +549,16 @@ const ciMeasurementToolBootstrapScript = String.raw`ensure_ci_measurement_tool()
return 1
fi
if tool_out="$(nix build --no-link --print-out-paths "nixpkgs#$nix_attr" 2>/dev/null)"; then
- export PATH="$tool_out/bin:$PATH"
+ while IFS= read -r tool_path; do
+ [ -n "$tool_path" ] || continue
+ [ -d "$tool_path/bin" ] || continue
+ export PATH="$tool_path/bin:$PATH"
+ if command -v "$tool_name" >/dev/null 2>&1; then
+ return 0
+ fi
+ done <<EOF
+ $tool_out
+ EOF
fi
command -v "$tool_name" >/dev/null 2>&1
}
@@ -573,8 +582,8 @@ const renderDevenvPerfScript = (
return String.raw`set -euo pipefail
${ciMeasurementToolBootstrapScript}
-require_ci_measurement_tool awk gawk
-require_ci_measurement_tool jq jq
+require_ci_measurement_tool awk gawk.out
+require_ci_measurement_tool jq jq.bin
ARTIFACT_DIR="$(mkdir -p "$ARTIFACT_DIR" && cd "$ARTIFACT_DIR" && pwd -P)"
CI_MEASUREMENT_HEAD_DIR="${dollar}{CI_MEASUREMENT_HEAD_DIR:-$PWD}"
diff --git a/genie/ci-workflow/setup.ts b/genie/ci-workflow/setup.ts
index f7c2e7b49..95091eb64 100644
--- a/genie/ci-workflow/setup.ts
+++ b/genie/ci-workflow/setup.ts
@@ -16,6 +16,7 @@ import {
runDevenvTasksBefore,
shellSingleQuote,
standardCIEnv,
+ withGcRaceRetry,
workspaceLocalNixCachePath,
workspaceLocalNixCacheRoot,
type NixBinaryCache,
@@ -735,7 +736,7 @@ export const validateColdPnpmDepsStep = ({
? ''
: ` --option substituters ${shellSingleQuote(substituters.join(' '))}`
- return [
+ const command = [
'set -euo pipefail',
`for attr in ${flakeRefs.map(shellSingleQuote).join(' ')}; do`,
' echo "::group::rebuild-check $attr"',
@@ -747,6 +748,8 @@ export const validateColdPnpmDepsStep = ({
' echo "::endgroup::"',
'done',
].join('\n')
+
+ return withGcRaceRetry({ command, label: name })
})(),
})
From b5220e8bc71249a0bf9a2b7d7defef782d5ba32f Mon Sep 17 00:00:00 2001
From: schickling-assistant
<261620128+schickling-assistant@users.noreply.github.com>
Date: Wed, 20 May 2026 19:19:40 +0200
Subject: [PATCH 81/81] Bound CI measurement baseline downloads
---
.github/workflows/ci.yml | 45 ++++++++++++++++++++++++++-----
genie/ci-workflow/measurements.ts | 16 +++++++++--
2 files changed, 53 insertions(+), 8 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 5f1f7a467..58a0aee1f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -4214,6 +4214,7 @@ jobs:
BASELINE_MAX_RUNS: '20'
BASELINE_MAX_CANDIDATE_RUNS: '60'
BASELINE_REQUIRED_OBSERVATIONS_JSON: '[]'
+ BASELINE_DOWNLOAD_TIMEOUT_SECONDS: '120'
run: |
set -euo pipefail
@@ -4273,6 +4274,10 @@ jobs:
if ! [[ "$max_runs" =~ ^[0-9]+$ ]] || [ "$max_runs" -lt 1 ]; then
max_runs=1
fi
+ download_timeout_seconds="${BASELINE_DOWNLOAD_TIMEOUT_SECONDS:-120}"
+ if ! [[ "$download_timeout_seconds" =~ ^[0-9]+$ ]] || [ "$download_timeout_seconds" -lt 1 ]; then
+ download_timeout_seconds=120
+ fi
write_baseline_observation_counts() {
local measurement_index="$BASELINE_OUTPUT_DIR/baseline-measurement-files.txt"
@@ -4354,7 +4359,7 @@ jobs:
current_artifact_id="$(printf '%s' "$artifact_json" | jq -r '.id')"
current_output_dir="$BASELINE_OUTPUT_DIR/run-$candidate_run"
mkdir -p "$current_output_dir"
- if "$GH_BIN" run download "$candidate_run" \
+ if timeout "$download_timeout_seconds" "$GH_BIN" run download "$candidate_run" \
--repo "$repo" \
--name "$current_artifact_name" \
--dir "$current_output_dir"; then
@@ -4371,7 +4376,13 @@ jobs:
'{runId:$runId, artifactName:$artifactName, artifactId:$artifactId, path:$path}' \
>>"$downloaded_runs_file"
else
- echo "::notice::failed to download baseline artifact $current_artifact_name from run $candidate_run"
+ status="$?"
+ rm -rf "$current_output_dir"
+ if [ "$status" -eq 124 ]; then
+ echo "::notice::timed out after ${download_timeout_seconds}s downloading baseline artifact $current_artifact_name from run $candidate_run; skipping candidate"
+ else
+ echo "::notice::failed to download baseline artifact $current_artifact_name from run $candidate_run (exit $status)"
+ fi
fi
fi
done
@@ -4422,6 +4433,7 @@ jobs:
BASELINE_MAX_RUNS: '20'
BASELINE_MAX_CANDIDATE_RUNS: '60'
BASELINE_REQUIRED_OBSERVATIONS_JSON: '[]'
+ BASELINE_DOWNLOAD_TIMEOUT_SECONDS: '120'
run: |
set -euo pipefail
@@ -4481,6 +4493,10 @@ jobs:
if ! [[ "$max_runs" =~ ^[0-9]+$ ]] || [ "$max_runs" -lt 1 ]; then
max_runs=1
fi
+ download_timeout_seconds="${BASELINE_DOWNLOAD_TIMEOUT_SECONDS:-120}"
+ if ! [[ "$download_timeout_seconds" =~ ^[0-9]+$ ]] || [ "$download_timeout_seconds" -lt 1 ]; then
+ download_timeout_seconds=120
+ fi
write_baseline_observation_counts() {
local measurement_index="$BASELINE_OUTPUT_DIR/baseline-measurement-files.txt"
@@ -4562,7 +4578,7 @@ jobs:
current_artifact_id="$(printf '%s' "$artifact_json" | jq -r '.id')"
current_output_dir="$BASELINE_OUTPUT_DIR/run-$candidate_run"
mkdir -p "$current_output_dir"
- if "$GH_BIN" run download "$candidate_run" \
+ if timeout "$download_timeout_seconds" "$GH_BIN" run download "$candidate_run" \
--repo "$repo" \
--name "$current_artifact_name" \
--dir "$current_output_dir"; then
@@ -4579,7 +4595,13 @@ jobs:
'{runId:$runId, artifactName:$artifactName, artifactId:$artifactId, path:$path}' \
>>"$downloaded_runs_file"
else
- echo "::notice::failed to download baseline artifact $current_artifact_name from run $candidate_run"
+ status="$?"
+ rm -rf "$current_output_dir"
+ if [ "$status" -eq 124 ]; then
+ echo "::notice::timed out after ${download_timeout_seconds}s downloading baseline artifact $current_artifact_name from run $candidate_run; skipping candidate"
+ else
+ echo "::notice::failed to download baseline artifact $current_artifact_name from run $candidate_run (exit $status)"
+ fi
fi
fi
done
@@ -4630,6 +4652,7 @@ jobs:
BASELINE_MAX_RUNS: '20'
BASELINE_MAX_CANDIDATE_RUNS: '60'
BASELINE_REQUIRED_OBSERVATIONS_JSON: '[]'
+ BASELINE_DOWNLOAD_TIMEOUT_SECONDS: '120'
run: |
set -euo pipefail
@@ -4689,6 +4712,10 @@ jobs:
if ! [[ "$max_runs" =~ ^[0-9]+$ ]] || [ "$max_runs" -lt 1 ]; then
max_runs=1
fi
+ download_timeout_seconds="${BASELINE_DOWNLOAD_TIMEOUT_SECONDS:-120}"
+ if ! [[ "$download_timeout_seconds" =~ ^[0-9]+$ ]] || [ "$download_timeout_seconds" -lt 1 ]; then
+ download_timeout_seconds=120
+ fi
write_baseline_observation_counts() {
local measurement_index="$BASELINE_OUTPUT_DIR/baseline-measurement-files.txt"
@@ -4770,7 +4797,7 @@ jobs:
current_artifact_id="$(printf '%s' "$artifact_json" | jq -r '.id')"
current_output_dir="$BASELINE_OUTPUT_DIR/run-$candidate_run"
mkdir -p "$current_output_dir"
- if "$GH_BIN" run download "$candidate_run" \
+ if timeout "$download_timeout_seconds" "$GH_BIN" run download "$candidate_run" \
--repo "$repo" \
--name "$current_artifact_name" \
--dir "$current_output_dir"; then
@@ -4787,7 +4814,13 @@ jobs:
'{runId:$runId, artifactName:$artifactName, artifactId:$artifactId, path:$path}' \
>>"$downloaded_runs_file"
else
- echo "::notice::failed to download baseline artifact $current_artifact_name from run $candidate_run"
+ status="$?"
+ rm -rf "$current_output_dir"
+ if [ "$status" -eq 124 ]; then
+ echo "::notice::timed out after ${download_timeout_seconds}s downloading baseline artifact $current_artifact_name from run $candidate_run; skipping candidate"
+ else
+ echo "::notice::failed to download baseline artifact $current_artifact_name from run $candidate_run (exit $status)"
+ fi
fi
fi
done
diff --git a/genie/ci-workflow/measurements.ts b/genie/ci-workflow/measurements.ts
index e23323506..6341d8f16 100644
--- a/genie/ci-workflow/measurements.ts
+++ b/genie/ci-workflow/measurements.ts
@@ -227,6 +227,7 @@ export type GitHubPreviousArtifactStepOptions = {
readonly seedRunIds?: readonly string[]
readonly maxRuns?: number
readonly maxCandidateRuns?: number
+ readonly downloadTimeoutSeconds?: number
readonly requiredObservations?: readonly CiMeasurementRequiredBaselineObservation[]
readonly tokenExpression?: string
}
@@ -1215,6 +1216,7 @@ export const downloadPreviousGitHubArtifactStep = (opts: GitHubPreviousArtifactS
opts.maxCandidateRuns ?? Math.max((opts.maxRuns ?? 5) * 3, 20),
),
BASELINE_REQUIRED_OBSERVATIONS_JSON: ciMeasurementRequiredObservationsJson(opts),
+ BASELINE_DOWNLOAD_TIMEOUT_SECONDS: String(opts.downloadTimeoutSeconds ?? 120),
},
run: String.raw`set -euo pipefail
@@ -1274,6 +1276,10 @@ max_runs="${dollar}{BASELINE_MAX_RUNS:-5}"
if ! [[ "$max_runs" =~ ^[0-9]+$ ]] || [ "$max_runs" -lt 1 ]; then
max_runs=1
fi
+download_timeout_seconds="${dollar}{BASELINE_DOWNLOAD_TIMEOUT_SECONDS:-120}"
+if ! [[ "$download_timeout_seconds" =~ ^[0-9]+$ ]] || [ "$download_timeout_seconds" -lt 1 ]; then
+ download_timeout_seconds=120
+fi
write_baseline_observation_counts() {
local measurement_index="$BASELINE_OUTPUT_DIR/baseline-measurement-files.txt"
@@ -1355,7 +1361,7 @@ for candidate_run in $candidate_runs; do
current_artifact_id="$(printf '%s' "$artifact_json" | jq -r '.id')"
current_output_dir="$BASELINE_OUTPUT_DIR/run-$candidate_run"
mkdir -p "$current_output_dir"
- if "$GH_BIN" run download "$candidate_run" \
+ if timeout "$download_timeout_seconds" "$GH_BIN" run download "$candidate_run" \
--repo "$repo" \
--name "$current_artifact_name" \
--dir "$current_output_dir"; then
@@ -1372,7 +1378,13 @@ for candidate_run in $candidate_runs; do
'{runId:$runId, artifactName:$artifactName, artifactId:$artifactId, path:$path}' \
>>"$downloaded_runs_file"
else
- echo "::notice::failed to download baseline artifact $current_artifact_name from run $candidate_run"
+ status="$?"
+ rm -rf "$current_output_dir"
+ if [ "$status" -eq 124 ]; then
+ echo "::notice::timed out after ${dollar}{download_timeout_seconds}s downloading baseline artifact $current_artifact_name from run $candidate_run; skipping candidate"
+ else
+ echo "::notice::failed to download baseline artifact $current_artifact_name from run $candidate_run (exit $status)"
+ fi
fi
fi
done