diff --git a/.github/workflows/carl-install-smoke.yml b/.github/workflows/carl-install-smoke.yml
index 27c563935..7ffed4ca8 100644
--- a/.github/workflows/carl-install-smoke.yml
+++ b/.github/workflows/carl-install-smoke.yml
@@ -94,24 +94,31 @@ jobs:
         env:
           # PR HEAD sha so smoke fetches install.sh from THIS PR.
           CARL_INSTALL_REF: ${{ github.event.pull_request.head.sha || inputs.install_ref || github.sha }}
-          # Pin docker images to :pr-N (PR-scoped, mutable per push). Refreshed
-          # by push-image.sh on every dev push, so always reflects this PR's
-          # latest source — but never collides with another PR or canary.
-          # Slices the dev didn't push directly are aliased from :canary by the
-          # dev script (manifest copy, no rebuild). :latest was the prior
-          # default and went 9-14 days stale in April 2026 — never use it for
-          # smoke.
+          # Default to the canary image tag for ALL PR runs (and manual
+          # triggers). Per Joel 2026-05-30: per-PR docker rebuilds aren't
+          # worthwhile at the canary level — image publishing takes a lot of
+          # machines and the build is currently bloated by Node-legacy
+          # surface that the longer-term Rust-core / thin-Node-client
+          # extraction will remove. Image rebuilds are a main-promotion
+          # gate, not a per-PR check.
           #
-          # Resolution priority: PR# > input.image_tag > 'canary'.
-          # On workflow_dispatch (no PR context) the bare `pr-${{ ... }}`
-          # interpolated to 'pr-' (empty after dash), causing install.sh to
-          # miss the registry and fall back to 'will build locally' — which
-          # then ran a full Rust compile of continuum-core-vulkan on the
-          # no-GPU runner and hit the 25-min runner cap (observed run
-          # 25400718464). The conditional below makes manual triggers
-          # default to the canary tag (the cadence we publish on) and lets
-          # operators override via the image_tag input from the UI.
-          CONTINUUM_IMAGE_TAG: ${{ github.event.pull_request.number && format('pr-{0}', github.event.pull_request.number) || inputs.image_tag || 'canary' }}
+          # The previous logic set pr-${PR_NUMBER} for PR runs, which
+          # required `scripts/push-current-arch.sh` to have run for the PR
+          # before the smoke would pass. That published images per PR which
+          # we don't actually need — it just generated "image missing →
+          # silent compose build → 25-min timeout" failures (observed on
+          # #1476 at 25m45s; #1085 from May 11 also has this exact failure
+          # signature). Defaulting to :canary tests the install path
+          # against canary's binary, which is the correct semantic for the
+          # PR-stage gate: validate THIS PR's install.sh + docker-compose
+          # changes; validate the binary at main promotion when fresh
+          # images get built.
+          #
+          # Manual runs (workflow_dispatch) can still override via the
+          # `image_tag` input (useful for explicit pr-N testing when a dev
+          # has pushed pr-N for binary regression work, or for testing a
+          # specific historical canary tag).
+          CONTINUUM_IMAGE_TAG: ${{ inputs.image_tag || 'canary' }}
           # 25-min cap on the docker-only install. Hybrid (Mac source-build)
           # path would exceed this — by design, that's the gate firing on
           # the README/install mismatch.
diff --git a/scripts/ci/carl-install-smoke.sh b/scripts/ci/carl-install-smoke.sh
index 8a59d1074..376848905 100644
--- a/scripts/ci/carl-install-smoke.sh
+++ b/scripts/ci/carl-install-smoke.sh
@@ -73,6 +73,106 @@ teardown() {
 }
 trap teardown EXIT INT TERM
 
+# ── 0. Pre-flight: verify the required ghcr.io image exists ──
+# install.sh has a `compose pull 2>/dev/null || warn ... will build locally`
+# fallback so end users on uncommon architectures (e.g. ports to future
+# phone targets) still have a path. CI must NOT take that fallback —
+# building continuum-core-vulkan from source on the no-GPU GHA runner
+# is a full cargo build --release that takes 25+ minutes and hits
+# CARL_INSTALL_TIMEOUT_SEC, which is exactly the silent downgrade
+# Joel called out 2026-05-30.
+#
+# What broke (concrete): PR #1476 (avatars context fix) fixed the
+# `docker compose build` error; install.sh then proceeded to
+# `compose pull`, which failed (the pr-1476 image had never been
+# pushed), and silently fell through to `compose up` → docker build →
+# cargo build --release → 25min timeout. The avatars fix WORKED; the
+# deeper issue is the silent downgrade after pull failure, which this
+# pre-flight turns into a fast, loud failure.
+#
+# Which tag is checked: CONTINUUM_IMAGE_TAG from the workflow — canary
+# by default (per the carl-install-smoke.yml change in this commit), or
+# whatever the `image_tag` input names (e.g. pr-N). At the PR stage the
+# smoke validates THIS PR's install.sh + docker-compose changes against
+# the published binary; the binary itself is validated at main
+# promotion when fresh images get built.
+#
+# Only the HEAVY Rust binary image (continuum-core-vulkan) must exist
+# pre-built — that's the one whose local build is a 25-min cargo
+# build --release that hits CARL_INSTALL_TIMEOUT_SEC. The lighter TS
+# images (node-server, widget-server, model-init) build in under a
+# minute on either arch per Joel 2026-05-30 — install.sh's fallback
+# building them locally is acceptable, doesn't blow the timeout.
+#
+# This split avoids the precheck mis-firing on the common case where
+# canary has the Rust image fresh but the lighter TS sidecar images
+# haven't been pushed yet under the canary tag.
+#
+# Operator escape hatch: CARL_ALLOW_LOCAL_BUILD=1 opts into install.sh's
+# full fallback — useful when explicitly debugging the heavy build
+# path, NOT for production CI.
+RUST_BINARY_IMAGE="continuum-core-vulkan"
+RESOLVED_TAG="${CONTINUUM_IMAGE_TAG:-canary}"
+RUST_REF="ghcr.io/cambriantech/${RUST_BINARY_IMAGE}:${RESOLVED_TAG}"
+RUST_IMAGE_OK=1
+echo ""
+echo "━━━ pre-flight: verifying heavy ghcr.io image at :${RESOLVED_TAG} ━━━"
+# Capture stderr instead of discarding it: an auth, network or rate-limit
+# failure must not be misreported as "the tag does not exist".
+if INSPECT_ERR="$(docker manifest inspect "$RUST_REF" 2>&1 >/dev/null)"; then
+  echo "  ✓ $RUST_REF"
+else
+  RUST_IMAGE_OK=0
+  echo "  ✗ $RUST_REF (NOT AVAILABLE — heavy build, blocks the smoke)"
+  if [ -n "$INSPECT_ERR" ]; then
+    echo "    docker manifest inspect reported:"
+    printf '%s\n' "$INSPECT_ERR" | sed 's/^/      /'
+  fi
+fi
+echo "  (lighter TS sidecars node-server / widget-server / model-init"
+echo "   will be pulled if present, built locally if not — sub-minute"
+echo "   cost either way; not gated by this pre-flight)"
+
+if [ "$RUST_IMAGE_OK" != "1" ]; then
+  if [ "${CARL_ALLOW_LOCAL_BUILD:-0}" = "1" ]; then
+    # Explicit opt-in: warn and continue instead of printing a "refusing" banner.
+    echo ""
+    echo "⚠️  CARL_ALLOW_LOCAL_BUILD=1 set — continuing into install.sh's local-build"
+    echo "   fallback. This tests the LOCAL build, not the published image, and will"
+    echo "   likely time out at CARL_INSTALL_TIMEOUT_SEC=${CARL_INSTALL_TIMEOUT_SEC:-unset}."
+  else
+    echo ""
+    echo "❌ Required image not available at :${RESOLVED_TAG} — refusing to silently fall"
+    echo "   through to install.sh's local-build path."
+    echo ""
+    echo "   Missing:"
+    echo "     $RUST_REF"
+    echo ""
+    case "$RESOLVED_TAG" in
+      pr-*)
+        echo "   :${RESOLVED_TAG} is a per-PR tag; it exists only after a dev publishes it."
+        echo "   To unblock this run on a build machine that supports the target arch:"
+        echo "     scripts/push-current-arch.sh"
+        echo "   Then re-run this workflow."
+        ;;
+      *)
+        echo "   :${RESOLVED_TAG} is not a per-PR tag, so pushing a pr-N image will NOT"
+        echo "   unblock this run. Check that the tag exists on ghcr.io (and was not"
+        echo "   mistyped in the image_tag input), or re-run with image_tag set to a"
+        echo "   tag that has been published."
+        ;;
+    esac
+    echo ""
+    echo "   If the registry output above shows an auth, network or rate-limit error,"
+    echo "   the image may well exist — fix registry access and re-run."
+    echo ""
+    echo "   Operator override (debugging only, NOT for production CI): set"
+    echo "     CARL_ALLOW_LOCAL_BUILD=1"
+    echo "   in the workflow env to fall through to install.sh's local-build."
+    exit 1
+  fi
+fi
+
 # ── 1. Run Carl's exact install command ───────────────────────
 echo ""
 echo "━━━ running install.sh from $CARL_INSTALL_REF ━━━"