diff --git a/.github/workflows/carl-install-smoke.yml b/.github/workflows/carl-install-smoke.yml index d93e0bc76..e744a4db7 100644 --- a/.github/workflows/carl-install-smoke.yml +++ b/.github/workflows/carl-install-smoke.yml @@ -71,12 +71,37 @@ jobs: - name: Login to ghcr.io (so install.sh can pull pre-built images) run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin + - name: Compute SHORT_SHA for image tag pin + id: shortsha + run: | + # SHA-pinned image tag. Why: smoke MUST exercise images built + # from THIS PR's source — :latest / :canary are mutable and have + # historically gone stale (April 2026: a 2-week-old :canary tag + # silently passed smoke against pre-fix bits, validating nothing + # about the actual PR). 7-char short SHA must match what + # push-image.sh / push-current-arch.sh tag on push (plain `git rev-parse + # --short HEAD` is only a 7-char minimum; git lengthens it in larger repos, so those scripts must use `--short=7`). The image either exists at this + # exact ref (dev pushed for this SHA) or smoke fails with + # "manifest unknown" — at which point the dev applies the + # `ci-build:` label OR rebases onto canary HEAD where a + # fresh image exists. Either way, smoke can never silently pull + # stale bits. + FULL_SHA="${{ github.event.pull_request.head.sha || github.sha }}" + SHORT_SHA="${FULL_SHA:0:7}" + echo "short_sha=$SHORT_SHA" >> "$GITHUB_OUTPUT" + echo "Pinning CONTINUUM_IMAGE_TAG=$SHORT_SHA (from $FULL_SHA)" + - name: Run carl-install smoke env: # Pass the PR HEAD sha so the smoke fetches the install.sh from # THIS PR (not main). Falls back to manual workflow_dispatch input # when not in a PR context. CARL_INSTALL_REF: ${{ github.event.pull_request.head.sha || inputs.install_ref || github.sha }} + # See "Compute SHORT_SHA" step above for the rationale on + # SHA-pinning. install.sh honors $CONTINUUM_IMAGE_TAG and + # substitutes it into docker-compose's + # `ghcr.io/cambriantech/continuum-*:${CONTINUUM_IMAGE_TAG:-latest}`. 
+ CONTINUUM_IMAGE_TAG: ${{ steps.shortsha.outputs.short_sha }} # 25-min cap on the docker-only install. Hybrid (Mac source-build) # path would exceed this — by design, that's the gate firing on # the README/install mismatch. diff --git a/.github/workflows/docker-images.yml b/.github/workflows/docker-images.yml index 1f43ac356..645064128 100644 --- a/.github/workflows/docker-images.yml +++ b/.github/workflows/docker-images.yml @@ -421,177 +421,75 @@ jobs: # Single source of truth, identical failure surface, easy local testing. run: bash scripts/ci/install-and-run-gate.sh - # ── Rebuild Stale Arches (CI auto-rebuild fallback) ──────────────── - # Closes the cross-developer push race that the SHA-revision gate - # surfaces: when one dev pushes, their arch is current but the other - # dev's arch goes stale. Without this job, the off-host dev would - # have to manually rebuild on their machine before the gate passes — - # serial coordination dance that blocks every cross-dev PR. - # - # Per Joel (2026-04-23): "you can't have one [check] that's yaml and - # another that's shell. you have to reuse otherwise they diverge." - # So this job is THIN: pick the right native runner via matrix, - # set up registry auth, then invoke the SAME `scripts/push-current-arch.sh` - # the developer pre-push hook calls. No build logic in CI yaml. When - # push-current-arch.sh changes (new variant, new --label, new arch), - # CI inherits the change automatically. - # - # Slice efficiency: registry buildcache (--cache-from on push-image.sh) - # means unchanged layers (rust base, apt installs, cargo-chef workspace - # deps) replay from cache. Typical incremental rebuild: 5-15 min on - # cache hit, well under the GHA timeout. - # - # See #965 for the full design rationale. - rebuild-stale-amd64: + # CI build escape hatch: fires only when a `ci-build:` label is on + # the PR. Default path remains dev-pushes-from-native-host. 
Label is the + # explicit "the right human can't push, please build" signal — without it + # a stale image hard-fails the merge. Labels: ci-build:{amd64,arm64, + # vulkan,cuda,core,livekit-bridge,all}. + ci-build-on-label: needs: verify-architectures - if: needs.verify-architectures.outputs.stale_amd64 != '[]' - runs-on: ubuntu-latest + if: | + (needs.verify-architectures.outputs.stale_amd64 != '[]' || + needs.verify-architectures.outputs.stale_arm64 != '[]') && + (contains(github.event.pull_request.labels.*.name, 'ci-build:amd64') || + contains(github.event.pull_request.labels.*.name, 'ci-build:arm64') || + contains(github.event.pull_request.labels.*.name, 'ci-build:vulkan') || + contains(github.event.pull_request.labels.*.name, 'ci-build:cuda') || + contains(github.event.pull_request.labels.*.name, 'ci-build:core') || + contains(github.event.pull_request.labels.*.name, 'ci-build:livekit-bridge') || + contains(github.event.pull_request.labels.*.name, 'ci-build:all')) + strategy: + fail-fast: false + matrix: + include: + - arch: amd64 + runs-on: ubuntu-latest + - arch: arm64 + runs-on: ubuntu-24.04-arm + runs-on: ${{ matrix.runs-on }} permissions: contents: read packages: write steps: - uses: actions/checkout@v4 with: - # CRITICAL: check out the PR HEAD, NOT the synthetic merge commit - # GitHub creates by default. Without this, push-current-arch.sh's - # `git rev-parse HEAD` returns the merge SHA, images get labeled - # with that SHA, and verify-image-revisions.sh (which expects - # github.event.pull_request.head.sha) flags them STALE forever. - # 2026-04-24: hit this exact failure — labels said 9dc97ea (merge - # SHA), expected 056978cde (PR HEAD), every rebuild produced more - # mismatched labels. ref: ${{ github.event.pull_request.head.sha || github.sha }} - # Full history needed for the re-check step to invoke - # verify-image-revisions.sh's smart staleness diff (compares - # the older labeled SHA against HEAD to skip rebuilds for - # non-context changes). 
fetch-depth: 0 - # Recursive submodules required: vendor/llama.cpp is checked out - # as a submodule and the docker build CACHED layer references its - # CMakeLists.txt presence. Without this, the rebuild dies with - # "vendor/llama.cpp is empty — host submodule not initialized." - # Bigmama caught this 2026-04-24 after the rebuild-stale-amd64 job - # first fired post-stale-image-gate-restoration. submodules: recursive - - name: Login to ghcr.io - run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - name: Install Rust toolchain (push-current-arch may invoke pre-build cargo checks) - run: | - # We don't actually need a host-side cargo build — push-image.sh - # builds inside the docker buildx context — but if push-current-arch.sh - # ever runs `cargo test` as Phase 0, we need the toolchain present. - # Cheap when not used, prevents a future surprise. - if ! command -v cargo >/dev/null; then - curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable --profile minimal - echo "$HOME/.cargo/bin" >> "$GITHUB_PATH" - fi - - name: Re-check staleness (skip if a human caught up between gate and now) - id: recheck_amd64 - env: - EXPECTED_SHA: ${{ needs.verify-architectures.outputs.expected_sha }} - TAG: pr-${{ github.event.pull_request.number }} - STALE_AMD64_OUT: ${{ runner.temp }}/stale-amd64-recheck.txt - STALE_ARM64_OUT: /dev/null - GHCR_USER: ${{ github.actor }} - GHCR_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - # The verify-architectures gate's stale list is a SNAPSHOT from - # gate-time. If a developer (bigmama on amd64, anvil on arm64) - # pushed the missing arch between gate-time and rebuild-time, the - # rebuild would otherwise burn 30+ min of GHA on work that's - # already done — pure waste. Re-check now and exit early if the - # human path beat us. Costs ~5-10s. 
- bash scripts/verify-image-revisions.sh || true - if [ ! -s "$STALE_AMD64_OUT" ]; then - echo "✅ amd64 staleness resolved between gate and rebuild — skipping." - echo "still_stale=false" >> "$GITHUB_OUTPUT" - else - echo "amd64 still stale, proceeding with rebuild:" - cat "$STALE_AMD64_OUT" - echo "still_stale=true" >> "$GITHUB_OUTPUT" - fi - - name: Rebuild stale amd64 images via push-current-arch.sh - if: steps.recheck_amd64.outputs.still_stale == 'true' - env: - # SKIP_PHASE_0=1: push-image.sh's cargo-test phase needs models on disk - # which CI doesn't have. The slice tests inside test-slices.sh still run - # (HTTP probe + container liveness) — those don't need models. - SKIP_PHASE_0: '1' - # PR_NUMBER lets push-current-arch.sh emit the :pr- tag. Without - # this it falls back to gh-cli lookup which works if gh is logged in. - PR_NUMBER: ${{ github.event.pull_request.number }} - run: | - echo "Rebuilding amd64 images that drifted from HEAD." - echo "Stale list: ${{ needs.verify-architectures.outputs.stale_amd64 }}" - bash scripts/push-current-arch.sh - - rebuild-stale-arm64: - needs: verify-architectures - if: needs.verify-architectures.outputs.stale_arm64 != '[]' - runs-on: ubuntu-24.04-arm - permissions: - contents: read - packages: write - steps: - - uses: actions/checkout@v4 - with: - ref: ${{ github.event.pull_request.head.sha || github.sha }} # PR HEAD, not merge commit — see amd64 job comment - fetch-depth: 0 # full history — see amd64 job comment - submodules: recursive # vendor/llama.cpp — see amd64 job comment - - name: Login to ghcr.io - run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - name: Install Rust toolchain (push-current-arch may invoke pre-build cargo checks) + - run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin + - uses: docker/setup-buildx-action@v3 + - name: 
Install Rust toolchain run: | if ! command -v cargo >/dev/null; then curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable --profile minimal echo "$HOME/.cargo/bin" >> "$GITHUB_PATH" fi - - name: Re-check staleness (skip if a human caught up between gate and now) - id: recheck_arm64 + - name: Resolve VARIANT from labels (most-specific wins) + id: scope env: - EXPECTED_SHA: ${{ needs.verify-architectures.outputs.expected_sha }} - TAG: pr-${{ github.event.pull_request.number }} - STALE_AMD64_OUT: /dev/null - STALE_ARM64_OUT: ${{ runner.temp }}/stale-arm64-recheck.txt - GHCR_USER: ${{ github.actor }} - GHCR_TOKEN: ${{ secrets.GITHUB_TOKEN }} + LABELS: ${{ toJSON(github.event.pull_request.labels.*.name) }} run: | - # See amd64 job comment — re-check at job start so we don't burn - # 30+ min of arm64 GHA when anvil already pushed from a Mac. - bash scripts/verify-image-revisions.sh || true - if [ ! -s "$STALE_ARM64_OUT" ]; then - echo "✅ arm64 staleness resolved between gate and rebuild — skipping." - echo "still_stale=false" >> "$GITHUB_OUTPUT" - else - echo "arm64 still stale, proceeding with rebuild:" - cat "$STALE_ARM64_OUT" - echo "still_stale=true" >> "$GITHUB_OUTPUT" + if echo "$LABELS" | grep -q '"ci-build:vulkan"'; then V=vulkan + elif echo "$LABELS" | grep -q '"ci-build:cuda"'; then V=cuda + elif echo "$LABELS" | grep -q '"ci-build:core"'; then V=core + elif echo "$LABELS" | grep -q '"ci-build:livekit-bridge"'; then V=livekit-bridge + else V= fi - - name: Rebuild stale arm64 images via push-current-arch.sh - if: steps.recheck_arm64.outputs.still_stale == 'true' + echo "variant=$V" >> "$GITHUB_OUTPUT" + - name: Build via push-current-arch.sh (slice tests still run) env: + # SKIP_PHASE_0: CI has no GPU/models for `cargo test -p llama`. + # Phase 2 slice tests (HTTP + GPU-IPC contract) still run. The + # ci-build: label is the human accepting this tradeoff. 
SKIP_PHASE_0: '1' PR_NUMBER: ${{ github.event.pull_request.number }} - run: | - echo "Rebuilding arm64 images that drifted from HEAD." - echo "Stale list: ${{ needs.verify-architectures.outputs.stale_arm64 }}" - bash scripts/push-current-arch.sh + VARIANT: ${{ steps.scope.outputs.variant }} + run: bash scripts/push-current-arch.sh - # ── Final verification (post-rebuild) ──────────────────────────── - # Re-runs the SAME revision-check script after any rebuilds. This - # job is the actual merge gate — verify-architectures' initial run - # is informational + matrix-input only. With both rebuilds done - # (or skipped because nothing was stale), every image at the - # expected tag should now have its revision label matching HEAD. + # Final merge gate. always() so it runs even if ci-build-on-label was + # skipped (no label, or nothing was stale). verify-after-rebuild: - needs: [verify-architectures, rebuild-stale-amd64, rebuild-stale-arm64] - # always() so this job runs even if rebuild-stale-* skipped (which - # they do when verify-architectures had nothing stale OR when no - # docker-relevant changes per the #974 self-aware-skip path). + needs: [verify-architectures, ci-build-on-label] if: always() runs-on: ubuntu-latest steps: diff --git a/scripts/push-current-arch.sh b/scripts/push-current-arch.sh index e2ca7c434..f3f0e4931 100755 --- a/scripts/push-current-arch.sh +++ b/scripts/push-current-arch.sh @@ -46,31 +46,31 @@ ARCH="$(uname -m)" # the builder image's repo tree (vendored or pullable). case "$OS/$ARCH" in Darwin/arm64) - # Mac M-series: linux/arm64 is natively buildable via Docker Desktop's - # Linux VM. Mac uses Metal natively (continuum-core base, not vulkan) - # and Docker Desktop has no GPU passthrough — there's no point shipping - # vulkan/arm64 from this host. Core + livekit-bridge cover the arm64 - # leg. Vulkan + CUDA come from BigMama (linux/amd64). + # Mac M-series via Docker Desktop's linux/arm64 VM. 
Docker Desktop has + # NO GPU passthrough → inside the container there's no Metal (we're in + # Linux), no CUDA, no Vulkan device. continuum-core enforces "lack of + # GPU integration is forbidden" and panics at startup. So core is NOT + # shippable from this host — only livekit-bridge (CPU-only). HOST_PLATFORM="linux/arm64" - HEAVY_VARIANTS=("core" "livekit-bridge") + HEAVY_VARIANTS=("livekit-bridge") ;; Linux/x86_64) - # Linux amd64 (BigMama, Windows WSL2): native platform. Core + vulkan - # + livekit-bridge always; CUDA only when Nvidia driver is present - # (nvidia-smi reports a GPU). Vulkan here covers Linux + Windows WSL2 - # consumer GPU users. + # Linux amd64 (BigMama, Windows WSL2): vulkan + livekit-bridge always + # (vulkan covers Linux + Windows WSL2 consumer GPUs via mesa or vendor + # ICD). CUDA only when Nvidia driver is present. core variant is being + # deprecated (CPU-only is unshippable per architectural rule); not + # built here. HOST_PLATFORM="linux/amd64" - HEAVY_VARIANTS=("core" "vulkan" "livekit-bridge") + HEAVY_VARIANTS=("vulkan" "livekit-bridge") if command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi >/dev/null 2>&1; then HEAVY_VARIANTS+=("cuda") fi ;; Linux/aarch64 | Linux/arm64) - # Linux arm64 (e.g. Raspberry Pi, Nvidia Jetson, ARM cloud host). - # Same logic as Mac: no realistic vulkan/arm64 consumer story, so - # core + livekit-bridge only. + # Linux arm64 (Pi, Jetson, ARM cloud). Same GPU-passthrough constraint + # as Mac arm64 → only livekit-bridge. 
HOST_PLATFORM="linux/arm64" - HEAVY_VARIANTS=("core" "livekit-bridge") + HEAVY_VARIANTS=("livekit-bridge") ;; *) echo "ERROR: push-current-arch.sh — unsupported host $OS/$ARCH" >&2 diff --git a/scripts/verify-image-revisions.sh b/scripts/verify-image-revisions.sh index 306cdf780..65e671273 100755 --- a/scripts/verify-image-revisions.sh +++ b/scripts/verify-image-revisions.sh @@ -261,14 +261,21 @@ if [ "$WARN_ARM64" -ne 0 ]; then echo "" echo "⚠️ arm64 stale on $(wc -l < "$STALE_ARM64_OUT" | tr -d ' ') image(s):" while IFS= read -r REF; do echo " - $REF"; done < "$STALE_ARM64_OUT" - echo " Mac M-series dev: run \`scripts/push-current-arch.sh\` to refresh." - echo " Not blocking — CI auto-rebuild will catch this once #965 lands GitHub arm64 runner support." + echo " Mac M-series dev: \`cd src && npm run docker:push\` to refresh," + echo " OR apply ci-build:arm64 / ci-build:core / ci-build:livekit-bridge label to PR." fi if [ "$FAILED" -ne 0 ]; then echo "" echo "❌ STALE-IMAGE GATE FAILED — amd64 image(s) at :$TAG built from a different commit." - echo " The user-facing target must always be current. Re-push from the Linux/amd64 host and re-run." + echo "" + echo " To unblock:" + echo " 1) Preferred — Linux/amd64 host: cd src && npm run docker:push" + echo " (Phase 0 cargo test + Phase 2 slice tests on real hardware)" + echo " 2) Escape hatch — apply ci-build: label to the PR:" + echo " ci-build:vulkan | ci-build:cuda | ci-build:core |" + echo " ci-build:livekit-bridge | ci-build:amd64 | ci-build:all" + echo " (CI builds the slice; Phase 0 skipped — no GPU/models in CI)" exit 1 fi echo ""