Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions .github/workflows/carl-install-smoke.yml
Original file line number Diff line number Diff line change
Expand Up @@ -71,12 +71,37 @@ jobs:
- name: Login to ghcr.io (so install.sh can pull pre-built images)
run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin

- name: Compute SHORT_SHA for image tag pin
id: shortsha
run: |
# SHA-pinned image tag. Why: smoke MUST exercise images built
# from THIS PR's source — :latest / :canary are mutable and have
# historically gone stale (April 2026: a 2-week-old :canary tag
# silently passed smoke against pre-fix bits, validating nothing
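# about the actual PR). The 7-char short SHA is meant to match what
# push-image.sh / push-current-arch.sh tag on push. NOTE: `git rev-parse
# --short HEAD` guarantees only a minimum length — git's default
# core.abbrev is "auto" and can exceed 7 chars in large repos — so
# confirm the push scripts pin `--short=7`. The image either exists at this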
# exact ref (dev pushed for this SHA) or smoke fails with
# "manifest unknown" — at which point the dev applies the
# `ci-build:<slice>` label OR rebases onto canary HEAD where a
# fresh image exists. Either way, smoke can never silently pull
# stale bits.
FULL_SHA="${{ github.event.pull_request.head.sha || github.sha }}"
SHORT_SHA="${FULL_SHA:0:7}"
echo "short_sha=$SHORT_SHA" >> "$GITHUB_OUTPUT"
echo "Pinning CONTINUUM_IMAGE_TAG=$SHORT_SHA (from $FULL_SHA)"

- name: Run carl-install smoke
env:
# Pass the PR HEAD sha so the smoke fetches the install.sh from
# THIS PR (not main). Outside a PR context, falls back to the manual
# workflow_dispatch `install_ref` input, then to github.sha.
CARL_INSTALL_REF: ${{ github.event.pull_request.head.sha || inputs.install_ref || github.sha }}
# See "Compute SHORT_SHA" step above for the rationale on
# SHA-pinning. install.sh honors $CONTINUUM_IMAGE_TAG and
# substitutes it into docker-compose's
# `ghcr.io/cambriantech/continuum-*:${CONTINUUM_IMAGE_TAG:-latest}`.
CONTINUUM_IMAGE_TAG: ${{ steps.shortsha.outputs.short_sha }}
# 25-min cap on the docker-only install. Hybrid (Mac source-build)
# path would exceed this — by design, that's the gate firing on
# the README/install mismatch.
Expand Down
194 changes: 46 additions & 148 deletions .github/workflows/docker-images.yml
Original file line number Diff line number Diff line change
Expand Up @@ -421,177 +421,75 @@ jobs:
# Single source of truth, identical failure surface, easy local testing.
run: bash scripts/ci/install-and-run-gate.sh

# ── Rebuild Stale Arches (CI auto-rebuild fallback) ────────────────
# Closes the cross-developer push race that the SHA-revision gate
# surfaces: when one dev pushes, their arch is current but the other
# dev's arch goes stale. Without this job, the off-host dev would
# have to manually rebuild on their machine before the gate passes —
# serial coordination dance that blocks every cross-dev PR.
#
# Per Joel (2026-04-23): "you can't have one [check] that's yaml and
# another that's shell. you have to reuse otherwise they diverge."
# So this job is THIN: pick the right native runner via matrix,
# set up registry auth, then invoke the SAME `scripts/push-current-arch.sh`
# the developer pre-push hook calls. No build logic in CI yaml. When
# push-current-arch.sh changes (new variant, new --label, new arch),
# CI inherits the change automatically.
#
# Slice efficiency: registry buildcache (--cache-from on push-image.sh)
# means unchanged layers (rust base, apt installs, cargo-chef workspace
# deps) replay from cache. Typical incremental rebuild: 5-15 min on
# cache hit, well under the GHA timeout.
#
# See #965 for the full design rationale.
rebuild-stale-amd64:
# CI build escape hatch: fires only when verify-architectures reported a
# stale amd64 or arm64 image AND a `ci-build:<slice>` label is on the PR.
# The default path remains dev-pushes-from-native-host. The label is the
# explicit "the right human can't push, please build" signal — without it
# a stale image hard-fails the merge. Labels: ci-build:{amd64,arm64,
# vulkan,cuda,core,livekit-bridge,all}.
ci-build-on-label:
needs: verify-architectures
if: needs.verify-architectures.outputs.stale_amd64 != '[]'
runs-on: ubuntu-latest
if: |
(needs.verify-architectures.outputs.stale_amd64 != '[]' ||
needs.verify-architectures.outputs.stale_arm64 != '[]') &&
(contains(github.event.pull_request.labels.*.name, 'ci-build:amd64') ||
contains(github.event.pull_request.labels.*.name, 'ci-build:arm64') ||
contains(github.event.pull_request.labels.*.name, 'ci-build:vulkan') ||
contains(github.event.pull_request.labels.*.name, 'ci-build:cuda') ||
contains(github.event.pull_request.labels.*.name, 'ci-build:core') ||
contains(github.event.pull_request.labels.*.name, 'ci-build:livekit-bridge') ||
contains(github.event.pull_request.labels.*.name, 'ci-build:all'))
strategy:
fail-fast: false
matrix:
include:
- arch: amd64
runs-on: ubuntu-latest
- arch: arm64
runs-on: ubuntu-24.04-arm
runs-on: ${{ matrix.runs-on }}
permissions:
contents: read
packages: write
steps:
- uses: actions/checkout@v4
with:
# CRITICAL: check out the PR HEAD, NOT the synthetic merge commit
# GitHub creates by default. Without this, push-current-arch.sh's
# `git rev-parse HEAD` returns the merge SHA, images get labeled
# with that SHA, and verify-image-revisions.sh (which expects
# github.event.pull_request.head.sha) flags them STALE forever.
# 2026-04-24: hit this exact failure — labels said 9dc97ea (merge
# SHA), expected 056978cde (PR HEAD), every rebuild produced more
# mismatched labels.
ref: ${{ github.event.pull_request.head.sha || github.sha }}
# Full history needed for the re-check step to invoke
# verify-image-revisions.sh's smart staleness diff (compares
# the older labeled SHA against HEAD to skip rebuilds for
# non-context changes).
fetch-depth: 0
# Recursive submodules required: vendor/llama.cpp is checked out
# as a submodule and the docker build CACHED layer references its
# CMakeLists.txt presence. Without this, the rebuild dies with
# "vendor/llama.cpp is empty — host submodule not initialized."
# Bigmama caught this 2026-04-24 after the rebuild-stale-amd64 job
# first fired post-stale-image-gate-restoration.
submodules: recursive
- name: Login to ghcr.io
run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Install Rust toolchain (push-current-arch may invoke pre-build cargo checks)
run: |
# We don't actually need a host-side cargo build — push-image.sh
# builds inside the docker buildx context — but if push-current-arch.sh
# ever runs `cargo test` as Phase 0, we need the toolchain present.
# Cheap when not used, prevents a future surprise.
if ! command -v cargo >/dev/null; then
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable --profile minimal
echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
fi
- name: Re-check staleness (skip if a human caught up between gate and now)
id: recheck_amd64
env:
EXPECTED_SHA: ${{ needs.verify-architectures.outputs.expected_sha }}
TAG: pr-${{ github.event.pull_request.number }}
STALE_AMD64_OUT: ${{ runner.temp }}/stale-amd64-recheck.txt
STALE_ARM64_OUT: /dev/null
GHCR_USER: ${{ github.actor }}
GHCR_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
# The verify-architectures gate's stale list is a SNAPSHOT from
# gate-time. If a developer (bigmama on amd64, anvil on arm64)
# pushed the missing arch between gate-time and rebuild-time, the
# rebuild would otherwise burn 30+ min of GHA on work that's
# already done — pure waste. Re-check now and exit early if the
# human path beat us. Costs ~5-10s.
bash scripts/verify-image-revisions.sh || true
if [ ! -s "$STALE_AMD64_OUT" ]; then
echo "✅ amd64 staleness resolved between gate and rebuild — skipping."
echo "still_stale=false" >> "$GITHUB_OUTPUT"
else
echo "amd64 still stale, proceeding with rebuild:"
cat "$STALE_AMD64_OUT"
echo "still_stale=true" >> "$GITHUB_OUTPUT"
fi
- name: Rebuild stale amd64 images via push-current-arch.sh
if: steps.recheck_amd64.outputs.still_stale == 'true'
env:
# SKIP_PHASE_0=1: push-image.sh's cargo-test phase needs models on disk
# which CI doesn't have. The slice tests inside test-slices.sh still run
# (HTTP probe + container liveness) — those don't need models.
SKIP_PHASE_0: '1'
# PR_NUMBER lets push-current-arch.sh emit the :pr-<N> tag. Without
# this it falls back to gh-cli lookup which works if gh is logged in.
PR_NUMBER: ${{ github.event.pull_request.number }}
run: |
echo "Rebuilding amd64 images that drifted from HEAD."
echo "Stale list: ${{ needs.verify-architectures.outputs.stale_amd64 }}"
bash scripts/push-current-arch.sh

rebuild-stale-arm64:
needs: verify-architectures
if: needs.verify-architectures.outputs.stale_arm64 != '[]'
runs-on: ubuntu-24.04-arm
permissions:
contents: read
packages: write
steps:
- uses: actions/checkout@v4
with:
ref: ${{ github.event.pull_request.head.sha || github.sha }} # PR HEAD, not merge commit — see amd64 job comment
fetch-depth: 0 # full history — see amd64 job comment
submodules: recursive # vendor/llama.cpp — see amd64 job comment
- name: Login to ghcr.io
run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Install Rust toolchain (push-current-arch may invoke pre-build cargo checks)
- run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin
- uses: docker/setup-buildx-action@v3
- name: Install Rust toolchain
run: |
if ! command -v cargo >/dev/null; then
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable --profile minimal
echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
fi
- name: Re-check staleness (skip if a human caught up between gate and now)
id: recheck_arm64
- name: Resolve VARIANT from labels (most-specific wins)
id: scope
env:
EXPECTED_SHA: ${{ needs.verify-architectures.outputs.expected_sha }}
TAG: pr-${{ github.event.pull_request.number }}
STALE_AMD64_OUT: /dev/null
STALE_ARM64_OUT: ${{ runner.temp }}/stale-arm64-recheck.txt
GHCR_USER: ${{ github.actor }}
GHCR_TOKEN: ${{ secrets.GITHUB_TOKEN }}
LABELS: ${{ toJSON(github.event.pull_request.labels.*.name) }}
run: |
# See amd64 job comment — re-check at job start so we don't burn
# 30+ min of arm64 GHA when anvil already pushed from a Mac.
bash scripts/verify-image-revisions.sh || true
if [ ! -s "$STALE_ARM64_OUT" ]; then
echo "✅ arm64 staleness resolved between gate and rebuild — skipping."
echo "still_stale=false" >> "$GITHUB_OUTPUT"
else
echo "arm64 still stale, proceeding with rebuild:"
cat "$STALE_ARM64_OUT"
echo "still_stale=true" >> "$GITHUB_OUTPUT"
if echo "$LABELS" | grep -q '"ci-build:vulkan"'; then V=vulkan
elif echo "$LABELS" | grep -q '"ci-build:cuda"'; then V=cuda
elif echo "$LABELS" | grep -q '"ci-build:core"'; then V=core
elif echo "$LABELS" | grep -q '"ci-build:livekit-bridge"'; then V=livekit-bridge
else V=
fi
- name: Rebuild stale arm64 images via push-current-arch.sh
if: steps.recheck_arm64.outputs.still_stale == 'true'
echo "variant=$V" >> "$GITHUB_OUTPUT"
- name: Build via push-current-arch.sh (slice tests still run)
env:
# SKIP_PHASE_0: CI has no GPU/models for `cargo test -p llama`.
# Phase 2 slice tests (HTTP + GPU-IPC contract) still run. The
# ci-build:<slice> label is the human accepting this tradeoff.
SKIP_PHASE_0: '1'
PR_NUMBER: ${{ github.event.pull_request.number }}
run: |
echo "Rebuilding arm64 images that drifted from HEAD."
echo "Stale list: ${{ needs.verify-architectures.outputs.stale_arm64 }}"
bash scripts/push-current-arch.sh
VARIANT: ${{ steps.scope.outputs.variant }}
run: bash scripts/push-current-arch.sh

# ── Final verification (post-rebuild) ────────────────────────────
# Re-runs the SAME revision-check script after any rebuilds. This
# job is the actual merge gate — verify-architectures' initial run
# is informational + matrix-input only. With both rebuilds done
# (or skipped because nothing was stale), every image at the
# expected tag should now have its revision label matching HEAD.
# Final merge gate. always() so it runs even if ci-build-on-label was
# skipped (no label, or nothing was stale).
verify-after-rebuild:
needs: [verify-architectures, rebuild-stale-amd64, rebuild-stale-arm64]
# always() so this job runs even if rebuild-stale-* skipped (which
# they do when verify-architectures had nothing stale OR when no
# docker-relevant changes per the #974 self-aware-skip path).
needs: [verify-architectures, ci-build-on-label]
if: always()
runs-on: ubuntu-latest
steps:
Expand Down
30 changes: 15 additions & 15 deletions scripts/push-current-arch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -46,31 +46,31 @@ ARCH="$(uname -m)"
# the builder image's repo tree (vendored or pullable).
case "$OS/$ARCH" in
Darwin/arm64)
# Mac M-series: linux/arm64 is natively buildable via Docker Desktop's
# Linux VM. Mac uses Metal natively (continuum-core base, not vulkan)
# and Docker Desktop has no GPU passthrough — there's no point shipping
# vulkan/arm64 from this host. Core + livekit-bridge cover the arm64
# leg. Vulkan + CUDA come from BigMama (linux/amd64).
# Mac M-series via Docker Desktop's linux/arm64 VM. Docker Desktop has
# NO GPU passthrough → inside the container there's no Metal (we're in
# Linux), no CUDA, no Vulkan device. continuum-core enforces "lack of
# GPU integration is forbidden" and panics at startup. So core is NOT
# shippable from this host — only livekit-bridge (CPU-only).
HOST_PLATFORM="linux/arm64"
HEAVY_VARIANTS=("core" "livekit-bridge")
HEAVY_VARIANTS=("livekit-bridge")
;;
Linux/x86_64)
# Linux amd64 (BigMama, Windows WSL2): native platform. Core + vulkan
# + livekit-bridge always; CUDA only when Nvidia driver is present
# (nvidia-smi reports a GPU). Vulkan here covers Linux + Windows WSL2
# consumer GPU users.
# Linux amd64 (BigMama, Windows WSL2): vulkan + livekit-bridge always
# (vulkan covers Linux + Windows WSL2 consumer GPUs via mesa or vendor
# ICD). CUDA only when Nvidia driver is present. core variant is being
# deprecated (CPU-only is unshippable per architectural rule); not
# built here.
HOST_PLATFORM="linux/amd64"
HEAVY_VARIANTS=("core" "vulkan" "livekit-bridge")
HEAVY_VARIANTS=("vulkan" "livekit-bridge")
if command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi >/dev/null 2>&1; then
HEAVY_VARIANTS+=("cuda")
fi
;;
Linux/aarch64 | Linux/arm64)
# Linux arm64 (e.g. Raspberry Pi, Nvidia Jetson, ARM cloud host).
# Same logic as Mac: no realistic vulkan/arm64 consumer story, so
# core + livekit-bridge only.
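# Linux arm64 (Pi, Jetson, ARM cloud). Same GPU-passthrough constraint
# as Mac arm64 → only livekit-bridge. NOTE(review): the Mac constraint
# comes from Docker Desktop's VM; native Linux arm64 hosts do not run
# behind that VM — confirm the same rationale really applies here.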
HOST_PLATFORM="linux/arm64"
HEAVY_VARIANTS=("core" "livekit-bridge")
HEAVY_VARIANTS=("livekit-bridge")
;;
*)
echo "ERROR: push-current-arch.sh — unsupported host $OS/$ARCH" >&2
Expand Down
13 changes: 10 additions & 3 deletions scripts/verify-image-revisions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -261,14 +261,21 @@ if [ "$WARN_ARM64" -ne 0 ]; then
echo ""
echo "⚠️ arm64 stale on $(wc -l < "$STALE_ARM64_OUT" | tr -d ' ') image(s):"
while IFS= read -r REF; do echo " - $REF"; done < "$STALE_ARM64_OUT"
echo " Mac M-series dev: run \`scripts/push-current-arch.sh\` to refresh."
echo " Not blocking — CI auto-rebuild will catch this once #965 lands GitHub arm64 runner support."
echo " Mac M-series dev: \`cd src && npm run docker:push\` to refresh,"
echo " OR apply ci-build:arm64 / ci-build:core / ci-build:livekit-bridge label to PR."
fi

if [ "$FAILED" -ne 0 ]; then
echo ""
echo "❌ STALE-IMAGE GATE FAILED — amd64 image(s) at :$TAG built from a different commit."
echo " The user-facing target must always be current. Re-push from the Linux/amd64 host and re-run."
echo ""
echo " To unblock:"
echo " 1) Preferred — Linux/amd64 host: cd src && npm run docker:push"
echo " (Phase 0 cargo test + Phase 2 slice tests on real hardware)"
echo " 2) Escape hatch — apply ci-build:<slice> label to the PR:"
echo " ci-build:vulkan | ci-build:cuda | ci-build:core |"
echo " ci-build:livekit-bridge | ci-build:amd64 | ci-build:all"
echo " (CI builds the slice; Phase 0 skipped — no GPU/models in CI)"
exit 1
fi
echo ""
Expand Down
Loading