From 7c4409714028c15c81cb0e3284faf588526d2ebb Mon Sep 17 00:00:00 2001 From: Bilal Godil Date: Mon, 13 Apr 2026 10:19:40 -0700 Subject: [PATCH 1/3] emulator: replace docker save/nuke/reload with in-place prune After flattening, reclaim intermediate layers with `docker rmi` + `docker image prune -af` rather than round-tripping the final image through a tar and wiping /var/lib/docker. The round-trip cost ~15 min under same-arch TCG on the arm64 runner because every byte of the image is read, written to tar, then read and written back. Relies on the drive's `discard=on,detect-zeroes=unmap` + fstrim to return freed clusters to the qcow2, which also lets the zero-fill `dd` go. --- .../qemu/cloud-init/emulator/user-data | 26 +++++-------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/docker/local-emulator/qemu/cloud-init/emulator/user-data b/docker/local-emulator/qemu/cloud-init/emulator/user-data index c1d0d0f9bf..ad70bdc8df 100644 --- a/docker/local-emulator/qemu/cloud-init/emulator/user-data +++ b/docker/local-emulator/qemu/cloud-init/emulator/user-data @@ -399,28 +399,16 @@ write_files: - stack-local-emulator:final log "Flatten done." - log "Saving final image to /var/tmp..." + log "Pruning intermediate images in place..." docker rm flatten - docker save stack-local-emulator:final -o /var/tmp/final-image.tar - mv /var/lib/docker/volumes /var/tmp/volumes-backup - log "Nuking Docker storage and reloading..." - systemctl stop docker containerd - rm -rf /var/lib/docker /var/lib/containerd - systemctl start docker containerd - until docker info >/dev/null 2>&1; do sleep 1; done - docker load -i /var/tmp/final-image.tar + docker rmi stack-local-emulator stack-local-emulator-slim || true docker tag stack-local-emulator:final stack-local-emulator docker rmi stack-local-emulator:final || true - rm -f /var/tmp/final-image.tar - systemctl stop docker - rm -rf /var/lib/docker/volumes - mv /var/tmp/volumes-backup /var/lib/docker/volumes - systemctl start docker - log "Docker storage rebuilt." - - log "Zeroing free space for qcow2 compression..." - dd if=/dev/zero of=/zero.fill bs=1M 2>/dev/null || true - rm -f /zero.fill + docker builder prune -af || true + docker image prune -af || true + log "Intermediate images pruned." + + log "Releasing free space for qcow2 compression (fstrim)..." sync fstrim -av 2>/dev/null || true log "slim-docker-image done." From 8146d359a86e49501f166fb02c6c222d91f8daea Mon Sep 17 00:00:00 2001 From: Bilal Godil Date: Mon, 13 Apr 2026 12:18:43 -0700 Subject: [PATCH 2/3] emulator: don't use docker image prune -a (would remove final image) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With -a, docker image prune removes every image that isn't referenced by a running or stopped container. At this point in provisioning the flatten container has been rm'd and stack.service is only enabled (not started), so the freshly-tagged stack-local-emulator image has zero container refs and was getting nuked. The VM then booted cleanly but stack.service failed to `docker run` the image on startup, producing a green systemd log with no services reachable on their ports — the symptom we saw in the amd64 run. Drop -a so we only prune dangling (untagged) images. The explicit rmi of the fat + slim intermediates still leaves them dangling, so they still get reclaimed. --- docker/local-emulator/qemu/cloud-init/emulator/user-data | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docker/local-emulator/qemu/cloud-init/emulator/user-data b/docker/local-emulator/qemu/cloud-init/emulator/user-data index ad70bdc8df..69481da602 100644 --- a/docker/local-emulator/qemu/cloud-init/emulator/user-data +++ b/docker/local-emulator/qemu/cloud-init/emulator/user-data @@ -405,7 +405,12 @@ write_files: docker tag stack-local-emulator:final stack-local-emulator docker rmi stack-local-emulator:final || true docker builder prune -af || true - docker image prune -af || true + # Must be `prune -f` (dangling only), NOT `prune -af`. With -a, docker + # deletes every image that isn't referenced by a running/stopped + # container — at this point stack.service is only systemctl enable'd, + # not yet started, so the freshly-tagged stack-local-emulator image + # has zero container refs and would be nuked, bricking the final qcow2. + docker image prune -f || true log "Intermediate images pruned." log "Releasing free space for qcow2 compression (fstrim)..." From 1910ea0ebfba89afa063322373935de63e1c0057 Mon Sep 17 00:00:00 2001 From: Bilal Godil Date: Mon, 13 Apr 2026 12:24:52 -0700 Subject: [PATCH 3/3] ci: skip emulator boot/verify on arm64 (same-arch TCG can't run backend) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Start/Verify/Stop emulator steps boot the freshly-built qcow2 and wait for all services — including the Next.js backend — to respond on their ports. Under same-arch TCG on ubuntu-24.04-arm there's no KVM, so the backend can't come up within any reasonable window (this is the same reason the build-time smoke test is already skipped on arm64). Today the step just burns the 53-minute EMULATOR_READY_TIMEOUT and fails. Gate those three steps to amd64. The build step still fully produces and validates the arm64 image; it just doesn't try to run it under emulation. The amd64 job continues to prove the image's service stack end-to-end, and the arm64 artifact is trusted to be equivalent since real arm64 hardware has KVM. --- .github/workflows/qemu-emulator-build.yaml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/qemu-emulator-build.yaml b/.github/workflows/qemu-emulator-build.yaml index 0957d80f0d..08b4ab62cc 100644 --- a/.github/workflows/qemu-emulator-build.yaml +++ b/.github/workflows/qemu-emulator-build.yaml @@ -80,7 +80,13 @@ jobs: - name: Generate emulator env run: node docker/local-emulator/generate-env-development.mjs + # VM boot + service verification is amd64-only. Under same-arch TCG on + # the arm64 runner there's no KVM, and the Next.js backend can't come + # up within any reasonable window under software emulation — same + # reason the build-time smoke test is skipped on arm64. The arm64 + # image is built and uploaded blind; it runs on real arm64 hardware. - name: Start emulator and verify + if: matrix.arch == 'amd64' run: | chmod +x docker/local-emulator/qemu/run-emulator.sh EMULATOR_ARCH=${{ matrix.arch }} \ @@ -88,12 +94,13 @@ jobs: docker/local-emulator/qemu/run-emulator.sh start - name: Verify services are healthy + if: matrix.arch == 'amd64' run: | EMULATOR_ARCH=${{ matrix.arch }} \ docker/local-emulator/qemu/run-emulator.sh status - name: Stop emulator - if: always() + if: always() && matrix.arch == 'amd64' run: | EMULATOR_ARCH=${{ matrix.arch }} \ docker/local-emulator/qemu/run-emulator.sh stop