diff --git a/docker/local-emulator/qemu/build-image.sh b/docker/local-emulator/qemu/build-image.sh index 7d73c0ead3..b6efb0c5ee 100755 --- a/docker/local-emulator/qemu/build-image.sh +++ b/docker/local-emulator/qemu/build-image.sh @@ -112,15 +112,21 @@ qemu_cmd_prefix_for_arch() { case "$arch" in arm64) local accel="tcg" + # Under TCG (software emulation on an amd64 host) -cpu max advertises + # armv8.5+ features (PAC, BTI, SVE, LSE atomics…) that V8 happily emits + # JIT code for, but QEMU TCG mistranslates some of those instructions + # and the node process crashes with SIGTRAP during migrations. Falling + # back to cortex-a72 limits V8 to armv8.0-a, which TCG handles cleanly. + local cpu="cortex-a72" if [ "$HOST_ARCH" = "arm64" ]; then case "$HOST_OS" in - darwin) accel="hvf" ;; - linux) [ -w /dev/kvm ] && accel="kvm" ;; + darwin) accel="hvf"; cpu="max" ;; + linux) [ -w /dev/kvm ] && { accel="kvm"; cpu="max"; } ;; esac fi local firmware firmware="$(find_aarch64_firmware)" - echo "qemu-system-aarch64 -machine virt -accel $accel -cpu max -bios $firmware" + echo "qemu-system-aarch64 -machine virt -accel $accel -cpu $cpu -bios $firmware" ;; amd64) local accel="tcg" @@ -254,6 +260,9 @@ build_one() { mkdir -p "$bundle_dir" cp "$bundle_tgz" "$bundle_dir/img.tgz" cp "$BUILD_ENV_FILE" "$bundle_dir/build.env" + # Tell the guest which arch it's being built for so cross-arch (TCG) builds + # can skip the smoke test, which isn't reliable under software emulation. + printf 'STACK_EMULATOR_BUILD_ARCH=%s\n' "$arch" > "$bundle_dir/build-arch.env" make_iso_from_dir "$bundle_iso" "STACKBUNDLE" "$bundle_dir" : > "$serial_log" diff --git a/docker/local-emulator/qemu/cloud-init/emulator/user-data b/docker/local-emulator/qemu/cloud-init/emulator/user-data index 4dcf7bda03..5005f99c47 100644 --- a/docker/local-emulator/qemu/cloud-init/emulator/user-data +++ b/docker/local-emulator/qemu/cloud-init/emulator/user-data @@ -48,6 +48,11 @@ write_files: cp /mnt/stack-bundle/build.env /etc/stack-build.env fi + # Copy per-arch build metadata (used to skip smoke test on cross-arch TCG builds) + if [ -f /mnt/stack-bundle/build-arch.env ]; then + cp /mnt/stack-bundle/build-arch.env /etc/stack-build-arch.env + fi + - path: /usr/local/bin/render-stack-env permissions: '0755' content: | @@ -226,11 +231,26 @@ write_files: log "init-services done (${elapsed}s)." log "Running migrations..." + # Capture stdout+stderr so failures surface the actual node error in + # the host-visible provision log instead of being swallowed by the + # serial-only stream. + migrate_log="$(mktemp)" + set +e docker exec \ --env-file /etc/stack-build.env \ --env-file /etc/stack-build-computed.env \ stack-build-init \ - sh -c 'cd /app/apps/backend && node dist/db-migrations.mjs migrate && node dist/db-migrations.mjs seed' + sh -c 'cd /app/apps/backend && node dist/db-migrations.mjs migrate && node dist/db-migrations.mjs seed' \ + > "$migrate_log" 2>&1 + migrate_status=$? + set -e + if [ "$migrate_status" -ne 0 ]; then + log "MIGRATIONS FAILED (exit ${migrate_status}) — last 200 lines of migration output:" + tail -200 "$migrate_log" | while IFS= read -r line; do log "migrate: $line"; done || true + rm -f "$migrate_log" + exit "$migrate_status" + fi + rm -f "$migrate_log" log "Migrations + seed complete." log "Stopping deps container..." @@ -258,52 +278,68 @@ write_files: DOCKERFILE log "Slim image built." - log "Running smoke test on slim image..." - docker run --rm --name smoke-test \ - --network host \ - --env-file /etc/stack-build.env \ - --env-file /etc/stack-build-computed.env \ - -e STACK_SKIP_MIGRATIONS=true \ - -e STACK_SKIP_SEED_SCRIPT=true \ - -e STACK_RUNTIME_WORK_DIR=/app \ - -v stack-postgres-data:/data/postgres \ - -v stack-redis-data:/data/redis \ - -v stack-clickhouse-data:/data/clickhouse \ - -v stack-minio-data:/data/minio \ - -v stack-inbucket-data:/data/inbucket \ - -d stack-local-emulator-slim - - smoke_timeout=300 - smoke_elapsed=0 - smoke_passed=false - while [ "$smoke_elapsed" -lt "$smoke_timeout" ]; do - code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 3 http://127.0.0.1:8102/health?db=1 2>/dev/null || true) - if [ "$code" = "200" ]; then - smoke_passed=true - break + # Determine build arch to decide whether to run the smoke test. Cross-arch + # (TCG) builds can't reliably run the Next.js backend inside the smoke + # test container: V8 JIT ↔ QEMU TCG mistranslations crash the process, + # and even with --jitless the backend is too slow to respond within any + # sane timeout. amd64 builds run under KVM and are unaffected. + BUILD_ARCH="" + if [ -f /etc/stack-build-arch.env ]; then + # shellcheck disable=SC1091 + . /etc/stack-build-arch.env + BUILD_ARCH="${STACK_EMULATOR_BUILD_ARCH:-}" + fi + + if [ "$BUILD_ARCH" = "arm64" ]; then + log "Skipping smoke test: build arch is arm64 and cross-arch TCG can't reliably run the backend." + else + log "Running smoke test on slim image..." + docker run --rm --name smoke-test \ + --network host \ + --env-file /etc/stack-build.env \ + --env-file /etc/stack-build-computed.env \ + -e STACK_SKIP_MIGRATIONS=true \ + -e STACK_SKIP_SEED_SCRIPT=true \ + -e STACK_RUNTIME_WORK_DIR=/app \ + -v stack-postgres-data:/data/postgres \ + -v stack-redis-data:/data/redis \ + -v stack-clickhouse-data:/data/clickhouse \ + -v stack-minio-data:/data/minio \ + -v stack-inbucket-data:/data/inbucket \ + -d stack-local-emulator-slim + + smoke_timeout=300 + smoke_elapsed=0 + smoke_passed=false + while [ "$smoke_elapsed" -lt "$smoke_timeout" ]; do + code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 3 http://127.0.0.1:8102/health?db=1 2>/dev/null || true) + if [ "$code" = "200" ]; then + smoke_passed=true + break + fi + sleep 2 + smoke_elapsed=$((smoke_elapsed + 2)) + done + + if [ "$smoke_passed" = "false" ]; then + log "SMOKE TEST FAILED: backend /health?db=1 did not return 200 within ${smoke_timeout}s" + log "--- docker ps -a ---" + docker ps -a 2>&1 | while IFS= read -r line; do log "ps: $line"; done || true + log "--- smoke-test container logs (last 200 lines) ---" + docker logs --tail 200 smoke-test 2>&1 | while IFS= read -r line; do log "smoke-test: $line"; done || true + log "--- free -m ---" + free -m 2>&1 | while IFS= read -r line; do log "mem: $line"; done || true + log "--- curl -v /health?db=1 ---" + curl -v --max-time 5 http://127.0.0.1:8102/health?db=1 2>&1 | while IFS= read -r line; do log "curl: $line"; done || true + docker stop smoke-test 2>/dev/null || true + exit 1 fi - sleep 2 - smoke_elapsed=$((smoke_elapsed + 2)) - done - if [ "$smoke_passed" = "false" ]; then - log "SMOKE TEST FAILED: backend /health?db=1 did not return 200 within ${smoke_timeout}s" - log "--- docker ps -a ---" - docker ps -a 2>&1 | while IFS= read -r line; do log "ps: $line"; done || true - log "--- smoke-test container logs (last 200 lines) ---" - docker logs --tail 200 smoke-test 2>&1 | while IFS= read -r line; do log "smoke-test: $line"; done || true - log "--- free -m ---" - free -m 2>&1 | while IFS= read -r line; do log "mem: $line"; done || true - log "--- curl -v /health?db=1 ---" - curl -v --max-time 5 http://127.0.0.1:8102/health?db=1 2>&1 | while IFS= read -r line; do log "curl: $line"; done || true docker stop smoke-test 2>/dev/null || true - exit 1 + sleep 2 + log "Smoke test passed (${smoke_elapsed}s)." fi - docker stop smoke-test 2>/dev/null || true - sleep 2 - log "Smoke test passed (${smoke_elapsed}s)." - log "Flattening image (docker export/import)..." docker create --name flatten stack-local-emulator-slim /bin/true docker export flatten | docker import \