diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 000000000..0a680fcc4 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,22 @@ +* text=auto eol=lf + +# Anything that gets executed inside an image must keep LF endings; CRLF +# on shebang lines breaks the interpreter lookup with `bad interpreter: +# /bin/bash^M`. +*.sh text eol=lf +*.py text eol=lf +*.service text eol=lf +*.network text eol=lf +*.yaml text eol=lf +*.yml text eol=lf + +# Binary artifacts — never normalize. +*.vhdx binary +*.cosi binary +*.qcow2 binary +*.iso binary +*.raw binary +*.png binary +*.jpg binary +*.zst binary +*.patch text eol=lf diff --git a/.gitignore b/.gitignore index e7d3febb7..a8fd85236 100644 --- a/.gitignore +++ b/.gitignore @@ -366,4 +366,7 @@ vendor/ # Virtdeploy files /tools/vm-netlaunch.yaml -/tools/virt-deploy-metadata.json \ No newline at end of file +/tools/virt-deploy-metadata.json +# AZL4 trident binary baked into test image (built locally) +tests/images/trident-vm-testimage/base/trident-bin/ +tests/images/trident-vm-testimage/base/osmodifier-bin/ diff --git a/.pipelines/templates/e2e-template.yml b/.pipelines/templates/e2e-template.yml index a0654303a..6322b480e 100644 --- a/.pipelines/templates/e2e-template.yml +++ b/.pipelines/templates/e2e-template.yml @@ -224,6 +224,41 @@ stages: micVersion: ${{ parameters.micVersion }} dependsOnStage: ${{ parameters.baseImageArtifactStage }} + # Build the AZL4 test images. + # + # Uses build-image-azl4.yml (MCR MIC + blob-sourced base VHDX) instead + # of the standard build-image.yml path. See build-image-azl4.yml for + # the merge-back TODO. + # + # Gating mirrors the AzL installer ISO below so AZL4 builds run in + # every stage type that gates a trunk merge. 
+ - ${{ if or(eq(parameters.stageType, 'pr-e2e'), eq(parameters.stageType, 'ci'), eq(parameters.stageType, 'pr-e2e-azure'), eq(parameters.stageType, 'azl-validation'), eq(parameters.stageType, 'full-validation')) }}: + - template: stages/build_image/build-image-azl4.yml + parameters: + imageName: trident-vm-grub-testimage-azl4 + dependsOnStage: ${{ parameters.baseImageArtifactStage }} + + # AZL4 base qcow2 — boot point for the VM offline-init / rollback + # path. Same build template as the COSI above; output_format + # differs (QCOW2 vs COSI) per the testimages.py registration. + - template: stages/build_image/build-image-azl4.yml + parameters: + imageName: trident-vm-grub-testimage-azl4-base + dependsOnStage: ${{ parameters.baseImageArtifactStage }} + + # AZL4 BM-simulated netlaunch test. Uses the AZL3 MOS installer ISO + # (built by TridentTestImg_trident_installer below) plus the AZL4 + # COSI built above. Trident runs from the live MOS environment and + # installs the AZL4 COSI onto a fresh virtdeploy VM disk. This is + # the same flow we proved out manually on karhu-ubuntu. + - template: stages/testing_vm/netlaunch-testing-azl4.yml + + # AZL4 VM offline-init rollback test. The base qcow2 already has + # trident's datastore populated by its first-boot offline-init + # oneshot, so storm-trident can drive A/B update + rollback against + # the AZL4 COSI without the MOS bridge. 
+ - template: stages/testing_rollback/vm-testing-azl4.yml + # Build AzL installer ISO (attended and unattended) - ${{ if or(eq(parameters.stageType, 'pr-e2e'), eq(parameters.stageType, 'ci'), eq(parameters.stageType, 'pr-e2e-azure'), eq(parameters.stageType, 'azl-validation')) }}: - template: stages/azl_installer/azl-installer.yml diff --git a/.pipelines/templates/stages/build_image/build-image-azl4.yml b/.pipelines/templates/stages/build_image/build-image-azl4.yml new file mode 100644 index 000000000..a2901cd84 --- /dev/null +++ b/.pipelines/templates/stages/build_image/build-image-azl4.yml @@ -0,0 +1,81 @@ +# AZL4 variant of build-image.yml. +# +# Forked from build-image.yml on 2026-05-13. Calls build-image-template-azl4.yml +# (which uses MCR MIC container + blob-sourced base VHDX) instead of the +# external test-images repo template. +# +# TODO(azl4-merge-back): Merge this back into build-image.yml with an +# `azureLinuxVersion` parameter switch once AZL4 base VHDX acquisition +# and trident-service RPM packaging are resolved. The base VHDX may +# continue to come from blob storage (not the AzureLinuxArtifacts ADO +# feed); the RPM will come from an AZL4 package repo, not ADO. + +parameters: + - name: imageName + type: string + + - name: clones + displayName: "Number of clones to generate" + type: number + default: 2 + + - name: dependsOnTrident + type: boolean + default: true + + - name: dependsOnStage + type: string + default: "" + +stages: + - stage: TridentTestImg_${{ replace(parameters.imageName, '-', '_') }} + displayName: Build ${{ parameters.imageName }} + ${{ if parameters.dependsOnTrident }}: + dependsOn: + # AZL4 doesn't have RPM publication so we depend on the + # trident-binaries artifact (which the GetTridentBinaries stage + # produces and copies to artifacts/binaries/trident). + - GetTridentBinaries_rpms_amd64 + # PrepareSSHKeys produces the shared 'ssh-keys' artifact. 
+ # build-image-template-azl4.yml stages it into the testimage tree so + # qcow2 and cosi builds share one keypair for storm-trident A/B SSH. + - PrepareSSHKeys + - ${{ if ne(parameters.dependsOnStage, '') }}: + - ${{ parameters.dependsOnStage }} + ${{ else }}: + dependsOn: + # ssh-keys is downloaded unconditionally; dependsOnStage may be ''. + - PrepareSSHKeys + - ${{ if ne(parameters.dependsOnStage, '') }}: + - ${{ parameters.dependsOnStage }} + + jobs: + - job: BuildTridentTestImgAzl4 + displayName: Build (AZL4 MIC) + # Pinned MIC container build adds ~5 min cold-cache. Bump the timeout + # accordingly. TODO(azl4-release): lower back to 20 min once we use a + # released MIC container. + timeoutInMinutes: 30 + pool: + type: linux + + variables: + ob_outputDirectory: /tmp/output + ob_artifactBaseName: ${{ parameters.imageName }} + + steps: + - template: ../common_tasks/checkout_trident.yml + + - task: DownloadPipelineArtifact@2 + inputs: + buildType: current + artifactName: trident-binaries + targetPath: "$(Build.ArtifactStagingDirectory)/trident-binaries" + displayName: Download Trident binaries + condition: eq('${{ parameters.dependsOnTrident }}', true) + + - template: build-image-template-azl4.yml + parameters: + tridentSourceDirectory: $(TRIDENT_SOURCE_DIR) + imageName: ${{ parameters.imageName }} + clones: ${{ parameters.clones }} diff --git a/.pipelines/templates/stages/build_image/build-image-template-azl4.yml b/.pipelines/templates/stages/build_image/build-image-template-azl4.yml new file mode 100644 index 000000000..db9dda989 --- /dev/null +++ b/.pipelines/templates/stages/build_image/build-image-template-azl4.yml @@ -0,0 +1,166 @@ +# AZL4 variant of build-image-template.yml. +# +# Forked from build-image-template.yml on 2026-05-13. The AZL3 path pulls the +# base VHDX from the AzureLinuxArtifacts ADO feed and the Trident RPM from the +# trident-binaries pipeline artifact, then runs `testimages.py build`. AZL4 +# uses different acquisition paths: +# +# 1. 
Base VHDX comes from the AZL preview gallery's backing storage +# (azlpubdev2mruiyvi/images-dev). See the BlobImageManifest +# registration in tests/images/testimages.py. +# +# 2. There is no Trident RPM for AZL4 yet. The binary is baked in via +# additionalFiles in tests/images/trident-vm-testimage/base/updateimg-grub-azl4.yaml. +# +# TODO(azl4-merge-back): Fold this template back into build-image-template.yml +# once the AZL4 base VHDX and trident-service RPM acquisition paths are +# standardized. The base VHDX may stay as a blob download; the RPM will +# come from an AZL4 package repo. + +parameters: + - name: tridentSourceDirectory + type: string + + - name: imageName + type: string + + - name: clones + type: number + default: 1 + displayName: Number of clones to create + + # The AZL4 base VHDX is sourced from the Azure Linux preview gallery's + # backing storage account. The pipeline service connection at + # $(BLOB_SERVICE_CONNECTION) must have `Storage Blob Data Reader` on + # this account. See tests/images/SERVICE-CONNECTION-RUNBOOK.md. + - name: blobStorageAccount + type: string + default: "azlpubdev2mruiyvi" + + - name: blobContainer + type: string + default: "images-dev" + + - name: blobSubscription + type: string + # Subscription where the storage account lives. The SC's default + # subscription may differ — we explicitly set context before download. + default: "e4ab81f8-030f-4593-a8f2-3ea2c7630a19" + + - name: blobServiceConnection + type: string + # NB: this must be a service connection that exists in the ADO project. + # Created manually by trident infra team. + default: "trident-azl4-blob-reader" + + - name: micContainerTag + type: string + default: "imagecustomizer:1.4.0-1" + +steps: + - template: ../common_tasks/avoid-pypi-usage.yml + + - template: common/sfi-enforce-isolation-with-etc-hosts.yaml@platform-pipelines + + # Stage the Trident binary that gets baked into the COSI via additionalFiles. 
+ # The trident-binaries artifact comes from the same upstream Trident build + # stage the AZL3 path uses; we just copy the binary rather than installing + # an RPM. + # + # TODO(azl4-rpm): replace this binary copy with an RPM install once the + # trident-service RPM is packaged for AZL4 (same TODO as in + # tests/images/testimages.py registration). + - bash: | + set -euxo pipefail + TRIDENT_BIN_SRC="$(Build.ArtifactStagingDirectory)/trident-binaries" + TRIDENT_BIN_DEST="${{ parameters.tridentSourceDirectory }}/tests/images/trident-vm-testimage/base/trident-bin" + + if [ ! -f "$TRIDENT_BIN_SRC/trident" ]; then + echo "trident binary not found at $TRIDENT_BIN_SRC/trident" + echo "Available artifacts:" + find "$TRIDENT_BIN_SRC" -type f 2>/dev/null | head -20 || true + exit 1 + fi + + mkdir -p "$TRIDENT_BIN_DEST" + cp "$TRIDENT_BIN_SRC/trident" "$TRIDENT_BIN_DEST/trident" + chmod +x "$TRIDENT_BIN_DEST/trident" + file "$TRIDENT_BIN_DEST/trident" + displayName: "Stage Trident binary into testimage tree" + workingDirectory: ${{ parameters.tridentSourceDirectory }} + + # Pull the released MIC container from MCR. AZL4 support is included + # in imagecustomizer >= 1.4.0. Tag it locally so testimages.py can + # reference it by short name. + - bash: | + set -euxo pipefail + docker pull "mcr.microsoft.com/azurelinux/${{ parameters.micContainerTag }}" + docker tag "mcr.microsoft.com/azurelinux/${{ parameters.micContainerTag }}" "${{ parameters.micContainerTag }}" + displayName: "Pull MIC container from MCR" + + # Stage the pipeline-wide SSH key into the testimage tree before + # MIC runs. testimages.py's generate_ssh_keys() generates a new + # keypair UNLESS files/id_rsa.pub already exists at the source path + # — in which case it reuses it. 
By dropping the shared key from the + # PrepareSSHKeys artifact here, both the qcow2 base build and the + # COSI build end up with the same key baked into testing-user's + # authorized_keys, so storm-trident's A/B update test can SSH into + # both A-side and B-side after the update reboot. + # + # The matching private key lives at ssh-keys/id_rsa from the + # PrepareSSHKeys stage. storm-trident's rollback stage picks it up + # the same way for AZL3 builds. + - task: DownloadPipelineArtifact@2 + displayName: "Download shared SSH keys" + inputs: + buildType: current + artifactName: "ssh-keys" + targetPath: "$(Build.ArtifactStagingDirectory)/ssh-keys" + + - bash: | + set -euxo pipefail + SSH_PUB_SRC="$(Build.ArtifactStagingDirectory)/ssh-keys/id_rsa.pub" + SSH_PUB_DEST="${{ parameters.tridentSourceDirectory }}/tests/images/trident-vm-testimage/base/files/id_rsa.pub" + if [ ! -f "$SSH_PUB_SRC" ]; then + echo "shared SSH public key not found at $SSH_PUB_SRC" + find "$(Build.ArtifactStagingDirectory)/ssh-keys" -type f + exit 1 + fi + cp "$SSH_PUB_SRC" "$SSH_PUB_DEST" + echo "Staged shared SSH public key:" + cat "$SSH_PUB_DEST" + displayName: "Stage shared SSH key into testimage tree" + workingDirectory: ${{ parameters.tridentSourceDirectory }} + + # Download the AZL4 base VHDX from the preview gallery's backing storage. + # Authenticates via the federated identity attached to the service + # connection — no storage keys handled here. + # + # The SC's default subscription (Polar_ImageTools_Staging) differs from + # the storage account's subscription (ControlTower_Test). We must switch + # context so `az storage blob list` resolves the account correctly. 
+ - task: AzureCLI@2 + displayName: "Download AZL4 base VHDX from blob" + inputs: + azureSubscription: ${{ parameters.blobServiceConnection }} + scriptType: bash + scriptLocation: inlineScript + workingDirectory: ${{ parameters.tridentSourceDirectory }} + inlineScript: | + set -euxo pipefail + az account set --subscription "${{ parameters.blobSubscription }}" + python3 ./tests/images/testimages.py download-image azl4_qemu_guest \ + --blob-storage-account "${{ parameters.blobStorageAccount }}" \ + --blob-container "${{ parameters.blobContainer }}" + ls -la artifacts/azl4_qemu_guest.vhdx + + - bash: | + set -euxo pipefail + python3 ./tests/images/testimages.py build \ + "${{ parameters.imageName }}" \ + --container "${{ parameters.micContainerTag }}" \ + --output-dir "$(ob_outputDirectory)" \ + --no-download \ + --clones ${{ parameters.clones }} + displayName: "Build ${{ parameters.imageName }}" + workingDirectory: ${{ parameters.tridentSourceDirectory }} diff --git a/.pipelines/templates/stages/testing_rollback/vm-testing-azl4.yml b/.pipelines/templates/stages/testing_rollback/vm-testing-azl4.yml new file mode 100644 index 000000000..b9aa7fb91 --- /dev/null +++ b/.pipelines/templates/stages/testing_rollback/vm-testing-azl4.yml @@ -0,0 +1,222 @@ +# AZL4 VM offline-init rollback test stage. +# +# Complement to testing_vm/netlaunch-testing-azl4.yml (BM-simulated install). +# This stage exercises the VM offline-init path: pre-baked AZL4 qcow2 boots +# directly, then storm-trident drives A/B update from the AZL4 COSI and +# tests rollback. +# +# Inputs (both built by other AZL4 stages in this pipeline): +# - trident-vm-grub-testimage-azl4-base.qcow2 (base qcow2 with trident +# systemd units + first-boot offline-init oneshot) +# - trident-vm-grub-testimage-azl4.cosi (update target, same cosi +# the BM-sim stage uses for fresh installs) +# +# Differences from testing_rollback/testing-template.yml: +# * No test matrix. One configuration: AZL4 rollback. 
+# * No extension testing (--skip-extension-testing). The AZL4 cosi +# doesn't ship the sysext machinery yet. +# * No netplan runtime testing (--skip-netplan-runtime-testing). +# base-azl4 trident-config omits the os: section because the AZL3 +# MOS install path doesn't have osmodifier available; the qcow2 +# base shouldn't need netplan runtime tweaks either. +# * No manual rollback testing (--skip-manual-rollbacks) for first +# iteration; add once basic A/B works. +# * No runtime updates (--skip-runtime-updates) for first iteration. +# +# When AZL4 grows a trident-service RPM, sysext / netplan / runtime +# variants will reuse the AZL3 testing-template.yml as a matrix entry. + +parameters: + - name: baseQcowArtifact + type: string + default: "trident-vm-grub-testimage-azl4-base" + + - name: cosiArtifact + type: string + default: "trident-vm-grub-testimage-azl4" + + - name: dependsOnStage + type: string + default: "" + + - name: verboseLogging + type: boolean + default: true + + - name: pool + type: string + default: "trident-ubuntu-1es-pool-eastus2" + +stages: + - stage: RollbackTesting_AZL4 + displayName: Rollback Testing - AZL4 (VM offline-init) + dependsOn: + - BuildingTools + - PrepareSSHKeys + - TridentTestImg_trident_vm_grub_testimage_azl4 + - TridentTestImg_trident_vm_grub_testimage_azl4_base + - ${{ if ne(parameters.dependsOnStage, '') }}: + - ${{ parameters.dependsOnStage }} + + variables: + - group: servicing_testing_params + - name: SSH_PRIVATE_KEY_PATH + value: "$HOME/.ssh/id_rsa" + - name: SSH_PUBLIC_KEY_PATH + value: "$(SSH_PRIVATE_KEY_PATH).pub" + + jobs: + - job: RollbackTestingAzl4 + displayName: Rollback Testing AZL4 + timeoutInMinutes: 30 + pool: + type: linux + name: ${{ parameters.pool }} + hostArchitecture: amd64 + + variables: + ob_outputDirectory: /tmp/deployment_logs_azl4_rollback + ob_artifactBaseName: "rollback-testing-azl4" + + steps: + - template: ../common_tasks/checkout_trident.yml + + - task: DownloadPipelineArtifact@2 + displayName: 
"Download AZL4 base qcow2" + inputs: + buildType: current + artifactName: "${{ parameters.baseQcowArtifact }}" + targetPath: "$(Build.ArtifactStagingDirectory)/base-qcow" + + - task: DownloadPipelineArtifact@2 + displayName: "Download AZL4 update COSI" + inputs: + buildType: current + artifactName: "${{ parameters.cosiArtifact }}" + targetPath: "$(Build.ArtifactStagingDirectory)/update-cosi" + + - task: DownloadPipelineArtifact@2 + displayName: "Download SSH keys" + inputs: + buildType: current + artifactName: "ssh-keys" + targetPath: "$(Build.ArtifactStagingDirectory)/ssh" + + - task: DownloadPipelineArtifact@2 + displayName: "Download go-tools" + inputs: + buildType: current + artifactName: "go-tools" + patterns: | + netlisten + storm-trident + targetPath: "$(TRIDENT_SOURCE_DIR)/bin" + + - bash: | + set -euxo pipefail + chmod +x $(TRIDENT_SOURCE_DIR)/bin/netlisten + chmod +x $(TRIDENT_SOURCE_DIR)/bin/storm-trident + # ~/.ssh may not exist on a fresh agent; cp needs the directory. + mkdir -p ~/.ssh + cp $(Build.ArtifactStagingDirectory)/ssh/id_rsa* ~/.ssh/ + # Targeted permissions only: agents may reuse ~/.ssh across jobs, so + # no `chmod -R` that would trample other known_hosts / config / id_*. + chmod 700 ~/.ssh/ || true + chmod 600 ~/.ssh/id_rsa + chmod 644 ~/.ssh/id_rsa.pub + mkdir -p $(ob_outputDirectory) + + # Both the qcow2 base build and the COSI build stage the + # shared 'ssh-keys' artifact into their MIC trees (see + # .pipelines/templates/stages/build_image/build-image-template-azl4.yml). + # So the pipeline-wide PrepareSSHKeys id_rsa we just + # copied to ~/.ssh/ matches both A-side and B-side of + # the test VM. No per-build key swap needed. + ls -l ~/.ssh/ + + # storm-trident expects the artifacts laid out under + # one directory. testimages.py output uses a clone-index + # suffix; rename to the conventional names storm-trident + # script prepare-images would produce. 
+ ARTIFACTS=$(Build.ArtifactStagingDirectory)/storm-input + mkdir -p "$ARTIFACTS" + + # storm-trident's qemu deploy looks for a qcow2 matching the + # regex `^trident-vm-.*-testimage.qcow2$` (see + # tools/storm/utils/vm/qemu/qemu.go:34). Our build artifact + # is named trident-vm-grub-testimage-azl4-base.qcow2 which + # doesn't match (-base.qcow2 not -testimage.qcow2 at end). + # Stage it under a name that matches. + QCOW_SRC="" + for c in \ + "$(Build.ArtifactStagingDirectory)/base-qcow/trident-vm-grub-testimage-azl4-base_0.qcow2" \ + "$(Build.ArtifactStagingDirectory)/base-qcow/trident-vm-grub-testimage-azl4-base.qcow2"; do + if [ -f "$c" ]; then QCOW_SRC="$c"; break; fi + done + if [ -z "$QCOW_SRC" ]; then + echo "Could not find AZL4 base qcow2. Contents:" + find "$(Build.ArtifactStagingDirectory)/base-qcow" -type f + exit 1 + fi + cp "$QCOW_SRC" "$ARTIFACTS/trident-vm-azl4-base-testimage.qcow2" + + # storm-trident's rollback test looks for any *.cosi in the + # artifacts dir (see tools/storm/rollback/tests/rollback.go:29). + # No rename needed beyond the clone-index suffix. + COSI_SRC="" + for c in \ + "$(Build.ArtifactStagingDirectory)/update-cosi/trident-vm-grub-testimage-azl4_0.cosi" \ + "$(Build.ArtifactStagingDirectory)/update-cosi/trident-vm-grub-testimage-azl4.cosi"; do + if [ -f "$c" ]; then COSI_SRC="$c"; break; fi + done + if [ -z "$COSI_SRC" ]; then + echo "Could not find AZL4 update COSI. Contents:" + find "$(Build.ArtifactStagingDirectory)/update-cosi" -type f + exit 1 + fi + cp "$COSI_SRC" "$ARTIFACTS/trident-vm-azl4-update-testimage.cosi" + + ls -lh "$ARTIFACTS" + displayName: "Stage artifacts for storm-trident" + workingDirectory: $(TRIDENT_SOURCE_DIR) + + - bash: | + set -euxo pipefail + + STORM_DYNAMIC_FLAGS="" + if [ "${{ parameters.verboseLogging }}" == "True" ]; then + STORM_DYNAMIC_FLAGS="$STORM_DYNAMIC_FLAGS --verbose" + fi + + # First-iteration AZL4 skips: see file header for rationale. 
+ STORM_DYNAMIC_FLAGS="$STORM_DYNAMIC_FLAGS \ + --skip-extension-testing \ + --skip-netplan-runtime-testing \ + --skip-manual-rollbacks \ + --skip-runtime-updates" + + sudo ./bin/storm-trident run rollback -a $STORM_DYNAMIC_FLAGS \ + --artifacts-dir $(Build.ArtifactStagingDirectory)/storm-input \ + --output-path $(ob_outputDirectory) \ + --platform qemu \ + --user testing-user \ + --ssh-private-key-path $(SSH_PRIVATE_KEY_PATH) \ + --ssh-public-key-path $(SSH_PUBLIC_KEY_PATH) \ + --force-cleanup + displayName: "🚀 Storm-trident rollback test (AZL4)" + workingDirectory: $(TRIDENT_SOURCE_DIR) + timeoutInMinutes: 20 + + - bash: | + set -eux + sudo zstd -T0 $(Build.ArtifactStagingDirectory)/booted.qcow2 || true + sudo mv $(Build.ArtifactStagingDirectory)/booted.qcow2.zst $(ob_outputDirectory)/ || true + workingDirectory: $(TRIDENT_SOURCE_DIR) + condition: failed() + displayName: "Publish OS disk on failure" + timeoutInMinutes: 5 + + - template: ../testing_common/fix-output-directory-for-one-branch-step.yml + parameters: + outputDir: $(ob_outputDirectory) + condition: always() diff --git a/.pipelines/templates/stages/testing_vm/netlaunch-testing-azl4.yml b/.pipelines/templates/stages/testing_vm/netlaunch-testing-azl4.yml new file mode 100644 index 000000000..77cd57803 --- /dev/null +++ b/.pipelines/templates/stages/testing_vm/netlaunch-testing-azl4.yml @@ -0,0 +1,364 @@ +# AZL4 BM-simulated netlaunch test stage. +# +# Drives an AZL3 MOS installer ISO + AZL4 COSI through netlaunch to validate +# that the AZL4 COSI can be installed by Trident onto a fresh virtdeploy VM. +# Trident runs from the live MOS environment (AZL3), `trident install` +# partitions the disk and streams the AZL4 COSI to it, the target boots +# into AZL4. +# +# Differences from netlaunch-testing.yml: +# * No test matrix. Hardcoded to the `base-azl4` configuration in +# tests/e2e_tests/trident_configurations/. +# * Host runtimeEnv only. Container variant is a follow-on. +# * No ACR push. 
The AZL4 COSI is served locally by netlaunch. +# * No SELinux check. AZL4 SELinux integration is its own follow-on. +# * No matrix-driven test execution after install. First iteration only +# validates that the VM provisions and is reachable over SSH. +# +# TODO(azl4-merge-back): Once AZL4 has a published trident-service RPM and +# all the bits below (SELinux, container path, metrics) are wired up for +# AZL4, fold this back into netlaunch-testing.yml as an additional matrix +# entry. + +parameters: + - name: installerISOArtifact + type: string + # AZL3 MOS ISO is the live OS Trident runs from. It does not need to + # match the target OS version since the target comes from the COSI. + default: "trident-installer" + + - name: cosiArtifact + type: string + # Artifact published by stages/build_image/build-image-azl4.yml. The + # actual COSI file inside is trident-vm-grub-testimage-azl4.cosi. + default: "trident-vm-grub-testimage-azl4" + + - name: tridentConfiguration + type: string + # Lives at tests/e2e_tests/trident_configurations/base-azl4/. 
+ default: "base-azl4" + + - name: dependsOnStage + type: string + default: "" + +stages: + - stage: NetlaunchTesting_AZL4 + displayName: Netlaunch Testing - AZL4 (BM-simulated) + dependsOn: + - BuildingTools + - PrepareSSHKeys + - TridentTestImg_trident_installer + - TridentTestImg_trident_vm_grub_testimage_azl4 + - ${{ if ne(parameters.dependsOnStage, '') }}: + - ${{ parameters.dependsOnStage }} + + jobs: + - job: NetlaunchAzl4 + displayName: Netlaunch (AZL3 ISO + AZL4 COSI) + timeoutInMinutes: 30 + pool: + type: linux + name: trident-ubuntu-1es-pool-eastus2 + hostArchitecture: amd64 + + variables: + - name: ob_outputDirectory + value: /tmp/deployment_logs_azl4 + - name: ob_artifactBaseName + value: "netlaunch-testing-azl4" + + - name: tridentConfigPath + value: tests/e2e_tests/trident_configurations/${{ parameters.tridentConfiguration }} + + - name: netlaunchPort + value: 4001 + + steps: + - template: ../common_tasks/checkout_trident.yml + + - task: DownloadPipelineArtifact@2 + displayName: "Download AZL3 installer ISO" + inputs: + buildType: current + artifactName: "${{ parameters.installerISOArtifact }}" + targetPath: "$(TRIDENT_SOURCE_DIR)/artifacts/iso" + + - task: DownloadPipelineArtifact@2 + displayName: "Download AZL4 COSI" + inputs: + buildType: current + artifactName: "${{ parameters.cosiArtifact }}" + targetPath: "$(Build.ArtifactStagingDirectory)/cosi-azl4" + + # PrepareSSHKeys produces the shared 'ssh-keys' artifact whose + # id_rsa.pub is baked into the AZL4 COSI at MIC build time (see + # build-image-template-azl4.yml). The matching private key + # `ssh-keys/id_rsa` is what we use locally to SSH into the + # post-install AZL4 VM. Until 2026-05-17 we generated a fresh + # per-build keypair inside testimages.py and published the + # private half alongside the COSI, but the qcow2 + cosi builds + # for VM-testing need to share a key (the same VM A/B-updates + # from qcow2 to cosi), so we standardized on the shared artifact. 
+ - task: DownloadPipelineArtifact@2 + displayName: "Download shared SSH key" + inputs: + buildType: current + artifactName: "ssh-keys" + targetPath: "$(Build.ArtifactStagingDirectory)/ssh-keys" + + - task: DownloadPipelineArtifact@2 + displayName: "Download go-tools" + inputs: + buildType: current + artifactName: "go-tools" + patterns: | + netlaunch + netlisten + storm-trident + virtdeploy + targetPath: "$(TRIDENT_SOURCE_DIR)/bin" + + # Install libvirt / qemu / OVMF and configure libvirt access. Without + # this, virt-deploy fails creating bridge interfaces ("Operation not + # permitted") on the OneBranch Ubuntu runner. + - template: netlaunch-prep.yml + + # NOTE: we intentionally do NOT run testing_common/trident-prep.yml. + # That template runs edit_host_config.py, which injects the test + # SSH key into trident-config's os.users section. The AZL4 + # `base-azl4` trident-config omits the os: section entirely + # because the AZL3 MOS installer ISO has no /usr/bin/osmodifier, + # so trident can't drive os.users at install time. Instead we + # use the shared PrepareSSHKeys key whose public half was baked + # into the AZL4 COSI at MIC build time (set up below). + + - bash: | + set -euxo pipefail + + chmod +x "$(TRIDENT_SOURCE_DIR)"/bin/{netlaunch,netlisten,storm-trident,virtdeploy} + + # Stage the AZL4 COSI as regular.cosi where netlaunch will + # serve it. The trident-config for base-azl4 references + # http://NETLAUNCH_HOST_ADDRESS/files/regular.cosi. + SERVE_DIR="$(TRIDENT_SOURCE_DIR)/artifacts/test-image" + mkdir -p "$SERVE_DIR" + + # The artifact may contain the file with the imageName as + # prefix; tolerate both layouts. The clone-index suffix + # (`_0.cosi`) is what testimages.py produces when called + # with the default --clones >= 1. 
+ COSI_SRC="" + for candidate in \ + "$(Build.ArtifactStagingDirectory)/cosi-azl4/trident-vm-grub-testimage-azl4_0.cosi" \ + "$(Build.ArtifactStagingDirectory)/cosi-azl4/trident-vm-grub-testimage-azl4.cosi" \ + "$(Build.ArtifactStagingDirectory)/cosi-azl4/regular.cosi"; do + if [ -f "$candidate" ]; then + COSI_SRC="$candidate" + break + fi + done + + if [ -z "$COSI_SRC" ]; then + echo "Could not find AZL4 COSI. Artifact contents:" + find "$(Build.ArtifactStagingDirectory)/cosi-azl4" -type f | head -20 + exit 1 + fi + + cp "$COSI_SRC" "$SERVE_DIR/regular.cosi" + ls -alh "$SERVE_DIR" + + # Install the shared SSH private key (from the + # PrepareSSHKeys artifact) as the test framework's + # helpers/key. Its matching public key was baked into the + # AZL4 COSI at MIC build time, so post-install we can SSH + # into the target as testing-user with this key. + KEY_SRC="$(Build.ArtifactStagingDirectory)/ssh-keys/id_rsa" + if [ ! -f "$KEY_SRC" ]; then + echo "Could not find shared SSH key at $KEY_SRC. Artifact contents:" + find "$(Build.ArtifactStagingDirectory)/ssh-keys" -type f + exit 1 + fi + cp "$KEY_SRC" "$(TRIDENT_SOURCE_DIR)/tests/e2e_tests/helpers/key" + chmod 600 "$(TRIDENT_SOURCE_DIR)/tests/e2e_tests/helpers/key" + # Convert the shared SSH key to PEM if it isn't already + # (matches what trident-prep does for AZL3 keys). + # `ssh-keygen -p -P "" -N "" -m PEM -f ...` is a no-op on + # already-PEM keys. `-P ""` explicitly tells ssh-keygen + # that the existing passphrase is empty (so it doesn't + # read stdin if it can't guess); `-N ""` keeps the key + # passphrase-less. + ssh-keygen -p -P "" -N "" -m PEM -f "$(TRIDENT_SOURCE_DIR)/tests/e2e_tests/helpers/key" + ls -alh "$(TRIDENT_SOURCE_DIR)/artifacts/iso" + ls -alh "$(TRIDENT_SOURCE_DIR)/bin" + displayName: "Stage AZL4 COSI as regular.cosi" + + - bash: | + set -eux + # Disable virtlogd rollover so we keep full logs. 
+ echo "max_size = 0" | sudo tee -a /etc/libvirt/virtlogd.conf + sudo systemctl restart virtlogd.socket + + ./tools/virt-deploy create --mem 12 --disks 32,32 + workingDirectory: $(TRIDENT_SOURCE_DIR) + displayName: "Create virt-deploy VM" + + - bash: | + set -euxo pipefail + + TRIDENT_CONFIG="$(TRIDENT_SOURCE_DIR)/$(tridentConfigPath)/trident-config.yaml" + + # Run netlaunch in the background so we can monitor its log + # for the install-success marker. The base-azl4 cosi does + # not yet ship trident systemd units (trident.service, + # tridentd.socket) so the installed AZL4 system never + # phones home post-reboot. netlaunch's ListenLoop always + # waits for at least one phone-home regardless of flags, + # so we treat trident's own "Rebooting system" log line + # (emitted by crates/trident/src/reboot.rs from the install + # success path) as our completion signal and terminate + # netlaunch cleanly. Phone-home wiring belongs with the + # VM-testing work where the trident systemd stack lands. + # + # netlaunch is launched with `setsid` so we can signal the + # whole process group on shutdown — otherwise the child + # HTTP/TFTP server processes get reparented to PID 1 and + # may leak ports / qcow2 file locks to the next job on the + # same agent. + setsid ./bin/netlaunch \ + --iso ./artifacts/iso/${{ parameters.installerISOArtifact }}.iso \ + --config $(TRIDENT_SOURCE_DIR)/tools/vm-netlaunch.yaml \ + --trident "$TRIDENT_CONFIG" \ + --servefolder ./artifacts/test-image \ + --logstream \ + --force-color \ + --full-logstream logstream-full.log \ + --only-print-exit-code \ + --port $(netlaunchPort) > ./clean-install-azl4.log 2>&1 & + NETLAUNCH_PID=$! + NETLAUNCH_PGID="$NETLAUNCH_PID" + echo "netlaunch pid: $NETLAUNCH_PID (pgid $NETLAUNCH_PGID)" + + # Watch for the install-success marker for up to 12 minutes. + # Real install completes in 1-3 minutes once netlaunch + # finishes booting the MOS ISO via HTTP boot. 
UEFI HTTP + # boot can occasionally need 5+ minutes, so 12 minutes is + # generous. + # + # The marker regex is tightened to trident's own log-record + # prefix to avoid false-positives from any kernel / systemd / + # dracut "Restarting system" line on an error path. FATAL_RE is + # matched case-insensitively (grep -i): the kernel logs + # "Kernel panic - not syncing", systemd "You are in emergency mode". + REBOOT_RE='trident[^[:space:]]*[[:space:]]+(INFO|WARN)[[:space:]].*Rebooting system' + FATAL_RE='kernel panic|dracut:.*FATAL|Emergency mode|emergency!' + DEADLINE=$((SECONDS + 720)) + INSTALL_OK=0 + while [ $SECONDS -lt $DEADLINE ]; do + if grep -Eiq "$FATAL_RE" ./clean-install-azl4.log 2>/dev/null; then + echo "FATAL marker observed before install success — aborting" + break + fi + if ! kill -0 $NETLAUNCH_PID 2>/dev/null; then + echo "netlaunch exited on its own" + if wait $NETLAUNCH_PID; then + INSTALL_OK=1 + fi + break + fi + if grep -Eq "$REBOOT_RE" ./clean-install-azl4.log 2>/dev/null; then + echo "install completed (saw trident 'Rebooting system' marker)" + INSTALL_OK=1 + break + fi + sleep 10 + done + + # Always show the netlaunch log tail for diagnostics + echo "--- netlaunch log tail ---" + tail -50 ./clean-install-azl4.log || true + + if [ $INSTALL_OK -eq 1 ]; then + echo "Killing netlaunch process group (install completed; not waiting for phone-home)" + # SIGTERM the whole group; netlaunch's children include + # an HTTP/TFTP server we need to release the port on. + kill -TERM -"$NETLAUNCH_PGID" 2>/dev/null || true + # Generous grace so --full-logstream finishes flushing + # to logstream-full.log (which the failure-diagnostic + # display-logs step uploads). + for _ in 1 2 3 4 5 6 7 8 9 10; do + if ! 
kill -0 $NETLAUNCH_PID 2>/dev/null; then + break + fi + sleep 1 + done + kill -KILL -"$NETLAUNCH_PGID" 2>/dev/null || true + wait $NETLAUNCH_PID 2>/dev/null || true + exit 0 + fi + + echo "Install marker not observed within timeout (or fatal seen)" + kill -TERM -"$NETLAUNCH_PGID" 2>/dev/null || true + sleep 5 + kill -KILL -"$NETLAUNCH_PGID" 2>/dev/null || true + exit 1 + workingDirectory: $(TRIDENT_SOURCE_DIR) + displayName: "🚀 Run netlaunch (AZL3 ISO installs AZL4 COSI)" + # 14 minutes covers the 12-minute install-success watcher + # plus a couple minutes of slack. + timeoutInMinutes: 14 + + - bash: | + set -eux + sudo ./bin/storm-trident helper wait-for-login -a \ + --vm-name "$(jq -r '.virtualmachines[0].name' $(TRIDENT_SOURCE_DIR)/tools/virt-deploy-metadata.json)" \ + --artifacts-folder "$(ob_outputDirectory)" + timeoutInMinutes: 5 + # `succeeded()` (not `succeededOrFailed()`) so a failed + # SSH-up after a "successful" netlaunch actually fails the + # stage. Combined with the tightened install marker above, + # this closes the structural bias-toward-green where the + # netlaunch wrapper could exit 0 on a false-positive log + # line and let everything downstream gloss over the failure. 
+ condition: succeeded() + workingDirectory: $(TRIDENT_SOURCE_DIR) + displayName: "📄 Wait for target OS to be reachable" + + - bash: | + set -eux + ./bin/storm-trident script capture-screenshot \ + --screenshot-filename "install-azl4.png" \ + --artifacts-folder "$(ob_outputDirectory)" + displayName: "📷 Capture screenshot" + workingDirectory: $(TRIDENT_SOURCE_DIR) + condition: succeededOrFailed() + + - bash: | + set -eux + sudo ./bin/storm-trident helper display-logs -a \ + --serial-log-artifact-file-name "azl4-install-target-os-A-serial.log" \ + --trident-trace-log-file "$(TRIDENT_SOURCE_DIR)/logstream-full.log" \ + --artifacts-folder "$(ob_outputDirectory)" + displayName: "📄 Display install logs" + workingDirectory: $(TRIDENT_SOURCE_DIR) + condition: succeededOrFailed() + + - bash: | + set -eux + sudo virsh shutdown virtdeploy-vm-0 || true + mkdir -p $(ob_outputDirectory) + sudo cp /var/lib/libvirt/images/virtdeploy-pool/virtdeploy-vm-0-0-volume.qcow2 $(ob_outputDirectory)/ || true + sudo zstd -T0 $(ob_outputDirectory)/virtdeploy-vm-0-0-volume.qcow2 || true + sudo cp $(TRIDENT_SOURCE_DIR)/tests/e2e_tests/helpers/key $(ob_outputDirectory) || true + # Owner-only readable. Previously this was `chmod 777` + # which produced a SARIF-flaggable artifact even though + # the key is per-build ephemeral. + sudo chmod 600 $(ob_outputDirectory)/key || true + workingDirectory: $(TRIDENT_SOURCE_DIR) + condition: failed() + displayName: "Publish OS disk on failure" + + - template: ../testing_common/fix-output-directory-for-one-branch-step.yml + parameters: + outputDir: $(ob_outputDirectory) + condition: always() diff --git a/crates/osmodifier/src/grub_cfg.rs b/crates/osmodifier/src/grub_cfg.rs index ade45dca9..46dd82c97 100644 --- a/crates/osmodifier/src/grub_cfg.rs +++ b/crates/osmodifier/src/grub_cfg.rs @@ -18,6 +18,10 @@ use crate::OsModifierContext; /// Possible grub.cfg locations, tried in order. 
const GRUB_CFG_PATHS: &[&str] = &["/boot/grub2/grub.cfg", "/boot/grub/grub.cfg"]; +/// BLS (Boot Loader Spec) entry directory. Fedora-based distros (including +/// AZL4) store kernel boot entries here instead of inline in grub.cfg. +const BLS_ENTRIES_DIR: &str = "/boot/loader/entries"; + /// Extract boot arguments from the generated grub.cfg. /// /// Returns a tuple of (args_to_sync, optional_root_device). @@ -37,7 +41,14 @@ pub fn extract_boot_args_from_grub_cfg( // Find the non-recovery linux command lines. // Go expects exactly one; error otherwise. - let linux_lines = find_non_recovery_linux_lines(&content)?; + let linux_lines = match find_non_recovery_linux_lines(&content) { + Ok(lines) => lines, + Err(_) if content.contains("blscfg") => { + debug!("grub.cfg uses BLS (blscfg); reading boot args from BLS entries"); + extract_options_from_bls_entries(ctx)? + } + Err(e) => return Err(e), + }; if linux_lines.len() != 1 { bail!( "expected 1 non-recovery linux line, found {}", @@ -94,6 +105,67 @@ fn find_grub_cfg(ctx: &OsModifierContext) -> Result { bail!("Could not find grub.cfg at any of: {:?}", GRUB_CFG_PATHS) } +/// Read boot arguments from BLS (Boot Loader Spec) entries. +/// +/// Scans `{root}/boot/loader/entries/*.conf` in ascending path order, skips +/// entries whose title contains "rescue" or "recovery" (case-insensitive), +/// and returns the `options` of the first remaining entry that has one. +/// NOTE(review): GRUB's blscfg may list newest versions first; confirm order. +fn extract_options_from_bls_entries(ctx: &OsModifierContext) -> Result, Error> { + let entries_dir = ctx.path(BLS_ENTRIES_DIR); + let mut conf_files: Vec = fs::read_dir(&entries_dir) + .with_context(|| format!("Failed to read BLS entries dir '{}'", entries_dir.display()))?
+ .filter_map(|e| e.ok()) + .map(|e| e.path()) + .filter(|p| p.extension().is_some_and(|ext| ext == "conf")) + .collect(); + + conf_files.sort(); + + for conf_path in &conf_files { + let content = fs::read_to_string(conf_path) + .with_context(|| format!("Failed to read BLS entry '{}'", conf_path.display()))?; + + let mut title = None; + let mut options = None; + + for line in content.lines() { + if let Some(value) = line.strip_prefix("title") { + title = Some(value.trim().to_string()); + } else if let Some(value) = line.strip_prefix("options") { + options = Some(value.trim().to_string()); + } + } + + // Skip recovery/rescue entries. + if let Some(ref t) = title { + let lower = t.to_lowercase(); + if lower.contains("rescue") || lower.contains("recovery") { + trace!( + "Skipping BLS rescue/recovery entry: {}", + conf_path.display() + ); + continue; + } + } + + if let Some(opts) = options { + debug!( + "Using BLS entry '{}': options = {opts}", + conf_path.display() + ); + // Return as a synthetic "linux" line: prepend a dummy kernel path + // so the downstream parser (which skips the first token) works. + return Ok(vec![format!("/boot/vmlinuz {opts}")]); + } + } + + bail!( + "no non-recovery BLS entry found in '{}'", + entries_dir.display() + ) +} + /// Return the first whitespace-delimited word from a line, or None if the /// line is empty / whitespace-only. fn first_word(line: &str) -> Option<&str> { @@ -757,4 +829,120 @@ mod tests { assert_eq!(count_braces("menuentry 'title {x}' {"), (1, 0)); assert_eq!(count_braces(r#"menuentry "title {x}" {"#), (1, 0)); } + + // ======================= BLS entry support ======================= + + #[test] + fn test_extract_bls_fallback() { + let tmp = tempdir().unwrap(); + + // Write a BLS-style grub.cfg (contains blscfg, no inline linux lines) + let grub_dir = tmp.path().join("boot/grub2"); + std::fs::create_dir_all(&grub_dir).unwrap(); + std::fs::write( + grub_dir.join("grub.cfg"), + indoc::indoc! 
{r#" + set timeout=5 + load_env -f /boot/grub2/grubenv + blscfg + "#}, + ) + .unwrap(); + + // Write a BLS entry + let bls_dir = tmp.path().join("boot/loader/entries"); + std::fs::create_dir_all(&bls_dir).unwrap(); + std::fs::write( + bls_dir.join("azl4.conf"), + indoc::indoc! {r#" + title Azure Linux 4.0 (6.6.60) + version 6.6.60 + linux /boot/vmlinuz-6.6.60 + initrd /boot/initramfs-6.6.60.img + options root=/dev/sda2 ro selinux=1 rd.overlayfs=lower,upper,work,/dev/sda5 + "#}, + ) + .unwrap(); + + let ctx = OsModifierContext { + root: tmp.path().to_path_buf(), + }; + + let (args, root_device) = extract_boot_args_from_grub_cfg(&ctx).unwrap(); + assert_eq!(root_device, Some("/dev/sda2".to_string())); + assert!(args.contains(&"selinux=1".to_string())); + } + + #[test] + fn test_extract_bls_skips_recovery() { + let tmp = tempdir().unwrap(); + + let grub_dir = tmp.path().join("boot/grub2"); + std::fs::create_dir_all(&grub_dir).unwrap(); + std::fs::write(grub_dir.join("grub.cfg"), "set timeout=5\nblscfg\n").unwrap(); + + let bls_dir = tmp.path().join("boot/loader/entries"); + std::fs::create_dir_all(&bls_dir).unwrap(); + + // Rescue entry (should be skipped) + std::fs::write( + bls_dir.join("rescue.conf"), + indoc::indoc! {r#" + title Azure Linux 4.0 rescue + version 6.6.60 + linux /boot/vmlinuz-6.6.60 + initrd /boot/initramfs-6.6.60.img + options root=/dev/sda2 ro single + "#}, + ) + .unwrap(); + + // Normal entry (should be used) + std::fs::write( + bls_dir.join("zzz-normal.conf"), + indoc::indoc! 
{r#" + title Azure Linux 4.0 (6.6.60) + version 6.6.60 + linux /boot/vmlinuz-6.6.60 + initrd /boot/initramfs-6.6.60.img + options root=/dev/sda2 ro selinux=1 + "#}, + ) + .unwrap(); + + let ctx = OsModifierContext { + root: tmp.path().to_path_buf(), + }; + + let (args, root_device) = extract_boot_args_from_grub_cfg(&ctx).unwrap(); + assert_eq!(root_device, Some("/dev/sda2".to_string())); + assert!(args.contains(&"selinux=1".to_string())); + // "single" from rescue entry should NOT appear + assert!(!args.iter().any(|a| a.contains("single"))); + } + + #[test] + fn test_extract_bls_no_entries() { + let tmp = tempdir().unwrap(); + + let grub_dir = tmp.path().join("boot/grub2"); + std::fs::create_dir_all(&grub_dir).unwrap(); + std::fs::write(grub_dir.join("grub.cfg"), "set timeout=5\nblscfg\n").unwrap(); + + // Empty BLS entries dir + let bls_dir = tmp.path().join("boot/loader/entries"); + std::fs::create_dir_all(&bls_dir).unwrap(); + + let ctx = OsModifierContext { + root: tmp.path().to_path_buf(), + }; + + let result = extract_boot_args_from_grub_cfg(&ctx); + assert!(result.is_err()); + let err_msg = result.unwrap_err().to_string(); + assert!( + err_msg.contains("no non-recovery BLS entry found"), + "Error should mention no BLS entries, got: {err_msg}" + ); + } } diff --git a/crates/osutils/src/grub.rs b/crates/osutils/src/grub.rs index 92782bbf7..352064bee 100644 --- a/crates/osutils/src/grub.rs +++ b/crates/osutils/src/grub.rs @@ -231,9 +231,18 @@ impl GrubConfig { } /// Update the search command in the GRUB config. + /// + /// Three variants of the GRUB stub `search` line exist in practice: + /// + /// 1. The upstream legacy form: `search -n -u -s` + /// 2. AZL3 / standard form: `search --no-floppy --fs-uuid --set=root ` + /// 3. AZL4 / Fedora-based form: `search --fs-uuid --set=root ` + /// (`--no-floppy` is a Mariner-specific convention; Fedora's grub2 + /// scripts don't emit it, and it's redundant on EFI machines.) 
pub fn update_search(&mut self, uuid: &Uuid) -> Result<(), Error> { let re = Regex::new(r"(?m)^(\s*)search -n -u [\w-]+ -s$").unwrap(); let re2 = Regex::new(r"(?m)^(\s*)search --no-floppy --fs-uuid --set=root [\w-]+$").unwrap(); + let re3 = Regex::new(r"(?m)^(\s*)search --fs-uuid --set=root [\w-]+$").unwrap(); if re.is_match(&self.contents) { self.contents = re @@ -246,6 +255,13 @@ impl GrubConfig { &format!("${{1}}search --no-floppy --fs-uuid --set=root {uuid}"), ) .to_string(); + } else if re3.is_match(&self.contents) { + self.contents = re3 + .replace( + &self.contents, + &format!("${{1}}search --fs-uuid --set=root {uuid}"), + ) + .to_string(); } else { bail!( "Unable to find search command in '{}'", @@ -953,6 +969,52 @@ mod tests { .unwrap(); } + #[test] + fn test_update_search_azl3_form() { + // AZL3 stubs use `search --no-floppy --fs-uuid --set=root `. + let mut grub_config = GrubConfig { + path: PathBuf::new(), + contents: indoc::indoc! { r#" + set timeout=0 + search --no-floppy --fs-uuid --set=root deadbeef-cafe-babe-0000-111122223333 + "# } + .to_owned(), + linux_command_line: None, + }; + + let new_uuid = Uuid::parse_str("9e6a9d2c-b7fe-4359-ac45-18b505e29d8c").unwrap(); + grub_config.update_search(&new_uuid).unwrap(); + + assert!(grub_config.contents.contains(&format!( + "search --no-floppy --fs-uuid --set=root {new_uuid}" + ))); + assert!(!grub_config.contents.contains("deadbeef")); + } + + #[test] + fn test_update_search_azl4_form() { + // AZL4 (Fedora-based) stubs omit --no-floppy. + let mut grub_config = GrubConfig { + path: PathBuf::new(), + contents: indoc::indoc! 
{ r#" + set timeout=0 + search --fs-uuid --set=root deadbeef-cafe-babe-0000-111122223333 + "# } + .to_owned(), + linux_command_line: None, + }; + + let new_uuid = Uuid::parse_str("9e6a9d2c-b7fe-4359-ac45-18b505e29d8c").unwrap(); + grub_config.update_search(&new_uuid).unwrap(); + + assert!(grub_config + .contents + .contains(&format!("search --fs-uuid --set=root {new_uuid}"))); + assert!(!grub_config.contents.contains("deadbeef")); + // Must not accidentally insert --no-floppy. + assert!(!grub_config.contents.contains("--no-floppy")); + } + #[test] fn test_update_rootdevice() { // Define original GRUB config contents on target machine diff --git a/crates/osutils/src/mkinitrd.rs b/crates/osutils/src/mkinitrd.rs index c6ab3d2e1..d01831826 100644 --- a/crates/osutils/src/mkinitrd.rs +++ b/crates/osutils/src/mkinitrd.rs @@ -118,6 +118,8 @@ mod functional_test { fn test_regenerate_initrd() { let pattern = if osrelease::is_azl3().unwrap() { "/boot/initramfs-*.azl3.img" + } else if osrelease::is_azl4().unwrap() { + "/boot/initramfs-*.azl4.img" } else { "/boot/initrd.img-*" }; diff --git a/crates/osutils/src/osrelease.rs b/crates/osutils/src/osrelease.rs index e51926e74..c39981c6f 100644 --- a/crates/osutils/src/osrelease.rs +++ b/crates/osutils/src/osrelease.rs @@ -31,6 +31,11 @@ pub fn is_azl3() -> Result { Ok(OsRelease::read()?.get_distro().is_azl3()) } +/// Returns whether the host is running Azure Linux 4. +pub fn is_azl4() -> Result { + Ok(OsRelease::read()?.get_distro().is_azl4()) +} + /// Represents the contents of the /etc/os-release file. 
/// /// See @@ -146,6 +151,8 @@ impl OsRelease { AzureLinuxRelease::AzL2 } else if v.starts_with("3.") { AzureLinuxRelease::AzL3 + } else if v.starts_with("4.") { + AzureLinuxRelease::AzL4 } else { trace!("Unknown Azure Linux release: {v}"); AzureLinuxRelease::Other @@ -342,6 +349,10 @@ impl Distro { self == &Distro::AzureLinux(AzureLinuxRelease::AzL3) } + pub fn is_azl4(&self) -> bool { + self == &Distro::AzureLinux(AzureLinuxRelease::AzL4) + } + pub fn is_acl(&self) -> bool { self == &Distro::AzureContainerLinux } @@ -354,6 +365,7 @@ pub enum AzureLinuxRelease { Other, AzL2, AzL3, + AzL4, } #[cfg(test)] @@ -429,6 +441,41 @@ mod tests { ); } + #[test] + fn test_parse_azl4() { + let data = indoc::indoc! { + r#" + NAME="Azure Linux" + VERSION="4.0 (Four Alpha2)" + RELEASE_TYPE=development + ID=azurelinux + ID_LIKE=fedora + VERSION_ID="4.0" + VERSION_CODENAME="" + PRETTY_NAME="Azure Linux 4.0 (Four Alpha2)" + ANSI_COLOR="0;38;2;60;110;180" + LOGO=azurelinux-logo-icon + CPE_NAME="cpe:/o:azurelinuxproject:azurelinux:4.0" + DEFAULT_HOSTNAME="azurelinux" + HOME_URL="https://aka.ms/azurelinux" + DOCUMENTATION_URL="https://aka.ms/azurelinux" + SUPPORT_URL="https://aka.ms/azurelinux" + BUG_REPORT_URL="https://aka.ms/azurelinux" + SUPPORT_END=2026-05-15 + "#, + }; + + let os_release = OsRelease::parse(data); + assert_eq!(os_release.id, Some("azurelinux".to_string())); + assert_eq!(os_release.version_id, Some("4.0".to_string())); + assert_eq!(os_release.id_like, Some("fedora".to_string())); + assert_eq!(os_release.release_type, Some("development".to_string())); + assert_eq!( + os_release.get_distro(), + Distro::AzureLinux(AzureLinuxRelease::AzL4) + ); + } + #[test] fn test_parse_extension_release() { let data = indoc::indoc! 
{ diff --git a/crates/osutils/src/sfdisk.rs b/crates/osutils/src/sfdisk.rs index 81eef21c7..f40276ad9 100644 --- a/crates/osutils/src/sfdisk.rs +++ b/crates/osutils/src/sfdisk.rs @@ -197,6 +197,61 @@ pub fn get_disk_uuid(disk: &Path) -> Result, Error> { Ok(Some(uuid)) } +/// Sets the disk-id (GPT header DiskGUID) of the given disk via sfdisk. +/// +/// `uuid` must parse as a valid GUID; this is checked before invoking +/// sfdisk so an accidental flag-like string (e.g. `--foo`) is rejected +/// here rather than mis-interpreted by sfdisk as an option. +/// +/// `--no-reread` + `--no-tell-kernel` are passed because the typical +/// caller is `trident offline-initialize` inside MIC's chroot, where +/// the disk's partitions are bind-mounted into the chroot. Requesting +/// `BLKRRPART` on a disk with mounted partitions returns EBUSY; we +/// only care about updating the on-disk GPT here. +pub fn set_disk_uuid(disk: &Path, uuid: &str) -> Result<(), Error> { + uuid::Uuid::parse_str(uuid) + .with_context(|| format!("'{uuid}' is not a valid GUID for sfdisk --disk-id"))?; + Dependency::Sfdisk + .cmd() + .arg("--no-reread") + .arg("--no-tell-kernel") + .arg("--disk-id") + .arg(disk) + .arg(uuid) + .run_and_check() + .context(format!( + "Failed to set disk-id on {} to {uuid}", + disk.display() + ))?; + Ok(()) +} + +/// Sets the GPT partition UUID for a specific partition by number on the +/// given disk. +/// +/// `uuid` is validated as a GUID first to avoid sfdisk mis-interpreting +/// a flag-like argument. `--no-reread` / `--no-tell-kernel` mirror +/// [`set_disk_uuid`] for safety inside MIC chroots with mounted +/// partitions. 
+pub fn set_part_uuid(disk: &Path, partition_number: usize, uuid: &str) -> Result<(), Error> { + uuid::Uuid::parse_str(uuid) + .with_context(|| format!("'{uuid}' is not a valid GUID for sfdisk --part-uuid"))?; + Dependency::Sfdisk + .cmd() + .arg("--no-reread") + .arg("--no-tell-kernel") + .arg("--part-uuid") + .arg(disk) + .arg(partition_number.to_string()) + .arg(uuid) + .run_and_check() + .context(format!( + "Failed to set partition UUID on {} partition {partition_number} to {uuid}", + disk.display() + ))?; + Ok(()) +} + #[cfg(test)] mod tests { use super::*; diff --git a/crates/osutils/src/testutils/osrelease.rs b/crates/osutils/src/testutils/osrelease.rs index 6feff02bc..27a2e5b17 100644 --- a/crates/osutils/src/testutils/osrelease.rs +++ b/crates/osutils/src/testutils/osrelease.rs @@ -38,11 +38,36 @@ const AZURE_LINUX_3_OS_RELEASE: &str = indoc::indoc! { "#, }; +/// Azure Linux 4.0 sample os-release file. +const AZURE_LINUX_4_OS_RELEASE: &str = indoc::indoc! { + r#" + NAME="Azure Linux" + VERSION="4.0 (Cloud Variant Beta)" + RELEASE_TYPE=development + ID=azurelinux + ID_LIKE=fedora + VERSION_ID="4.0" + VERSION_CODENAME="" + PRETTY_NAME="Azure Linux 4.0 (Cloud Variant Beta)" + ANSI_COLOR="0;38;2;60;110;180" + LOGO=azurelinux-logo-icon + CPE_NAME="cpe:/o:azurelinuxproject:azurelinux:4.0" + DEFAULT_HOSTNAME="azurelinux" + HOME_URL="https://aka.ms/azurelinux" + DOCUMENTATION_URL="https://aka.ms/azurelinux" + SUPPORT_URL="https://aka.ms/azurelinux" + BUG_REPORT_URL="https://aka.ms/azurelinux" + VARIANT="Cloud Variant" + VARIANT_ID=cloud + "#, +}; + /// Creates a mock /etc/os-release file with the given Azure Linux release. 
pub fn make_mock_os_release(root_path: &Path, azl_release: AzureLinuxRelease) -> Result<(), Error> { let os_release_content = match azl_release { AzureLinuxRelease::AzL2 => AZURE_LINUX_2_OS_RELEASE, AzureLinuxRelease::AzL3 => AZURE_LINUX_3_OS_RELEASE, + AzureLinuxRelease::AzL4 => AZURE_LINUX_4_OS_RELEASE, AzureLinuxRelease::Other => bail!("Unsupported Azure Linux release 'other'"), }; diff --git a/crates/trident/src/engine/boot/grub.rs b/crates/trident/src/engine/boot/grub.rs index b345f5c31..fb25b59c8 100644 --- a/crates/trident/src/engine/boot/grub.rs +++ b/crates/trident/src/engine/boot/grub.rs @@ -63,9 +63,10 @@ pub(super) fn update_configs(ctx: &EngineContext) -> Result<(), Error> { let boot_grub_config_path = Path::new(ROOT_MOUNT_POINT_PATH).join(GRUB2_CONFIG_RELATIVE_PATH); // Update GRUB config on the boot device (volume holding /boot) - match ctx.host_os_release.get_distro() { - Distro::AzureLinux(AzureLinuxRelease::AzL3) => { - update_grub_config_azl3(ctx, &root_device_path, &boot_grub_config_path)?; + // Use the *image* distro (the OS being installed), not the host (MOS ISO). + match ctx.image_distro() { + Distro::AzureLinux(AzureLinuxRelease::AzL3 | AzureLinuxRelease::AzL4) => { + update_grub_config(ctx, &root_device_path, &boot_grub_config_path)?; } d => bail!("Unsupported distro for GRUB config update: {d:?}"), @@ -86,7 +87,7 @@ pub(super) fn update_configs(ctx: &EngineContext) -> Result<(), Error> { } -/// Updates the GRUB config for Azure Linux 3.0 using OS modifier. +/// Updates the GRUB config for Azure Linux 3.0 and 4.0 using OS modifier. -fn update_grub_config_azl3( +fn update_grub_config( ctx: &EngineContext, root_device_path: &Path, boot_grub_config_path: &Path, diff --git a/crates/trident/src/engine/context/mod.rs b/crates/trident/src/engine/context/mod.rs index 73fe61f4e..4632acabc 100644 --- a/crates/trident/src/engine/context/mod.rs +++ b/crates/trident/src/engine/context/mod.rs @@ -441,8 +441,20 @@ impl EngineContext { } /// Retrieves the distribution of the OS image.
+ /// + /// Prefers the image's own os-release (e.g., from the COSI being installed). + /// Falls back to the host os-release only when no image is mounted + /// (functional tests, runtime operations outside an install flow). + /// + /// If an image IS present but its distro is unrecognized, the image's + /// distro is returned as-is (Distro::Other) so callers can bail + /// explicitly rather than silently using the host's distro. pub(crate) fn image_distro(&self) -> Distro { - self.image_os_release().get_distro() + if self.image.is_some() { + self.image_os_release().get_distro() + } else { + self.host_os_release.get_distro() + } } } diff --git a/crates/trident/src/init/offline/mod.rs b/crates/trident/src/init/offline/mod.rs index cdbeee23d..ef3d03872 100644 --- a/crates/trident/src/init/offline/mod.rs +++ b/crates/trident/src/init/offline/mod.rs @@ -11,7 +11,7 @@ use anyhow::{bail, Error}; use log::{debug, info, trace, warn}; use maplit::hashmap; -use osutils::lsblk; +use osutils::{lsblk, sfdisk}; use sysdefs::partition_types::DiscoverablePartitionType; use trident_api::{ config::{ @@ -256,22 +256,131 @@ fn generate_host_status( .structured(ExecutionEnvironmentMisconfigurationError::PrismChrootEnvironment) .message("Failed to find root device in lsblk output")?; - let disk_uuid = lsblk_device + let disk_uuid = match lsblk_device .ptuuid .clone() .and_then(|ptuuid| ptuuid.as_uuid()) - .structured(ExecutionEnvironmentMisconfigurationError::PrismChrootEnvironment) - .message("No UUID found for root device")?; + { + Some(uuid) => uuid, + None => { + // lsblk didn't surface a PTUUID. This can happen in chroot + // environments (e.g. image-customizer / MIC) where the + // exposed loop device has partition children but the GPT + // disk-id either isn't set on the partition table or isn't + // populated by lsblk's PTUUID column. 
Fall back to sfdisk + // (which reads the GPT directly), and if that also reports + // no disk-id, mint one and persist it so the resulting + // image carries it forward to runtime. + let disk_dev_path = PathBuf::from("/dev").join(&lsblk_device.name); + warn!( + "PTUUID not reported by lsblk for {}; falling back to sfdisk", + disk_dev_path.display() + ); + let from_sfdisk = sfdisk::get_disk_uuid(&disk_dev_path) + .structured(ExecutionEnvironmentMisconfigurationError::PrismChrootEnvironment) + .message("Failed to read GPT disk-id via sfdisk")? + .and_then(|u| u.as_uuid()); + match from_sfdisk { + Some(uuid) => uuid, + None => { + let new_uuid = uuid::Uuid::new_v4(); + warn!( + "No GPT disk-id present on {}; assigning {}", + disk_dev_path.display(), + new_uuid + ); + sfdisk::set_disk_uuid(&disk_dev_path, &new_uuid.to_string()) + .structured( + ExecutionEnvironmentMisconfigurationError::PrismChrootEnvironment, + ) + .message(format!( + "Failed to assign GPT disk-id on {}", + disk_dev_path.display() + ))?; + new_uuid + } + } + } + }; lsblk_device.children.sort_by_key(|p| p.partn); - for (i, part) in lsblk_device.children.iter().enumerate() { - if part.part_uuid.is_none() { - return Err(TridentError::new( - ExecutionEnvironmentMisconfigurationError::PrismChrootEnvironment, - )) - .message(format!("No part UUID found for partition {}", i + 1)); + // Compute disk_dev_path once for partition-UUID fallback below. + let disk_dev_path = PathBuf::from("/dev").join(&lsblk_device.name); + + // For each partition, ensure we have a usable PARTUUID. Mirror the + // disk-id fallback above: prefer lsblk, then sfdisk, then mint a + // fresh one and persist it via sfdisk. Some chroot environments + // don't surface PARTUUID via lsblk --output-all and may also leave + // the value unset on the underlying GPT. 
+ for (i, part) in lsblk_device.children.iter_mut().enumerate() { + if part.part_uuid.as_ref().and_then(|u| u.as_uuid()).is_some() { + continue; } + let partn = part.partn.unwrap_or((i + 1) as u32) as usize; + warn!( + "PARTUUID not reported by lsblk for partition {} on {}; falling back to sfdisk", + partn, + disk_dev_path.display() + ); + // Re-read the disk via sfdisk -J to find any UUID already present + // on this partition (sfdisk reads the GPT directly). + let sf_info = sfdisk::SfDisk::get_info(&disk_dev_path) + .structured(ExecutionEnvironmentMisconfigurationError::PrismChrootEnvironment) + .message(format!( + "Failed to read GPT info via sfdisk for {}", + disk_dev_path.display() + ))?; + if let Some(existing) = sf_info + .partitions + .iter() + .find(|p| p.number == partn) + .and_then(|p| p.id.as_uuid()) + { + // Intended to keep sfdisk's reported (upper-case) form so that + // /dev/disk/by-partuuid/ lookups match. NOTE(review): `as_uuid()` then + // `to_string()` looks like a re-render of the parsed value — confirm. + part.part_uuid = Some(existing.to_string().into()); + continue; + } + + let new_uuid = uuid::Uuid::new_v4(); + warn!( + "Partition {} on {} has no PARTUUID; assigning {}", + partn, + disk_dev_path.display(), + new_uuid + ); + sfdisk::set_part_uuid(&disk_dev_path, partn, &new_uuid.to_string()) + .structured(ExecutionEnvironmentMisconfigurationError::PrismChrootEnvironment) + .message(format!( + "Failed to assign PARTUUID on partition {} of {}", + partn, + disk_dev_path.display() + ))?; + + // Re-read to get sfdisk's canonical on-disk form (upper-case) + // rather than stamping our locally-generated lower-case Uuid. + // Avoids a subtle case-mismatch with udev's + // /dev/disk/by-partuuid/ symlinks. + let written_uuid = sfdisk::SfDisk::get_info(&disk_dev_path) + .structured(ExecutionEnvironmentMisconfigurationError::PrismChrootEnvironment) + .message(format!( + "Failed to re-read GPT info via sfdisk for {} after writing partition UUID", + disk_dev_path.display() + ))?
+ .partitions + .iter() + .find(|p| p.number == partn) + .and_then(|p| p.id.as_uuid()) + .ok_or_else(|| { + TridentError::new(ExecutionEnvironmentMisconfigurationError::PrismChrootEnvironment) + }) + .message(format!( + "sfdisk reported no PARTUUID for partition {} after writing {}", + partn, new_uuid + ))?; + part.part_uuid = Some(written_uuid.to_string().into()); } // Get partition paths created from combining Prism history and lsblk output. @@ -494,12 +603,21 @@ pub fn execute( trace!("Prism history contents:\n{history_file}"); + // Note: `disk` is the *runtime* device path that will be written + // into the datastore (e.g. /dev/sda). At build time inside Prism's + // chroot, this path generally does not exist because the disk is + // exposed as a loop device (the actual build-time device is + // auto-detected below by walking lsblk for the mount at "/"). + // Older code asserted that `disk` exists at build time, but that + // check tested the wrong invariant and broke AZL4 image builds + // where MIC does not bind a /dev/sda node into the chroot.
let disk_path = Path::new(disk); if !disk_path.exists() { - return Err(TridentError::new( - ExecutionEnvironmentMisconfigurationError::PrismChrootEnvironment, - )) - .message(format!("Prism chroot environment doesn't contain {disk}")); + debug!( + "Runtime disk path {} not present in build environment; \ + this is expected when running inside MIC's chroot.", + disk_path.display() + ); } let history: Vec = diff --git a/crates/trident/src/subsystems/esp.rs b/crates/trident/src/subsystems/esp.rs index e3073aa8b..1ba98ea41 100644 --- a/crates/trident/src/subsystems/esp.rs +++ b/crates/trident/src/subsystems/esp.rs @@ -6,7 +6,7 @@ use std::{ }; use anyhow::{bail, ensure, Context, Error}; -use log::{debug, trace}; +use log::{debug, trace, warn}; use reqwest::Url; use tempfile::{NamedTempFile, TempDir}; @@ -290,8 +290,12 @@ fn copy_file_artifacts( // Copy the UKI from the image into the ESP directory uki::stage_uki_on_esp(temp_mount_dir, mount_point, &ctx.esp_mount_path)?; - } else { - // In non-UKI mode, bail if grub_noprefix.efi is not found in the image. + } else if ctx.image_distro().is_azl3() { + // AZL3 ships two GRUB variants: grub2-efi-binary (prefix-relative + // config lookup) and grub2-efi-binary-noprefix (root-device-relative + // config lookup). Trident's A/B update path requires the noprefix + // variant. If the image shipped the wrong one, fail early rather + // than producing an unbootable machine. ensure!( grub_noprefix || ctx @@ -558,7 +562,6 @@ fn copy_boot_files( esp_dir: &Path, boot_files: Vec, ) -> Result { - // Track whether grub-noprefix.efi is used let mut no_prefix = false; // Copy the specified files from temp_mount_path to esp_dir_path for boot_file in boot_files.iter() { @@ -605,6 +608,69 @@ fn copy_boot_files( Ok(no_prefix) } +/// Search EFI vendor directories for a specific binary. +/// +/// UEFI convention: each OS vendor installs its bootloader under +/// `EFI//` (e.g., `EFI/fedora/`, `EFI/azurelinux/`). 
+/// This function searches all subdirectories of the EFI directory +/// for the specified binary, skipping the BOOT fallback directory. +/// +/// Vendor dirs are iterated in sorted (lexicographic) order so the +/// selection is reproducible across builds when more than one vendor +/// directory contains a candidate. `read_dir` order alone is +/// filesystem-dependent (ext4 returns hash order, FAT returns +/// directory-entry order), which would produce irreproducible ESP +/// images on cross-builds and break attestation/PCR lock for the +/// selected bootloader. +fn find_efi_binary_in_vendor_dirs(efi_dir: &Path, binary_name: &str) -> Option { + let entries = match std::fs::read_dir(efi_dir) { + Ok(e) => e, + Err(e) => { + debug!("Cannot read EFI directory '{}': {}", efi_dir.display(), e); + return None; + } + }; + + // Materialize entries first so we can sort, and so a per-entry + // iterator error is logged instead of silently dropped. + let mut paths: Vec = Vec::new(); + for entry in entries { + match entry { + Ok(e) => paths.push(e.path()), + Err(e) => warn!( + "Failed to read entry under EFI directory '{}': {}", + efi_dir.display(), + e + ), + } + } + paths.sort(); + + for path in paths { + if !path.is_dir() { + continue; + } + + // Skip the BOOT directory (already checked by the caller) + if let Some(name) = path.file_name().and_then(|n| n.to_str()) { + if name.eq_ignore_ascii_case("BOOT") { + continue; + } + } + + let candidate = path.join(binary_name); + if candidate.exists() && candidate.is_file() { + debug!( + "Found GRUB EFI executable in vendor directory: '{}'", + candidate.display() + ); + return Some(candidate); + } + } + + None +} + /// Generates a list of filepaths to the boot files that need to be copied to implement file-based /// update of ESP, relative to the mounted directory. 
/// @@ -642,24 +708,35 @@ fn generate_boot_filepaths(temp_mount_dir: &Path, is_uki: bool) -> Result /etc/sudoers.d/testing-user +os: + selinux: + mode: enforcing + netplan: + version: 2 + ethernets: + vmeths: + match: + name: enp* + dhcp4: true + users: + - name: testing-user + sshPublicKeys: [] + sshMode: key-only diff --git a/tests/e2e_tests/trident_configurations/rollback-azl4/test-selection.yaml b/tests/e2e_tests/trident_configurations/rollback-azl4/test-selection.yaml new file mode 100644 index 000000000..cbfa81bbe --- /dev/null +++ b/tests/e2e_tests/trident_configurations/rollback-azl4/test-selection.yaml @@ -0,0 +1,3 @@ +compatible: + - rollback + - rollback-azl4 diff --git a/tests/e2e_tests/trident_configurations/rollback-azl4/trident-config.yaml b/tests/e2e_tests/trident_configurations/rollback-azl4/trident-config.yaml new file mode 100644 index 000000000..67a2c574a --- /dev/null +++ b/tests/e2e_tests/trident_configurations/rollback-azl4/trident-config.yaml @@ -0,0 +1,80 @@ +image: + url: http://NETLAUNCH_HOST_ADDRESS/files/regular.cosi + sha384: ignored +storage: + disks: + - id: os + device: /dev/disk/by-path/pci-0000:00:1f.2-ata-2 + partitionTableType: gpt + partitions: + - id: root-a + type: root + size: 8G + - id: root-b + type: root + size: 8G + - id: esp + type: esp + size: 1G + - id: trident + type: linux-generic + size: 1G + - id: disk2 + device: /dev/disk/by-path/pci-0000:00:1f.2-ata-3 + partitionTableType: gpt + partitions: [] + abUpdate: + volumePairs: + - id: root + volumeAId: root-a + volumeBId: root-b + filesystems: + - deviceId: trident + source: new + mountPoint: /var/lib/trident + - deviceId: esp + mountPoint: + path: /boot/efi + options: umask=0077 + - deviceId: root + mountPoint: / +os: + additionalFiles: + - destination: /var/lib/trident/local-health-check-file.sh + content: | + echo 'This is a local health check script.' 
+ exit 0 +health: + checks: + - name: invoke-rollback-from-local-script + runOn: + - clean-install + path: /var/lib/trident/local-health-check-file.sh + - name: invoke-rollback-from-script + runOn: + - clean-install + content: | + exit 1 + - name: install-failure-systemd-check + runOn: + - clean-install + systemdServices: + - non-existent-service1.service + - non-existent-service2.service + timeoutSeconds: 15 +# AZL4 variant of the AZL3 `health-checks-install/` scenario. +# - No `users`/`selinux`/`netplan` — these are baked into the test image +# at MIC build time. +# - `os.additionalFiles` is used because health.checks references +# `path: /var/lib/trident/local-health-check-file.sh`, which needs to +# be on the target filesystem. +# +# Health-check failure expectations (asserted by tests/e2e_tests/rollback_test.py): +# - State transitions to `not-provisioned` (clean-install has no slot to +# roll back to; the install just fails). +# - `/var/lib/trident/trident-health-check-failure-*.log` is created. 
+# - The log contains: +# * `"Failed health check(s)"` +# * `"Script 'invoke-rollback-from-script' failed"` +# * `"Unit non-existent-service1.service could not be found"` +# * `"Unit non-existent-service2.service could not be found"` diff --git a/tests/images/builder/__init__.py b/tests/images/builder/__init__.py index ca82f58db..2881fe851 100644 --- a/tests/images/builder/__init__.py +++ b/tests/images/builder/__init__.py @@ -3,7 +3,7 @@ from dataclasses import dataclass, field, fields from enum import Enum from pathlib import Path -from typing import List, Optional +from typing import List, Optional, Union @dataclass @@ -16,6 +16,9 @@ class BaseImage(Enum): BAREMETAL = BaseImageData("baremetal", Path("artifacts/baremetal.vhdx")) CORE_SELINUX = BaseImageData("core_selinux", Path("artifacts/core_selinux.vhdx")) QEMU_GUEST = BaseImageData("qemu_guest", Path("artifacts/qemu_guest.vhdx")) + AZL4_QEMU_GUEST = BaseImageData( + "azl4_qemu_guest", Path("artifacts/azl4_qemu_guest.vhdx") + ) CORE_ARM64 = BaseImageData("core_arm64", Path("artifacts/core_arm64.vhdx")) MINIMAL = BaseImageData("minimal", Path("artifacts/minimal.vhdx")) MINIMAL_AARCH64 = BaseImageData( @@ -60,6 +63,34 @@ class BaseImageManifest: glob: str = "*.vhdx" +@dataclass +class BlobImageManifest: + """Manifest for a base image fetched from Azure Storage Blob. + + Used for distros that don't yet publish to an ADO universal artifact + feed (e.g., Azure Linux 4.0 alpha builds). The storage account name + and container are NOT baked in here -- they are supplied at + invocation time via the --blob-storage-account / --blob-container + flags (or the BLOB_STORAGE_ACCOUNT / BLOB_CONTAINER env vars) so the + pipeline can parameterize them and rotate the location without a + code change. + + Authentication is via `az` CLI logged-in identity (`--auth-mode + login`). The pipeline running this must have a federated identity + with read access to the storage account. 
+ """ + + image: BaseImage + # Blob name prefix to search under + # (e.g. "azure-linux/core-efi-vhdx-4.0-amd64") + path_prefix: str + # Suffix the final blob name must end with. + # The downloader lists all blobs under path_prefix, filters to ones + # ending with this suffix, and picks the lexically largest (= most + # recent version) to download. + file_suffix: str = "/image.vhdx" + + class OutputFormat(Enum): BAREMETAL_IMAGE = "baremetal-image" COSI = "cosi" @@ -249,7 +280,9 @@ class ArtifactManifest: customizer_version: str customizer_container: str customizer_container_full: str = None - base_images: List[BaseImageManifest] = field(default_factory=list) + base_images: List[Union["BaseImageManifest", "BlobImageManifest"]] = field( + default_factory=list + ) def __post_init__(self): if self.customizer_container_full is None: @@ -264,7 +297,9 @@ def kebab_fields(cls) -> List[str]: """Return a list of fields in kebab-case.""" return [f.name.replace("_", "-") for f in fields(cls)] - def find_base_image(self, img: BaseImage) -> Optional[BaseImageManifest]: + def find_base_image( + self, img: BaseImage + ) -> Optional[Union["BaseImageManifest", "BlobImageManifest"]]: """Find a base image by its name.""" for base_image in self.base_images: if base_image.image == img: diff --git a/tests/images/builder/cli.py b/tests/images/builder/cli.py index 741f0c239..784c4c534 100644 --- a/tests/images/builder/cli.py +++ b/tests/images/builder/cli.py @@ -1,6 +1,7 @@ import argparse from enum import Enum import logging +import os from pathlib import Path from typing import List @@ -183,14 +184,26 @@ def setup_parser_download_image( ) -> None: parser_download_img = subparsers.add_parser( SubCommand.DOWNLOAD_IMAGE.value, - help="Download a base image from the Azure DevOps feed", + help="Download a base image.", ) parser_download_img.set_defaults(artifacts=artifacts) parser_download_img.add_argument( "image", - help="The image to download", + help="The image to download.", 
choices=[c.image.name for c in artifacts.base_images], ) + parser_download_img.add_argument( + "--blob-storage-account", + default=os.environ.get("BLOB_STORAGE_ACCOUNT"), + help="Azure Storage account name for blob-sourced images. " + "Env: BLOB_STORAGE_ACCOUNT.", + ) + parser_download_img.add_argument( + "--blob-container", + default=os.environ.get("BLOB_CONTAINER"), + help="Azure Storage container name for blob-sourced images. " + "Env: BLOB_CONTAINER.", + ) def setup_parser_matrix( @@ -285,6 +298,8 @@ def run_cmd( run.download_base_image( artifacts=args.artifacts, name=args.image, + blob_storage_account=args.blob_storage_account, + blob_container=args.blob_container, ) elif subcommand == SubCommand.MATRIX: run.generate_matrix( diff --git a/tests/images/builder/download.py b/tests/images/builder/download.py index 6f9db4c9f..56a1313af 100644 --- a/tests/images/builder/download.py +++ b/tests/images/builder/download.py @@ -1,9 +1,15 @@ +import json +import logging +import os +import re from pathlib import Path import shutil import subprocess import tempfile -from builder import BaseImageManifest +from builder import BaseImageManifest, BlobImageManifest + +log = logging.getLogger(__name__) def download_base_image(image: BaseImageManifest) -> None: @@ -39,3 +45,140 @@ def download_base_image(image: BaseImageManifest) -> None: # Copy the .vhdx file to the target location shutil.copy2(vhdx_files[0], image.image.path) + + +# Constrain blob filename selection to a date-prefixed shape so a stray +# blob with a name that lexically sorts last (`zzz-evil/image.vhdx`) +# cannot win selection. Matches `YYYYMMDD/` or `YYYY-MM-DD/`-style +# version prefixes, which is the upstream publisher's convention. +# +# This is defense against a broader governance issue: the storage account +# is owned by another team, so write access is out of Trident's control. 
+# The regex narrows the attack surface to "names matching this shape" +# while still letting us track the latest published version. Tracked +# longer-term in the AZL4 supply-chain governance discussion. +_BLOB_NAME_VERSION_RE = re.compile(r"/([^/]*\d{4}-?\d{2}-?\d{2}[^/]*)/") + + +def download_blob_image( + image: BlobImageManifest, + storage_account: str, + container: str, +) -> None: + """Download a base image from Azure Storage Blob. + + Lists blobs under `image.path_prefix`, filters to ones whose name + matches a date-prefixed version pattern AND ends with + `image.file_suffix`, picks the lexically largest (= most recent + date), and downloads it atomically to `image.image.path`. + + Requires `az` CLI with a logged-in identity that has read access + to the storage account. Uses `--auth-mode login` so no storage + keys are needed. + """ + if not storage_account or not container: + raise RuntimeError( + f"Blob storage account/container required to download " + f"'{image.image.name}'. Pass --blob-storage-account and " + f"--blob-container, or set BLOB_STORAGE_ACCOUNT and " + f"BLOB_CONTAINER env vars." + ) + + az = shutil.which("az") + if az is None: + raise RuntimeError( + "az CLI not found on PATH; required to fetch blob-sourced " + "base images. Install azure-cli." + ) + + log.info( + f"Listing blobs in '{storage_account}/{container}' under " + f"prefix '{image.path_prefix}/'" + ) + # No `--query` interpolation: do the filtering in Python so caller + # control of `image.file_suffix` (or any other field that might + # become externally settable later) cannot inject JMESPath. 
+ list_proc = subprocess.run( + [ + az, + "storage", + "blob", + "list", + "--auth-mode", + "login", + "--account-name", + storage_account, + "--container-name", + container, + "--prefix", + f"{image.path_prefix}/", + "--query", + "[].name", + "-o", + "json", + ], + check=True, + capture_output=True, + text=True, + ) + all_names = json.loads(list_proc.stdout) + suffix = image.file_suffix + eligible = [ + n for n in all_names if n.endswith(suffix) and _BLOB_NAME_VERSION_RE.search(n) + ] + if not eligible: + raise RuntimeError( + f"No date-versioned blobs ending with '{suffix}' found under " + f"'{image.path_prefix}/' in '{storage_account}/{container}' " + f"(saw {len(all_names)} total blobs under the prefix)" + ) + + latest = sorted(eligible)[-1] + log.info(f"Latest: {latest}") + + image.image.path.parent.mkdir(parents=True, exist_ok=True) + + # Download to a sibling temp file then atomically rename. `az + # storage blob download` writes in place — if the step is killed + # (timeout / OOM / agent reboot) between create and complete, the + # next run sees a truncated VHDX and MIC fails with an opaque + # error. The temp-then-rename pattern guarantees the target either + # has the full bytes or doesn't exist. + target = image.image.path + fd, tmp_path = tempfile.mkstemp( + prefix=target.name + ".", + suffix=".part", + dir=str(target.parent), + ) + os.close(fd) + try: + subprocess.run( + [ + az, + "storage", + "blob", + "download", + "--auth-mode", + "login", + "--account-name", + storage_account, + "--container-name", + container, + "--name", + latest, + "--file", + tmp_path, + "--output", + "none", + ], + check=True, + ) + os.replace(tmp_path, target) + except BaseException: + # On any failure, remove the temp file so we don't leave + # partial-state debris next to the final path. 
+ try: + os.unlink(tmp_path) + except FileNotFoundError: + pass + raise diff --git a/tests/images/builder/run.py b/tests/images/builder/run.py index d465beb2f..8c93bdcb1 100644 --- a/tests/images/builder/run.py +++ b/tests/images/builder/run.py @@ -3,7 +3,7 @@ import json from typing import List, Optional -from builder import ImageConfig, RpmSources, ArtifactManifest +from builder import ArtifactManifest, BlobImageManifest, ImageConfig, RpmSources from .builder import build_image from .convert import convert_image from . import download @@ -148,6 +148,8 @@ def download_base_image( *, artifacts: ArtifactManifest, name: str, + blob_storage_account: Optional[str] = None, + blob_container: Optional[str] = None, ) -> None: image_manifest = next( (img for img in artifacts.base_images if img.image.name == name), None @@ -155,7 +157,15 @@ def download_base_image( if image_manifest is None: raise ValueError(f"Image '{name}' not found in artifacts") log.info(f"Downloading base image '{name}' to '{image_manifest.image.path}'") - download.download_base_image(image_manifest) + + if isinstance(image_manifest, BlobImageManifest): + download.download_blob_image( + image_manifest, + storage_account=blob_storage_account, + container=blob_container, + ) + else: + download.download_base_image(image_manifest) def generate_matrix( diff --git a/tests/images/testimages.py b/tests/images/testimages.py index 9ab341cba..e71f9fda0 100755 --- a/tests/images/testimages.py +++ b/tests/images/testimages.py @@ -7,6 +7,7 @@ ArtifactManifest, BaseImage, BaseImageManifest, + BlobImageManifest, ImageConfig, OutputFormat, SystemArchitecture, @@ -132,6 +133,43 @@ config_file="base/updateimg-grub.yaml", ssh_key="files/id_rsa.pub", ), + ImageConfig( + # AZL4 (Fedora-derived) variant of trident-vm-grub-testimage. + # The base VHDX is pulled from Azure Storage (see + # BlobImageManifest below) since there is no AzureLinuxArtifacts + # ADO feed entry for AZL4 yet. 
The Trident binary is baked in + # via additionalFiles because the trident-service RPM is not + # yet packaged for AZL4. + "trident-vm-grub-testimage-azl4", + base_image=BaseImage.AZL4_QEMU_GUEST, + config="trident-vm-testimage", + config_file="base/updateimg-grub-azl4.yaml", + ssh_key="files/id_rsa.pub", + # No trident-service RPM for AZL4 yet — the binary is delivered + # via additionalFiles. extra_dependencies enforces it is in place + # before the image is built. + requires_trident=False, + extra_dependencies=[ + Path("tests/images/trident-vm-testimage/base/trident-bin/trident"), + ], + ), + ImageConfig( + # AZL4 BASE qcow2: a bootable disk with the AZL4 OS plus trident + # installed, so storm-trident rollback testing can boot a VM and + # immediately drive A/B updates targeting the .cosi above. + # Mirrors AZL3's `make artifacts/trident-vm-grub-testimage.qcow2` + # path. See baseimg-grub-azl4.yaml for the layout / package set. + "trident-vm-grub-testimage-azl4-base", + base_image=BaseImage.AZL4_QEMU_GUEST, + config="trident-vm-testimage", + config_file="base/baseimg-grub-azl4.yaml", + output_format=OutputFormat.QCOW2, + ssh_key="files/id_rsa.pub", + requires_trident=False, + extra_dependencies=[ + Path("tests/images/trident-vm-testimage/base/trident-bin/trident"), + ], + ), ImageConfig( "trident-vm-grub-verity-testimage", base_image=BaseImage.QEMU_GUEST, @@ -246,6 +284,23 @@ package_name="minimal_vhdx-3.0-stable", version="*", ), + BlobImageManifest( + # Azure Linux 4.0 base VHDX from the AZL preview gallery's + # backing storage. Pinned to a specific daily build — bump + # the version segment in path_prefix to pick up a newer one. 
+ # + # Source gallery: + # azlpubDevGallery2mruiyvi / azure-linux-4-daily-x64 + # subscription e4ab81f8-030f-4593-a8f2-3ea2c7630a19 + # RG azl-acg-preview-publishing + # + # Storage account + container are supplied at runtime via + # --blob-storage-account / --blob-container CLI flags or + # the BLOB_STORAGE_ACCOUNT / BLOB_CONTAINER env vars. + image=BaseImage.AZL4_QEMU_GUEST, + path_prefix="staging/azure-linux-4-daily-x64/4.0.2026051502", + file_suffix=".vhdfixed", + ), ], ) diff --git a/tests/images/trident-vm-testimage/README.md b/tests/images/trident-vm-testimage/README.md index e527ae04e..7d4379ed5 100644 --- a/tests/images/trident-vm-testimage/README.md +++ b/tests/images/trident-vm-testimage/README.md @@ -35,3 +35,49 @@ To build the update images, run: | ----------- | --------------------------------------- | ----------------------------------- | | Regular | `make trident-vm-grub-testimage` | `artifacts/trident-vm-grub-testimage/*` | | With verity | `make trident-vm-grub-verity-testimage` | `artifacts/trident-vm-grub-testimage/*` | + +## AZL4 variant (`trident-vm-grub-testimage-azl4`) + +A Fedora-derived (Azure Linux 4.0) variant lives alongside the AZL3 image +above. It uses `base/updateimg-grub-azl4.yaml` instead of +`base/updateimg-grub.yaml` and consumes `BaseImage.AZL4_QEMU_GUEST`. + +### Two extra prerequisites for AZL4 + +1. **AZL4 base VHDX.** No prebuilt AZL4 VHDX is available in the ADO + Artifacts feed yet, so build one locally with Image Customizer: + + ```bash + sudo imagecustomizer create \ + --config-file path/to/azl4-qemu-guest.yaml \ + --rpm-source path/to/azl4.repo \ + --tools-file path/to/azl4-tools.tar.gz \ + --build-dir /tmp/azl4-base-build \ + --output-image-file artifacts/azl4_qemu_guest.vhdx \ + --output-image-format vhdx \ + --distro azurelinux --distro-version 4.0 + ``` + + See `wiki/playbooks/trident-azl4-e2e-manual.md` in the karhu repo for + a ready-to-paste base config and the alpha2 repo URL. 
+ + When an AZL4 VHDX lands in the ADO feed, add a `BaseImageManifest` + entry for `AZL4_QEMU_GUEST` in `testimages.py` so `cli download` + fetches it the same way it does the AZL3 bases. + +2. **Trident binary baked in.** The AZL4 image bakes + `/usr/bin/trident` via `additionalFiles` because there is no + `trident-service` RPM packaged for AZL4 yet. Drop the built binary + at `base/trident-bin/trident` before invoking the builder: + + ```bash + mkdir -p base/trident-bin + cp /path/to/built/trident base/trident-bin/trident + chmod +x base/trident-bin/trident + ``` + + The binary should be built from a stack including the AZL4 enabling + branches: `azl4-1-grub-native` + `azl4-2-esp-layouts` + + `azl4-3-configure-bls` + `azl4-4-osconfig-hostname`. Once those land + on main, a plain main build suffices. The `base/trident-bin/` + directory is gitignored. diff --git a/tests/images/trident-vm-testimage/base/baseimg-grub-azl4.yaml b/tests/images/trident-vm-testimage/base/baseimg-grub-azl4.yaml new file mode 100644 index 000000000..6237475c2 --- /dev/null +++ b/tests/images/trident-vm-testimage/base/baseimg-grub-azl4.yaml @@ -0,0 +1,189 @@ +# Base image config for trident-vm-grub-testimage-azl4. +# +# This builds the BOOTABLE base qcow2 that storm-trident rollback tests +# start the VM from. After this qcow2 boots, trident is installed and ready +# to drive A/B updates to the .cosi produced by updateimg-grub-azl4.yaml. +# +# Layout mirrors AZL3's baseimg-grub.yaml (A/B partitions) but uses +# AZL4-specific package names (dnf5, grub2-efi-x64, shim, etc.) matching +# the updateimg-grub-azl4.yaml flavor. +# +# TODO(azl4-rpm): Drop the trident additionalFiles entries +# once an AZL4 trident-service RPM is published. Until then we bake +# the binary inline.
+ +storage: + disks: + - partitionTableType: gpt + maxSize: 10G + partitions: + - id: esp + type: esp + # 64M (vs AZL3's 16M) because AZL4 ships larger grub/shim + # binaries (~5MB grubx64.efi) and trident's offline-init + # copies them to both /boot/efi/EFI/AZLA and /AZLB. + size: 64M + + - id: root-a + size: 4G + + - id: root-b + size: 4G + + - id: trident + size: 1G + + - id: srv + size: grow + + bootType: efi + + filesystems: + - deviceId: esp + type: fat32 + mountPoint: + path: /boot/efi + options: umask=0077 + + - deviceId: root-a + type: ext4 + mountPoint: / + + - deviceId: trident + type: ext4 + mountPoint: /var/lib/trident + + - deviceId: srv + type: ext4 + mountPoint: /srv + +os: + bootloader: + resetType: hard-reset + hostname: trident-vm-testimg + + selinux: + mode: disabled + + kernelCommandLine: + # Mirrors AZL3 baseimg-grub.yaml; same console + debug settings so + # serial output works the same on both flavors. `net.ifnames=0` + # keeps interface naming as eth0/eth1/... so the + # `99-dhcp-eth0.network` systemd-networkd config matches the only + # virtio NIC the qemu test VM ships with. + extraCommandLine: + - console=tty0 + - console=tty1 + - console=ttyS0 + - net.ifnames=0 + - rd.debug + - loglevel=6 + - log_buf_len=1M + - systemd.journald.forward_to_console=1 + + packages: + install: + # AZL4 equivalents of the AZL3 set. See updateimg-grub-azl4.yaml + # for the rationale on each substitution. + - curl + - dnf5 + - efibootmgr + - grub2-efi-x64 + - grub2-efi-x64-modules + - grub2-tools + - grub2-tools-efi + - iproute + - iptables-nft + - jq + - lsof + - netplan + - openssh-server + - shim + - sudo + - systemd-networkd + - systemd-resolved + - vim + + services: + enable: + - sshd + - systemd-networkd + - systemd-resolved + # Trident socket-activated daemon. Storm-trident drives all + # update/commit/rollback through `trident grpc-client ...` which + # talks to this socket. + - tridentd.socket + # Oneshot trident commit at boot. 
Marks A/B update commits when + # they complete after reboot. + - trident.service + + additionalFiles: + # TODO(azl4-rpm): replace these binary copies and unit-file copies + # with `packages.install: - trident-service` once the RPM is + # published for AZL4. + - source: trident-bin/trident + destination: /usr/bin/trident + permissions: "755" + + # Trident systemd units. AZL3 gets these from the trident-service + # RPM; AZL4 doesn't have that RPM yet so we ship them inline. The + # contents come straight from packaging/systemd/ in this repo so a + # source change requires a re-build of the qcow2 to pick up. + - source: ../../../../packaging/systemd/trident.service + destination: /usr/lib/systemd/system/trident.service + - source: ../../../../packaging/systemd/tridentd.service + destination: /usr/lib/systemd/system/tridentd.service + - source: ../../../../packaging/systemd/tridentd.socket + destination: /usr/lib/systemd/system/tridentd.socket + + # AZL4 lacks a /usr/bin/hostname binary; the pytest framework + # smoke-tests SSH with `hostname`, so we ship a tiny shim. + - source: files/hostname-shim.sh + destination: /usr/local/bin/hostname + permissions: "755" + - source: files/sudoers-wheel + destination: /etc/sudoers.d/wheel + - source: files/99-dhcp-eth0.network + destination: /etc/systemd/network/99-dhcp-eth0.network + - source: files/regen-sshd-keys.service + destination: /etc/systemd/system/regen-sshd-keys.service + + users: + - name: testing-user + sshPublicKeyPaths: + - files/id_rsa.pub + secondaryGroups: + - wheel + +scripts: + postCustomization: + # Mirrors AZL3's baseimg-grub.yaml ordering: post-install runs + # first, then we bake the trident datastore at build time (so first + # boot is fast and storm-trident can immediately drive updates), + # then ssh + network housekeeping, then initrd rebuild + xattr + # strip last. + - path: scripts/post-install.sh + # Bake trident's hoststatus into the datastore at build time. 
AZL3 + # does this via update-host-status.sh; AZL4 uses the same pattern + # via update-host-status-azl4.sh. Requires trident's offline-init + # to tolerate the absence of /dev/sda inside MIC's chroot (the + # `disk` argument is a runtime label, not a build-time assertion); + # the fix lives in crates/trident/src/init/offline/mod.rs. + - path: scripts/update-host-status-azl4.sh + - path: scripts/enable-trident-service-azl4.sh + - path: scripts/ssh-move-host-keys-azl4.sh + - path: scripts/enable-regen-sshd-keys.sh + # Rebuild initramfs with --no-hostonly + extra SATA drivers so the + # qcow2 boots regardless of which bus the consumer's libvirt config + # picks (storm-trident uses bus=sata; the original boot test on + # karhu-ubuntu used bus=virtio). MUST run BEFORE strip-selinux-xattrs + # because dracut writes new files with the build-time SELinux + # context, and we want those stripped too. + - path: scripts/rebuild-initrd-azl4.sh + # Strip security.selinux xattrs from all files. See updateimg-grub- + # azl4.yaml for the parallel write-up; the same MOS-side AZL3 + # SELinux policy rejects AZL4 contexts when any future operation + # tries to preserve them. Keeping the qcow2 label-free is defensive. + # MUST run LAST so it sweeps any files produced by earlier scripts + # (initrd, etc.). + - path: scripts/strip-selinux-xattrs.sh diff --git a/tests/images/trident-vm-testimage/base/files/hostname-shim.sh b/tests/images/trident-vm-testimage/base/files/hostname-shim.sh new file mode 100644 index 000000000..b12b3807c --- /dev/null +++ b/tests/images/trident-vm-testimage/base/files/hostname-shim.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# AZL4 doesn't ship a `hostname` binary in `coreutils` (Fedora moved it to +# its own package which AZL4 hasn't picked up yet). The pytest E2E +# framework uses `hostname` as a smoke test of the SSH session in +# tests/e2e_tests/conftest.py, so without this shim every test errors out +# at fixture setup. 
+# +# Tiny POSIX-only replacement that reads /etc/hostname, plus a passthrough +# for `hostname -s` and `hostname -f` for completeness. +case "$1" in + -s|--short) + cat /etc/hostname | cut -d. -f1 + ;; + -f|--fqdn|"") + cat /etc/hostname + ;; + *) + cat /etc/hostname + ;; +esac diff --git a/tests/images/trident-vm-testimage/base/files/regen-sshd-keys.service b/tests/images/trident-vm-testimage/base/files/regen-sshd-keys.service new file mode 100644 index 000000000..0fe938ddc --- /dev/null +++ b/tests/images/trident-vm-testimage/base/files/regen-sshd-keys.service @@ -0,0 +1,14 @@ +[Unit] +Description=Generate sshd host keys in /var/srv on first boot +ConditionPathExists=!/var/srv/etc/ssh/ssh_host_ed25519_key +Before=sshd.service +After=local-fs.target + +[Service] +Type=oneshot +RemainAfterExit=yes +ExecStartPre=/usr/bin/mkdir -p /var/srv/etc/ssh +ExecStart=/usr/bin/ssh-keygen -A -f /var/srv -q + +[Install] +WantedBy=multi-user.target diff --git a/tests/images/trident-vm-testimage/base/scripts/enable-regen-sshd-keys.sh b/tests/images/trident-vm-testimage/base/scripts/enable-regen-sshd-keys.sh new file mode 100755 index 000000000..bdf901cd2 --- /dev/null +++ b/tests/images/trident-vm-testimage/base/scripts/enable-regen-sshd-keys.sh @@ -0,0 +1,7 @@ +#!/bin/bash +# regen-sshd-keys is a one-shot service that generates SSH host keys in +# /var/srv on first boot. Enable it via wants symlink because the generic +# `services.enable` in MIC config is reserved for systemd unit names that +# come from packages, and our unit is delivered via additionalFiles. 
+ln -sf /etc/systemd/system/regen-sshd-keys.service \ + /etc/systemd/system/multi-user.target.wants/regen-sshd-keys.service diff --git a/tests/images/trident-vm-testimage/base/scripts/enable-trident-service-azl4.sh b/tests/images/trident-vm-testimage/base/scripts/enable-trident-service-azl4.sh new file mode 100644 index 000000000..29889ea58 --- /dev/null +++ b/tests/images/trident-vm-testimage/base/scripts/enable-trident-service-azl4.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# Defensive enable of trident.service and tridentd.socket. +# +# AZL3 gets these via the trident-service RPM's %systemd_post scriptlet. +# AZL4 doesn't have that RPM yet, so we ship the units via additionalFiles +# and *should* be able to rely on baseimg-grub-azl4.yaml's `services.enable:` +# stanza. In practice, `services.enable` did not create the +# multi-user.target.wants/trident.service symlink in MIC AZL4 builds +# (build 1120959 showed multi-user.target reached but trident.service +# never started post-reboot, leaving servicingState stuck at +# ab-update-finalized). Until we figure out why, manually link the +# units defensively. +# +# tridentd.socket gets the same treatment because (a) if services.enable +# is unreliable for one unit, it's likely unreliable for the other, and +# (b) storm-trident drives every update/commit/rollback through the +# tridentd gRPC socket — a missing /run/trident/trident.sock at boot +# would fail every subsequent storm-trident invocation in the test +# pipeline. +set -euxo pipefail + +mkdir -p /etc/systemd/system/multi-user.target.wants +mkdir -p /etc/systemd/system/sockets.target.wants +ln -sf /usr/lib/systemd/system/trident.service \ + /etc/systemd/system/multi-user.target.wants/trident.service +ln -sf /usr/lib/systemd/system/tridentd.socket \ + /etc/systemd/system/sockets.target.wants/tridentd.socket + +# Belt and braces: log the enabled state for diagnostics. 
systemctl is-enabled +# may fail inside MIC's chroot without a running dbus, so don't gate the +# script on it. +systemctl is-enabled trident.service 2>&1 || true +systemctl is-enabled tridentd.socket 2>&1 || true +ls -l /etc/systemd/system/multi-user.target.wants/trident.service || true +ls -l /etc/systemd/system/sockets.target.wants/tridentd.socket || true diff --git a/tests/images/trident-vm-testimage/base/scripts/rebuild-initrd-azl4.sh b/tests/images/trident-vm-testimage/base/scripts/rebuild-initrd-azl4.sh new file mode 100644 index 000000000..b07b3a8c0 --- /dev/null +++ b/tests/images/trident-vm-testimage/base/scripts/rebuild-initrd-azl4.sh @@ -0,0 +1,62 @@ +#!/bin/bash +# Regenerate initrd with --no-hostonly so all storage drivers are +# included, not just the ones MIC's build environment happens to need. +# +# Why: storm-trident's rollback test (tools/storm/utils/vm/qemu/qemu.go) +# attaches the qcow2 to a virt-install VM with `bus=sata`. MIC builds +# the qcow2 in a virtio-backed environment, so dracut's default +# hostonly mode produces an initramfs with only virtio drivers. On a +# SATA-backed boot, the initramfs can't find the root partition by +# UUID and systemd hangs forever waiting for /dev/disk/by-uuid/. +# +# Rebuilding with --no-hostonly bakes in ahci, ata_piix, sata_sil, etc. +# along with virtio so the same qcow2 boots regardless of the bus type +# the consumer chooses. +# +# Runs inside the MIC chroot where /sys and /proc are bind-mounted but +# the host's SELinux is not loaded (MIC strips that), so dracut's +# cp -a doesn't hit the security.selinux setxattr issue that bites in +# AZL3 MOS during install (see strip-selinux-xattrs.sh for the parallel +# write-up). + +set -euo pipefail + +# Find the kernel version installed in this image. We require exactly +# one — `ls | head -1` would silently pick the wrong one if any future +# AZL4 variant ships multiple (kernel + kernel-hyperv, extramodules-*, +# etc.). 
Fail loudly rather than generate an initramfs for the wrong +# kernel: the failure mode of that misstep is "boot hangs waiting for +# /dev/disk/by-uuid/", which is the exact bug this script is +# meant to prevent. +KVERS=( /usr/lib/modules/* ) +case ${#KVERS[@]} in + 0) + echo "ERROR: no kernel modules dir under /usr/lib/modules" >&2 + exit 1 + ;; + 1) + KVER=$(basename "${KVERS[0]}") + ;; + *) + echo "ERROR: expected exactly one kernel under /usr/lib/modules, found:" >&2 + printf ' %s\n' "${KVERS[@]}" >&2 + exit 1 + ;; +esac +echo "Regenerating initramfs for kernel $KVER with --no-hostonly" + +# `--no-hostonly` includes all storage modules; `--no-hostonly-cmdline` +# prevents dracut from baking the build-host's /proc/cmdline parameters +# into the initramfs (which would fight the qcow2's grub cmdline at +# runtime); `--reproducible` keeps the output bit-stable across builds +# so we can detect spurious regenerations. +dracut \ + --no-hostonly \ + --no-hostonly-cmdline \ + --reproducible \ + --add-drivers "ahci ata_piix sata_sil sata_nv sata_via sd_mod" \ + --force \ + --kver "$KVER" + +echo "Regenerated initramfs:" +ls -lh /boot/initramfs-* diff --git a/tests/images/trident-vm-testimage/base/scripts/ssh-move-host-keys-azl4.sh b/tests/images/trident-vm-testimage/base/scripts/ssh-move-host-keys-azl4.sh new file mode 100755 index 000000000..ede3fdbaa --- /dev/null +++ b/tests/images/trident-vm-testimage/base/scripts/ssh-move-host-keys-azl4.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# AZL4-compatible variant of ssh-move-host-keys.sh. +# +# AZL3 sshd reads the main /etc/ssh/sshd_config and we appended HostKey +# lines to it. AZL4 sshd 10.0+ supports drop-ins under /etc/ssh/sshd_config.d/ +# which is the cleaner approach. +SSH_VAR_DIR="/var/srv/etc/ssh" +mkdir -p /etc/ssh/sshd_config.d +cat > /etc/ssh/sshd_config.d/50-trident-host-keys.conf <&1 >/dev/null) || rc=$? 
&& rc=${rc:-0} + if [ "$rc" -eq 0 ]; then + count=$((count + 1)) + elif echo "$err" | grep -qE "No such attribute|Operation not supported"; then + : # nothing to strip, expected for files without the xattr + else + fail_count=$((fail_count + 1)) + echo "setfattr failed on '$f': $err" >&2 + fi + rc=0 +done < <(find / \( -path /proc -o -path /sys -o -path /dev -o -path /run \) -prune \ + -o \( -type f -o -type d -o -type l \) -print0) + +echo "Stripped security.selinux from ${count} files/dirs" + +if [ "$fail_count" -gt 0 ]; then + echo "ERROR: setfattr failed (non-ENODATA) on ${fail_count} entries" >&2 + exit 1 +fi + +# Verify the strip actually took effect by scanning representative +# paths (/etc, /usr/lib/systemd, /usr/bin, plus /boot if present). Any +# residual security.selinux means we missed something — fail loudly. +# The getfattr output is captured, not piped into `grep -q`: under +# pipefail an early grep exit (SIGPIPE) would hide the match. +sentinel_dirs=( "/etc" "/usr/lib/systemd" "/usr/bin" ) +if [ -d /boot ]; then + sentinel_dirs+=( "/boot" ) +fi +for d in "${sentinel_dirs[@]}"; do + if [[ "$(getfattr -R -m security.selinux "$d" 2>/dev/null || true)" == *security.selinux* ]]; then + echo "ERROR: security.selinux xattr still present under '$d'" >&2 + getfattr -R -m security.selinux "$d" 2>/dev/null | head -10 >&2 || true + exit 1 + fi +done diff --git a/tests/images/trident-vm-testimage/base/scripts/update-host-status-azl4.sh b/tests/images/trident-vm-testimage/base/scripts/update-host-status-azl4.sh new file mode 100644 index 000000000..a2cfbe27f --- /dev/null +++ b/tests/images/trident-vm-testimage/base/scripts/update-host-status-azl4.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# AZL4 equivalent of AZL3's update-host-status.sh. +# +# Runs inside MIC's chroot at qcow2 build time. 
Populates the trident +# datastore with the host status derived from Prism's history.json so +# the system boots ready for storm-trident to drive A/B updates -- no +# first-boot bootstrap, no datastore creation at runtime. 
+# +# Mirrors AZL3's pattern (scripts/update-host-status.sh, called from +# baseimg-grub.yaml). The trident binary in the chroot must understand +# that `--disk /dev/sda` is the runtime label and not a build-time +# existence assertion; see trident PR fixing the spurious check in +# crates/trident/src/init/offline/mod.rs. +set -euxo pipefail + +trident offline-initialize diff --git a/tests/images/trident-vm-testimage/base/updateimg-grub-azl4.yaml b/tests/images/trident-vm-testimage/base/updateimg-grub-azl4.yaml new file mode 100644 index 000000000..9cee3c809 --- /dev/null +++ b/tests/images/trident-vm-testimage/base/updateimg-grub-azl4.yaml @@ -0,0 +1,152 @@ +storage: + bootType: efi + disks: + - maxSize: 5G + partitionTableType: gpt + partitions: + - id: esp + size: 64M + type: esp + - id: root + size: 4G + + filesystems: + - deviceId: esp + mountPoint: + options: umask=0077 + path: /boot/efi + type: fat32 + - deviceId: root + mountPoint: / + type: ext4 + +os: + hostname: trident-vm-testimg + packages: + install: + # AZL4 equivalents of the AZL3 set in updateimg-grub.yaml. + # Notable differences: + # - dnf5 (no `dnf` package on AZL4) + # - grub2-efi-x64 + grub2-efi-x64-modules (no `-noprefix` package) + # - iptables-nft (no plain `iptables` meta on AZL4 base) + # - netplan available but not strictly required (Trident host config + # drives target networking; we use systemd-networkd for the + # management environment) + # - shim added (AZL4 ships a real signed shim chain) + # - trident-service intentionally omitted: no AZL4 RPM yet, the + # binary is delivered via additionalFiles below. 
+ - curl + - dnf5 + - efibootmgr + - grub2-efi-x64 + - grub2-efi-x64-modules + - grub2-tools + - grub2-tools-efi + - iproute + - iptables-nft + - jq + - lsof + - netplan + - openssh-server + - shim + - sudo + - systemd-networkd + - systemd-resolved + - vim + bootloader: + resetType: hard-reset + selinux: + mode: disabled + kernelCommandLine: + extraCommandLine: + - console=tty0 + - console=ttyS0 + - net.ifnames=0 + - rd.debug + - loglevel=6 + - log_buf_len=1M + - systemd.journald.forward_to_console=1 + services: + enable: + - sshd + - systemd-networkd + - systemd-resolved + # Trident socket-activated daemon. Storm-trident drives all + # update/commit/rollback through `trident grpc-client ...` which + # talks to this socket. + - tridentd.socket + # Oneshot trident commit at boot. Marks A/B update commits when + # they complete after reboot. The MIC `services.enable` stanza + # alone has not been sufficient to wire this in our AZL4 builds; + # postCustomization script enable-trident-service-azl4.sh + # belt-and-braces creates the multi-user.target.wants symlink. + - trident.service + users: + - name: testing-user + sshPublicKeyPaths: + - files/id_rsa.pub + secondaryGroups: + - wheel + additionalFiles: + # Bake the patched Trident binary into /usr/bin/trident. + # + # Path is resolved relative to this YAML file. Pipeline / local builds + # are expected to drop the binary at trident-bin/trident before invoking + # testimages.py build (see README in this directory). + # + # The destination is /usr/bin/trident, the same path the AZL4 + # trident-service RPM is expected to install to once it ships, so + # the two cannot coexist: when the RPM is added to packages.install, + # remove this entry so additionalFiles does not silently override the + # packaged binary. NOTE(review): if /usr/local/bin was intended + # instead, change `destination` and confirm the ExecStart paths of + # the trident units shipped below still resolve. 
To + # confirm which copy is in use at runtime, check + # `readlink -f $(command -v trident)`. + - source: trident-bin/trident + destination: /usr/bin/trident + permissions: "755" + # AZL4 lacks a /usr/bin/hostname binary; the pytest framework smoke- + # tests SSH with `hostname`, so we ship a tiny shim. + - source: files/hostname-shim.sh + destination: /usr/local/bin/hostname + permissions: "755" + - source: files/sudoers-wheel + destination: /etc/sudoers.d/wheel + - source: files/99-dhcp-eth0.network + destination: /etc/systemd/network/99-dhcp-eth0.network + - source: files/regen-sshd-keys.service + destination: /etc/systemd/system/regen-sshd-keys.service + + # Trident systemd units. AZL3 gets these from the trident-service + # RPM; AZL4 doesn't have that RPM yet, so we ship them inline in + # both base and update images so post-A/B-update boots have the + # trident-commit oneshot available on volume-b. + - source: ../../../../packaging/systemd/trident.service + destination: /usr/lib/systemd/system/trident.service + - source: ../../../../packaging/systemd/tridentd.service + destination: /usr/lib/systemd/system/tridentd.service + - source: ../../../../packaging/systemd/tridentd.socket + destination: /usr/lib/systemd/system/tridentd.socket + +scripts: + postCustomization: + # AZL3 image scripts are largely safe on AZL4, but a couple touch + # /etc paths that differ on AZL4. We use AZL4-specific copies + # as we discover divergence. + - path: scripts/post-install.sh + - path: scripts/ssh-move-host-keys-azl4.sh + - path: scripts/enable-regen-sshd-keys.sh + # Belt-and-braces enable of trident.service. services.enable in MIC + # AZL4 didn't create the multi-user.target.wants symlink reliably; + # this script does it explicitly. Without it, the post-A/B-update + # reboot can't run `trident commit`, leaving servicingState stuck at + # ab-update-finalized. + - path: scripts/enable-trident-service-azl4.sh + # Strip SELinux labels from all files. 
See the script header for the + # full rationale, but in short: the AZL4 base VHDX bakes in AZL4 + # SELinux contexts that Trident-from-AZL3-MOS can't preserve during + # install (MOS's loaded policy rejects unknown contexts via + # setxattr). Stripping at cosi build time sidesteps the cascade. + # Must run LAST so other postCustomization scripts don't re-label. + - path: scripts/strip-selinux-xattrs.sh