From 49b6d9b9ecc32b9da2ea07e04353716496f71790 Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Tue, 2 Jun 2026 17:39:55 -0700 Subject: [PATCH 01/26] engineering: Add AZL4 distro detection and extend GRUB update path Implements AzureLinuxRelease::AzL4 variant, VERSION_ID 4.x parsing, ID_LIKE=fedora matching, updated GRUB match arms for AzL3|AzL4, and image_distro() fallback to host os-release. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- crates/osutils/src/mkinitrd.rs | 2 + crates/osutils/src/osrelease.rs | 47 +++++++++++++++++++++++ crates/osutils/src/testutils/osrelease.rs | 25 ++++++++++++ crates/trident/src/engine/boot/grub.rs | 9 +++-- crates/trident/src/engine/context/mod.rs | 10 ++++- 5 files changed, 88 insertions(+), 5 deletions(-) diff --git a/crates/osutils/src/mkinitrd.rs b/crates/osutils/src/mkinitrd.rs index c6ab3d2e1..d01831826 100644 --- a/crates/osutils/src/mkinitrd.rs +++ b/crates/osutils/src/mkinitrd.rs @@ -118,6 +118,8 @@ mod functional_test { fn test_regenerate_initrd() { let pattern = if osrelease::is_azl3().unwrap() { "/boot/initramfs-*.azl3.img" + } else if osrelease::is_azl4().unwrap() { + "/boot/initramfs-*.azl4.img" } else { "/boot/initrd.img-*" }; diff --git a/crates/osutils/src/osrelease.rs b/crates/osutils/src/osrelease.rs index e51926e74..c39981c6f 100644 --- a/crates/osutils/src/osrelease.rs +++ b/crates/osutils/src/osrelease.rs @@ -31,6 +31,11 @@ pub fn is_azl3() -> Result { Ok(OsRelease::read()?.get_distro().is_azl3()) } +/// Returns whether the host is running Azure Linux 4. +pub fn is_azl4() -> Result { + Ok(OsRelease::read()?.get_distro().is_azl4()) +} + /// Represents the contents of the /etc/os-release file. 
/// /// See @@ -146,6 +151,8 @@ impl OsRelease { AzureLinuxRelease::AzL2 } else if v.starts_with("3.") { AzureLinuxRelease::AzL3 + } else if v.starts_with("4.") { + AzureLinuxRelease::AzL4 } else { trace!("Unknown Azure Linux release: {v}"); AzureLinuxRelease::Other @@ -342,6 +349,10 @@ impl Distro { self == &Distro::AzureLinux(AzureLinuxRelease::AzL3) } + pub fn is_azl4(&self) -> bool { + self == &Distro::AzureLinux(AzureLinuxRelease::AzL4) + } + pub fn is_acl(&self) -> bool { self == &Distro::AzureContainerLinux } @@ -354,6 +365,7 @@ pub enum AzureLinuxRelease { Other, AzL2, AzL3, + AzL4, } #[cfg(test)] @@ -429,6 +441,41 @@ mod tests { ); } + #[test] + fn test_parse_azl4() { + let data = indoc::indoc! { + r#" + NAME="Azure Linux" + VERSION="4.0 (Four Alpha2)" + RELEASE_TYPE=development + ID=azurelinux + ID_LIKE=fedora + VERSION_ID="4.0" + VERSION_CODENAME="" + PRETTY_NAME="Azure Linux 4.0 (Four Alpha2)" + ANSI_COLOR="0;38;2;60;110;180" + LOGO=azurelinux-logo-icon + CPE_NAME="cpe:/o:azurelinuxproject:azurelinux:4.0" + DEFAULT_HOSTNAME="azurelinux" + HOME_URL="https://aka.ms/azurelinux" + DOCUMENTATION_URL="https://aka.ms/azurelinux" + SUPPORT_URL="https://aka.ms/azurelinux" + BUG_REPORT_URL="https://aka.ms/azurelinux" + SUPPORT_END=2026-05-15 + "#, + }; + + let os_release = OsRelease::parse(data); + assert_eq!(os_release.id, Some("azurelinux".to_string())); + assert_eq!(os_release.version_id, Some("4.0".to_string())); + assert_eq!(os_release.id_like, Some("fedora".to_string())); + assert_eq!(os_release.release_type, Some("development".to_string())); + assert_eq!( + os_release.get_distro(), + Distro::AzureLinux(AzureLinuxRelease::AzL4) + ); + } + #[test] fn test_parse_extension_release() { let data = indoc::indoc! 
{ diff --git a/crates/osutils/src/testutils/osrelease.rs b/crates/osutils/src/testutils/osrelease.rs index 6feff02bc..27a2e5b17 100644 --- a/crates/osutils/src/testutils/osrelease.rs +++ b/crates/osutils/src/testutils/osrelease.rs @@ -38,11 +38,36 @@ const AZURE_LINUX_3_OS_RELEASE: &str = indoc::indoc! { "#, }; +/// Azure Linux 4.0 sample os-release file. +const AZURE_LINUX_4_OS_RELEASE: &str = indoc::indoc! { + r#" + NAME="Azure Linux" + VERSION="4.0 (Cloud Variant Beta)" + RELEASE_TYPE=development + ID=azurelinux + ID_LIKE=fedora + VERSION_ID="4.0" + VERSION_CODENAME="" + PRETTY_NAME="Azure Linux 4.0 (Cloud Variant Beta)" + ANSI_COLOR="0;38;2;60;110;180" + LOGO=azurelinux-logo-icon + CPE_NAME="cpe:/o:azurelinuxproject:azurelinux:4.0" + DEFAULT_HOSTNAME="azurelinux" + HOME_URL="https://aka.ms/azurelinux" + DOCUMENTATION_URL="https://aka.ms/azurelinux" + SUPPORT_URL="https://aka.ms/azurelinux" + BUG_REPORT_URL="https://aka.ms/azurelinux" + VARIANT="Cloud Variant" + VARIANT_ID=cloud + "#, +}; + /// Creates a mock /etc/os-release file with the given Azure Linux release. 
pub fn make_mock_os_release(root_path: &Path, azl_release: AzureLinuxRelease) -> Result<(), Error> { let os_release_content = match azl_release { AzureLinuxRelease::AzL2 => AZURE_LINUX_2_OS_RELEASE, AzureLinuxRelease::AzL3 => AZURE_LINUX_3_OS_RELEASE, + AzureLinuxRelease::AzL4 => AZURE_LINUX_4_OS_RELEASE, AzureLinuxRelease::Other => bail!("Unsupported Azure Linux release 'other'"), }; diff --git a/crates/trident/src/engine/boot/grub.rs b/crates/trident/src/engine/boot/grub.rs index b345f5c31..fb25b59c8 100644 --- a/crates/trident/src/engine/boot/grub.rs +++ b/crates/trident/src/engine/boot/grub.rs @@ -63,9 +63,10 @@ pub(super) fn update_configs(ctx: &EngineContext) -> Result<(), Error> { let boot_grub_config_path = Path::new(ROOT_MOUNT_POINT_PATH).join(GRUB2_CONFIG_RELATIVE_PATH); // Update GRUB config on the boot device (volume holding /boot) - match ctx.host_os_release.get_distro() { - Distro::AzureLinux(AzureLinuxRelease::AzL3) => { - update_grub_config_azl3(ctx, &root_device_path, &boot_grub_config_path)?; + // Use the *image* distro (the OS being installed), not the host (MOS ISO). + match ctx.image_distro() { + Distro::AzureLinux(AzureLinuxRelease::AzL3 | AzureLinuxRelease::AzL4) => { + update_grub_config(ctx, &root_device_path, &boot_grub_config_path)?; } d => bail!("Unsupported distro for GRUB config update: {d:?}"), @@ -86,7 +87,7 @@ pub(super) fn update_configs(ctx: &EngineContext) -> Result<(), Error> { } /// Updates the GRUB config for Azure Linux 3.0 using OS modifier. -fn update_grub_config_azl3( +fn update_grub_config( ctx: &EngineContext, root_device_path: &Path, boot_grub_config_path: &Path, diff --git a/crates/trident/src/engine/context/mod.rs b/crates/trident/src/engine/context/mod.rs index 73fe61f4e..f873ff947 100644 --- a/crates/trident/src/engine/context/mod.rs +++ b/crates/trident/src/engine/context/mod.rs @@ -441,8 +441,16 @@ impl EngineContext { } /// Retrieves the distribution of the OS image. 
+ /// + /// Prefers the image's own os-release (e.g., from the COSI being installed). + /// Falls back to the host os-release when no image is available (functional + /// tests, runtime operations outside an install flow). pub(crate) fn image_distro(&self) -> Distro { - self.image_os_release().get_distro() + let distro = self.image_os_release().get_distro(); + match distro { + Distro::Other => self.host_os_release.get_distro(), + d => d, + } } } From cca2a4c31ace91190e67af498178ede67b16a655 Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Wed, 3 Jun 2026 16:41:07 -0700 Subject: [PATCH 02/26] fix: Only fall back to host distro when no image is mounted image_distro() was falling back to the host os-release whenever the image's distro was Distro::Other. This silently masked unrecognized distros as the host distro, causing GRUB config to be written for the wrong OS. Now: if an image is mounted (self.image.is_some()), always use the image's distro. Fallback to host only fires when no image is present at all (functional tests, runtime operations). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- crates/trident/src/engine/context/mod.rs | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/crates/trident/src/engine/context/mod.rs b/crates/trident/src/engine/context/mod.rs index f873ff947..4632acabc 100644 --- a/crates/trident/src/engine/context/mod.rs +++ b/crates/trident/src/engine/context/mod.rs @@ -443,13 +443,17 @@ impl EngineContext { /// Retrieves the distribution of the OS image. /// /// Prefers the image's own os-release (e.g., from the COSI being installed). - /// Falls back to the host os-release when no image is available (functional - /// tests, runtime operations outside an install flow). + /// Falls back to the host os-release only when no image is mounted + /// (functional tests, runtime operations outside an install flow). 
+ /// + /// If an image IS present but its distro is unrecognized, the image's + /// distro is returned as-is (Distro::Other) so callers can bail + /// explicitly rather than silently using the host's distro. pub(crate) fn image_distro(&self) -> Distro { - let distro = self.image_os_release().get_distro(); - match distro { - Distro::Other => self.host_os_release.get_distro(), - d => d, + if self.image.is_some() { + self.image_os_release().get_distro() + } else { + self.host_os_release.get_distro() } } } From 460393ca1b3809e20b9b06b0f18464c2fbeb672a Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Tue, 2 Jun 2026 17:40:04 -0700 Subject: [PATCH 03/26] engineering: Generic EFI vendor-dir discovery and AZL4 ESP support Adds is_azl4_or_later() helper, generic EFI vendor-dir discovery via grub-probe, and AZL4 ESP partition layout support. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- crates/osutils/src/grub.rs | 109 +++++++++++++++- crates/osutils/src/osrelease.rs | 31 +++++ crates/trident/src/subsystems/esp.rs | 178 ++++++++++++++++++++++++--- 3 files changed, 298 insertions(+), 20 deletions(-) diff --git a/crates/osutils/src/grub.rs b/crates/osutils/src/grub.rs index 92782bbf7..dea58f2dd 100644 --- a/crates/osutils/src/grub.rs +++ b/crates/osutils/src/grub.rs @@ -231,22 +231,51 @@ impl GrubConfig { } /// Update the search command in the GRUB config. + /// + /// Three variants of the GRUB stub `search` line exist in practice: + /// + /// 1. The upstream legacy form: `search -n -u -s` + /// 2. AZL3 / standard form: `search --no-floppy --fs-uuid --set=root ` + /// 3. AZL4 MIC-generated form: `search --fs-uuid --set=root ` + /// (the `--no-floppy` option is redundant on EFI machines, so AZL4's + /// grub stub omits it.) + /// + /// We rewrite *every* matching line with the corresponding form so that + /// stubs containing more than one variant (rare but possible during + /// distribution transitions) all get the new UUID. 
We bail only if no + /// regex matched any line. pub fn update_search(&mut self, uuid: &Uuid) -> Result<(), Error> { let re = Regex::new(r"(?m)^(\s*)search -n -u [\w-]+ -s$").unwrap(); let re2 = Regex::new(r"(?m)^(\s*)search --no-floppy --fs-uuid --set=root [\w-]+$").unwrap(); + let re3 = Regex::new(r"(?m)^(\s*)search --fs-uuid --set=root [\w-]+$").unwrap(); + let mut matched = false; if re.is_match(&self.contents) { self.contents = re - .replace(&self.contents, &format!("${{1}}search -n -u {uuid} -s")) + .replace_all(&self.contents, &format!("${{1}}search -n -u {uuid} -s")) .to_string(); - } else if re2.is_match(&self.contents) { + matched = true; + } + if re2.is_match(&self.contents) { self.contents = re2 - .replace( + .replace_all( &self.contents, &format!("${{1}}search --no-floppy --fs-uuid --set=root {uuid}"), ) .to_string(); - } else { + matched = true; + } + if re3.is_match(&self.contents) { + self.contents = re3 + .replace_all( + &self.contents, + &format!("${{1}}search --fs-uuid --set=root {uuid}"), + ) + .to_string(); + matched = true; + } + + if !matched { bail!( "Unable to find search command in '{}'", &self.path.display() @@ -953,6 +982,78 @@ mod tests { .unwrap(); } + #[test] + fn test_update_search_azl3_form() { + // AZL3 stubs use `search --no-floppy --fs-uuid --set=root `. + let mut grub_config = GrubConfig { + path: PathBuf::new(), + contents: indoc::indoc! { r#" + set timeout=0 + search --no-floppy --fs-uuid --set=root deadbeef-cafe-babe-0000-111122223333 + "# } + .to_owned(), + linux_command_line: None, + }; + + let new_uuid = Uuid::parse_str("9e6a9d2c-b7fe-4359-ac45-18b505e29d8c").unwrap(); + grub_config.update_search(&new_uuid).unwrap(); + + assert!(grub_config.contents.contains(&format!( + "search --no-floppy --fs-uuid --set=root {new_uuid}" + ))); + assert!(!grub_config.contents.contains("deadbeef")); + } + + #[test] + fn test_update_search_azl4_form() { + // AZL4 MIC-generated stubs omit --no-floppy. 
+ let mut grub_config = GrubConfig { + path: PathBuf::new(), + contents: indoc::indoc! { r#" + set timeout=0 + search --fs-uuid --set=root deadbeef-cafe-babe-0000-111122223333 + "# } + .to_owned(), + linux_command_line: None, + }; + + let new_uuid = Uuid::parse_str("9e6a9d2c-b7fe-4359-ac45-18b505e29d8c").unwrap(); + grub_config.update_search(&new_uuid).unwrap(); + + assert!(grub_config + .contents + .contains(&format!("search --fs-uuid --set=root {new_uuid}"))); + assert!(!grub_config.contents.contains("deadbeef")); + // Must not accidentally insert --no-floppy. + assert!(!grub_config.contents.contains("--no-floppy")); + } + + #[test] + fn test_update_search_mixed_forms() { + // If both AZL3 and AZL4 forms appear (e.g. an image whose stub + // includes vendored fragments), both must be rewritten. + let mut grub_config = GrubConfig { + path: PathBuf::new(), + contents: indoc::indoc! { r#" + search --no-floppy --fs-uuid --set=root oldoldold-cafe-babe-0000-aaaabbbbcccc + search --fs-uuid --set=root oldoldold-cafe-babe-0000-aaaabbbbcccc + "# } + .to_owned(), + linux_command_line: None, + }; + + let new_uuid = Uuid::parse_str("9e6a9d2c-b7fe-4359-ac45-18b505e29d8c").unwrap(); + grub_config.update_search(&new_uuid).unwrap(); + + assert!(!grub_config.contents.contains("oldoldold")); + assert!(grub_config.contents.contains(&format!( + "search --no-floppy --fs-uuid --set=root {new_uuid}" + ))); + assert!(grub_config + .contents + .contains(&format!("search --fs-uuid --set=root {new_uuid}"))); + } + #[test] fn test_update_rootdevice() { // Define original GRUB config contents on target machine diff --git a/crates/osutils/src/osrelease.rs b/crates/osutils/src/osrelease.rs index c39981c6f..5d8caafe2 100644 --- a/crates/osutils/src/osrelease.rs +++ b/crates/osutils/src/osrelease.rs @@ -353,6 +353,37 @@ impl Distro { self == &Distro::AzureLinux(AzureLinuxRelease::AzL4) } + /// Returns true for AZL4 and any later Azure Linux release. 
+ /// + /// Use this when gating behavior on features that landed in AZL4 and + /// are expected to remain present in subsequent major releases (e.g. + /// AZL4 dropped the `grub2-efi-binary-noprefix` packaging convention; + /// AZL5+ is expected to keep that change). Strict `is_azl4()` would + /// silently regress to the AZL3 code path when AZL5 ships. + /// + /// The decision is based on the `AzureLinuxRelease` ordering AND, for + /// versions newer than what the parser recognizes, the numeric major + /// component of `version_id`. New major releases that the parser + /// hasn't been taught yet will fall through to `AzureLinuxRelease::Other`, + /// so we re-check `version_id` directly. + pub fn is_azl4_or_later(&self, version_id: Option<&str>) -> bool { + if let Distro::AzureLinux(rel) = self { + if matches!(rel, AzureLinuxRelease::AzL4) { + return true; + } + // Parser doesn't know this version yet; inspect version_id. + if matches!(rel, AzureLinuxRelease::Other) { + if let Some(major) = version_id + .and_then(|v| v.split('.').next()) + .and_then(|m| m.parse::().ok()) + { + return major >= 4; + } + } + } + false + } + pub fn is_acl(&self) -> bool { self == &Distro::AzureContainerLinux } diff --git a/crates/trident/src/subsystems/esp.rs b/crates/trident/src/subsystems/esp.rs index e3073aa8b..b7d16dc3c 100644 --- a/crates/trident/src/subsystems/esp.rs +++ b/crates/trident/src/subsystems/esp.rs @@ -6,7 +6,7 @@ use std::{ }; use anyhow::{bail, ensure, Context, Error}; -use log::{debug, trace}; +use log::{debug, trace, warn}; use reqwest::Url; use tempfile::{NamedTempFile, TempDir}; @@ -292,8 +292,24 @@ fn copy_file_artifacts( uki::stage_uki_on_esp(temp_mount_dir, mount_point, &ctx.esp_mount_path)?; } else { // In non-UKI mode, bail if grub_noprefix.efi is not found in the image. + // AZL4+ does not ship grub2-efi-binary-noprefix (AZL3-specific convention), + // so automatically skip this check for AZL4 and later. 
`is_azl4_or_later` + // handles AZL5+ correctly by re-checking version_id when the parser + // falls back to AzureLinuxRelease::Other. + // TODO: Two sources of truth for "noprefix not required" exist now: + // - this distro check + // - the filesystem probe in generate_boot_filepaths + // The probe is authoritative. Consider folding the check into the + // probe result (e.g. ensure! that *some* grub binary was found, + // not specifically the noprefix variant) in a follow-up. See + // 2026-05-18 PR-2 deep-review.md. + let image_os_release = ctx.image_os_release(); + let is_azl4_or_later = image_os_release + .get_distro() + .is_azl4_or_later(image_os_release.version_id.as_deref()); ensure!( grub_noprefix + || is_azl4_or_later || ctx .spec .internal_params @@ -605,6 +621,69 @@ fn copy_boot_files( Ok(no_prefix) } +/// Search EFI vendor directories for a specific binary. +/// +/// UEFI convention: each OS vendor installs its bootloader under +/// `EFI//` (e.g., `EFI/fedora/`, `EFI/azurelinux/`). +/// This function searches all subdirectories of the EFI directory +/// for the specified binary, skipping the BOOT fallback directory. +/// +/// Vendor dirs are iterated in sorted (lexicographic) order so the +/// selection is reproducible across builds when more than one vendor +/// directory contains a candidate. `read_dir` order alone is +/// filesystem-dependent (ext4 returns hash order, FAT returns +/// directory-entry order), which would produce irreproducible ESP +/// images on cross-builds and break attestation/PCR lock for the +/// selected bootloader. +fn find_efi_binary_in_vendor_dirs(efi_dir: &Path, binary_name: &str) -> Option { + let entries = match std::fs::read_dir(efi_dir) { + Ok(e) => e, + Err(e) => { + debug!("Cannot read EFI directory '{}': {}", efi_dir.display(), e); + return None; + } + }; + + // Materialize entries first so we can sort, and so a per-entry + // iterator error is logged instead of silently dropped. 
+ let mut paths: Vec = Vec::new(); + for entry in entries { + match entry { + Ok(e) => paths.push(e.path()), + Err(e) => warn!( + "Failed to read entry under EFI directory '{}': {}", + efi_dir.display(), + e + ), + } + } + paths.sort(); + + for path in paths { + if !path.is_dir() { + continue; + } + + // Skip the BOOT directory (already checked by the caller) + if let Some(name) = path.file_name().and_then(|n| n.to_str()) { + if name.eq_ignore_ascii_case("BOOT") { + continue; + } + } + + let candidate = path.join(binary_name); + if candidate.exists() && candidate.is_file() { + debug!( + "Found GRUB EFI executable in vendor directory: '{}'", + candidate.display() + ); + return Some(candidate); + } + } + + None +} + /// Generates a list of filepaths to the boot files that need to be copied to implement file-based /// update of ESP, relative to the mounted directory. /// @@ -642,24 +721,35 @@ fn generate_boot_filepaths(temp_mount_dir: &Path, is_uki: bool) -> Result Date: Wed, 3 Jun 2026 15:14:01 -0700 Subject: [PATCH 04/26] engineering: Clean up ESP noprefix check and grub search comments - Remove redundant ensure!(grub_noprefix) check from ESP setup. generate_boot_filepaths() already finds a working GRUB binary (noprefix, standard, or vendor-dir). The separate policy check was redundant. 
- Simplify copy_boot_files to return () instead of bool - Attribute grub search format variants to distro conventions (AZL3/Mariner vs AZL4/Fedora), not MIC internals - Update mixed-forms test comment to reference cross-version A/B update scenario Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- crates/osutils/src/grub.rs | 14 ++++--- crates/trident/src/subsystems/esp.rs | 58 ++++++---------------------- 2 files changed, 20 insertions(+), 52 deletions(-) diff --git a/crates/osutils/src/grub.rs b/crates/osutils/src/grub.rs index dea58f2dd..c97183616 100644 --- a/crates/osutils/src/grub.rs +++ b/crates/osutils/src/grub.rs @@ -236,9 +236,9 @@ impl GrubConfig { /// /// 1. The upstream legacy form: `search -n -u -s` /// 2. AZL3 / standard form: `search --no-floppy --fs-uuid --set=root ` - /// 3. AZL4 MIC-generated form: `search --fs-uuid --set=root ` - /// (the `--no-floppy` option is redundant on EFI machines, so AZL4's - /// grub stub omits it.) + /// 3. AZL4 / Fedora-based form: `search --fs-uuid --set=root ` + /// (`--no-floppy` is a Mariner-specific convention; Fedora's grub2 + /// scripts don't emit it, and it's redundant on EFI machines.) /// /// We rewrite *every* matching line with the corresponding form so that /// stubs containing more than one variant (rare but possible during @@ -1006,7 +1006,7 @@ mod tests { #[test] fn test_update_search_azl4_form() { - // AZL4 MIC-generated stubs omit --no-floppy. + // AZL4 (Fedora-based) stubs omit --no-floppy. let mut grub_config = GrubConfig { path: PathBuf::new(), contents: indoc::indoc! { r#" @@ -1030,8 +1030,10 @@ mod tests { #[test] fn test_update_search_mixed_forms() { - // If both AZL3 and AZL4 forms appear (e.g. an image whose stub - // includes vendored fragments), both must be rewritten. + // Validates that all three regex paths fire independently. While a + // single grub stub typically contains one search form, cross-version + // A/B updates (e.g. 
AZL3->AZL4) may leave different formats across + // the boot and ESP grub configs over the machine's lifecycle. let mut grub_config = GrubConfig { path: PathBuf::new(), contents: indoc::indoc! { r#" diff --git a/crates/trident/src/subsystems/esp.rs b/crates/trident/src/subsystems/esp.rs index b7d16dc3c..1c81d4187 100644 --- a/crates/trident/src/subsystems/esp.rs +++ b/crates/trident/src/subsystems/esp.rs @@ -5,7 +5,7 @@ use std::{ path::{Path, PathBuf}, }; -use anyhow::{bail, ensure, Context, Error}; +use anyhow::{bail, Context, Error}; use log::{debug, trace, warn}; use reqwest::Url; use tempfile::{NamedTempFile, TempDir}; @@ -19,7 +19,7 @@ use osutils::{ use trident_api::{ config::UefiFallbackMode, constants::{ - internal_params::{DISABLE_GRUB_NOPREFIX_CHECK, RAW_COSI_STORAGE}, + internal_params::RAW_COSI_STORAGE, EFI_DEFAULT_BIN_DIRECTORY, EFI_DEFAULT_BIN_RELATIVE_PATH, ESP_EFI_DIRECTORY, GRUB2_CONFIG_FILENAME, GRUB2_CONFIG_RELATIVE_PATH, }, @@ -277,12 +277,11 @@ fn copy_file_artifacts( } // Call helper func to copy boot files from temp_mount_dir to esp_dir_path - let grub_noprefix = - copy_boot_files(temp_mount_dir, &esp_dir_path, boot_files).context(format!( - "Failed to copy boot files from directory {} to directory {}", - temp_mount_dir.display(), - esp_dir_path.display() - ))?; + copy_boot_files(temp_mount_dir, &esp_dir_path, boot_files).context(format!( + "Failed to copy boot files from directory {} to directory {}", + temp_mount_dir.display(), + esp_dir_path.display() + ))?; if ctx.is_uki().unstructured("UKI setting unknown")? { // Prepare ESP directory structure for UKI boot @@ -291,32 +290,8 @@ fn copy_file_artifacts( // Copy the UKI from the image into the ESP directory uki::stage_uki_on_esp(temp_mount_dir, mount_point, &ctx.esp_mount_path)?; } else { - // In non-UKI mode, bail if grub_noprefix.efi is not found in the image. 
- // AZL4+ does not ship grub2-efi-binary-noprefix (AZL3-specific convention), - // so automatically skip this check for AZL4 and later. `is_azl4_or_later` - // handles AZL5+ correctly by re-checking version_id when the parser - // falls back to AzureLinuxRelease::Other. - // TODO: Two sources of truth for "noprefix not required" exist now: - // - this distro check - // - the filesystem probe in generate_boot_filepaths - // The probe is authoritative. Consider folding the check into the - // probe result (e.g. ensure! that *some* grub binary was found, - // not specifically the noprefix variant) in a follow-up. See - // 2026-05-18 PR-2 deep-review.md. - let image_os_release = ctx.image_os_release(); - let is_azl4_or_later = image_os_release - .get_distro() - .is_azl4_or_later(image_os_release.version_id.as_deref()); - ensure!( - grub_noprefix - || is_azl4_or_later - || ctx - .spec - .internal_params - .get_flag(DISABLE_GRUB_NOPREFIX_CHECK), - "Cannot locate {GRUB_NOPREFIX_EFI} in the boot image. \ - Verify if the grub2-efi-binary-noprefix package was installed on the booted image.", - ); + // generate_boot_filepaths already found a working GRUB binary + // (noprefix, standard, or vendor-dir). No further check needed. } Ok(()) @@ -573,9 +548,7 @@ fn copy_boot_files( temp_mount_dir: &Path, esp_dir: &Path, boot_files: Vec, -) -> Result { - // Track whether grub-noprefix.efi is used - let mut no_prefix = false; +) -> Result<(), Error> { // Copy the specified files from temp_mount_path to esp_dir_path for boot_file in boot_files.iter() { let source_path = temp_mount_dir.join(boot_file); @@ -614,11 +587,10 @@ fn copy_boot_files( .context("Failed to convert path to string")?, ) .context("Failed to rename grub-noprefix efi")?; - no_prefix = true; } } - Ok(no_prefix) + Ok(()) } /// Search EFI vendor directories for a specific binary. 
@@ -1406,13 +1378,7 @@ mod tests { // Call helper func to create mock boot files in temp_mount_dir create_boot_files(temp_mount_dir.path(), &file_names, "test-content"); // Call helper func to copy boot files from temp_mount_dir to esp_dir - let noprefix = - copy_boot_files(temp_mount_dir.path(), esp_dir.path(), file_names.clone()).unwrap(); - - assert!( - noprefix, - "grub-noprefix.efi is in the list of files, so it should be detected" - ); + copy_boot_files(temp_mount_dir.path(), esp_dir.path(), file_names.clone()).unwrap(); for file_name in file_names.clone() { // Create full path of source_path From bb2fd89905638529632c44c39c6157073252113c Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Wed, 3 Jun 2026 15:17:21 -0700 Subject: [PATCH 05/26] engineering: Remove unused is_azl4_or_later helper No callers remain after the noprefix check removal. Can be re-added if a future change needs version-range gating. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- crates/osutils/src/osrelease.rs | 31 ------------------------------- 1 file changed, 31 deletions(-) diff --git a/crates/osutils/src/osrelease.rs b/crates/osutils/src/osrelease.rs index 5d8caafe2..c39981c6f 100644 --- a/crates/osutils/src/osrelease.rs +++ b/crates/osutils/src/osrelease.rs @@ -353,37 +353,6 @@ impl Distro { self == &Distro::AzureLinux(AzureLinuxRelease::AzL4) } - /// Returns true for AZL4 and any later Azure Linux release. - /// - /// Use this when gating behavior on features that landed in AZL4 and - /// are expected to remain present in subsequent major releases (e.g. - /// AZL4 dropped the `grub2-efi-binary-noprefix` packaging convention; - /// AZL5+ is expected to keep that change). Strict `is_azl4()` would - /// silently regress to the AZL3 code path when AZL5 ships. - /// - /// The decision is based on the `AzureLinuxRelease` ordering AND, for - /// versions newer than what the parser recognizes, the numeric major - /// component of `version_id`. 
New major releases that the parser - /// hasn't been taught yet will fall through to `AzureLinuxRelease::Other`, - /// so we re-check `version_id` directly. - pub fn is_azl4_or_later(&self, version_id: Option<&str>) -> bool { - if let Distro::AzureLinux(rel) = self { - if matches!(rel, AzureLinuxRelease::AzL4) { - return true; - } - // Parser doesn't know this version yet; inspect version_id. - if matches!(rel, AzureLinuxRelease::Other) { - if let Some(major) = version_id - .and_then(|v| v.split('.').next()) - .and_then(|m| m.parse::().ok()) - { - return major >= 4; - } - } - } - false - } - pub fn is_acl(&self) -> bool { self == &Distro::AzureContainerLinux } From 2411dd9f644c95fc8686e0094d20d2f1ae7dd90f Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Wed, 3 Jun 2026 16:16:00 -0700 Subject: [PATCH 06/26] engineering: Restore AZL3 noprefix guard as distro-specific check AZL3 ships two GRUB variants: grub2-efi-binary (prefix-relative config lookup) and grub2-efi-binary-noprefix (root-device-relative lookup). Trident's A/B update path requires the noprefix variant on AZL3. Restore the noprefix check, but scope it to AZL3 only using image_distro().is_azl3(). AZL4+ uses standard grubx64.efi in vendor directories and does not need noprefix. This replaces the previous generic ensure! + DISABLE_GRUB_NOPREFIX_CHECK flag with a targeted distro check. No escape hatch needed since the check only fires for AZL3. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- crates/trident/src/subsystems/esp.rs | 38 ++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/crates/trident/src/subsystems/esp.rs b/crates/trident/src/subsystems/esp.rs index 1c81d4187..e0d7afc0a 100644 --- a/crates/trident/src/subsystems/esp.rs +++ b/crates/trident/src/subsystems/esp.rs @@ -277,11 +277,12 @@ fn copy_file_artifacts( } // Call helper func to copy boot files from temp_mount_dir to esp_dir_path - copy_boot_files(temp_mount_dir, &esp_dir_path, boot_files).context(format!( - "Failed to copy boot files from directory {} to directory {}", - temp_mount_dir.display(), - esp_dir_path.display() - ))?; + let used_noprefix = + copy_boot_files(temp_mount_dir, &esp_dir_path, boot_files).context(format!( + "Failed to copy boot files from directory {} to directory {}", + temp_mount_dir.display(), + esp_dir_path.display() + ))?; if ctx.is_uki().unstructured("UKI setting unknown")? { // Prepare ESP directory structure for UKI boot @@ -289,9 +290,16 @@ fn copy_file_artifacts( // Copy the UKI from the image into the ESP directory uki::stage_uki_on_esp(temp_mount_dir, mount_point, &ctx.esp_mount_path)?; - } else { - // generate_boot_filepaths already found a working GRUB binary - // (noprefix, standard, or vendor-dir). No further check needed. + } else if ctx.image_distro().is_azl3() && !used_noprefix { + // AZL3 ships two GRUB variants: grub2-efi-binary (prefix-relative + // config lookup) and grub2-efi-binary-noprefix (root-device-relative + // config lookup). Trident's A/B update path requires the noprefix + // variant. If the image shipped the wrong one, fail early rather + // than producing an unbootable machine. + bail!( + "AZL3 image does not contain {GRUB_NOPREFIX_EFI}. \ + Trident requires the grub2-efi-binary-noprefix package on AZL3." 
+ ); } Ok(()) @@ -548,7 +556,8 @@ fn copy_boot_files( temp_mount_dir: &Path, esp_dir: &Path, boot_files: Vec, -) -> Result<(), Error> { +) -> Result { + let mut used_noprefix = false; // Copy the specified files from temp_mount_path to esp_dir_path for boot_file in boot_files.iter() { let source_path = temp_mount_dir.join(boot_file); @@ -587,10 +596,11 @@ fn copy_boot_files( .context("Failed to convert path to string")?, ) .context("Failed to rename grub-noprefix efi")?; + used_noprefix = true; } } - Ok(()) + Ok(used_noprefix) } /// Search EFI vendor directories for a specific binary. @@ -1378,7 +1388,13 @@ mod tests { // Call helper func to create mock boot files in temp_mount_dir create_boot_files(temp_mount_dir.path(), &file_names, "test-content"); // Call helper func to copy boot files from temp_mount_dir to esp_dir - copy_boot_files(temp_mount_dir.path(), esp_dir.path(), file_names.clone()).unwrap(); + let used_noprefix = + copy_boot_files(temp_mount_dir.path(), esp_dir.path(), file_names.clone()).unwrap(); + + assert!( + used_noprefix, + "grub-noprefix.efi is in the list of files, so it should be detected" + ); for file_name in file_names.clone() { // Create full path of source_path From d5846c21aa7632df10560da32a9e07ba36212a34 Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Wed, 3 Jun 2026 16:22:31 -0700 Subject: [PATCH 07/26] fix: Restore grub_noprefix name and DISABLE_GRUB_NOPREFIX_CHECK flag Keep the original variable name and preserve the operator escape hatch. Minimize diff from upstream. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- crates/trident/src/subsystems/esp.rs | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/crates/trident/src/subsystems/esp.rs b/crates/trident/src/subsystems/esp.rs index e0d7afc0a..8bd900bb1 100644 --- a/crates/trident/src/subsystems/esp.rs +++ b/crates/trident/src/subsystems/esp.rs @@ -19,7 +19,7 @@ use osutils::{ use trident_api::{ config::UefiFallbackMode, constants::{ - internal_params::RAW_COSI_STORAGE, + internal_params::{DISABLE_GRUB_NOPREFIX_CHECK, RAW_COSI_STORAGE}, EFI_DEFAULT_BIN_DIRECTORY, EFI_DEFAULT_BIN_RELATIVE_PATH, ESP_EFI_DIRECTORY, GRUB2_CONFIG_FILENAME, GRUB2_CONFIG_RELATIVE_PATH, }, @@ -277,7 +277,7 @@ fn copy_file_artifacts( } // Call helper func to copy boot files from temp_mount_dir to esp_dir_path - let used_noprefix = + let grub_noprefix = copy_boot_files(temp_mount_dir, &esp_dir_path, boot_files).context(format!( "Failed to copy boot files from directory {} to directory {}", temp_mount_dir.display(), @@ -290,7 +290,10 @@ fn copy_file_artifacts( // Copy the UKI from the image into the ESP directory uki::stage_uki_on_esp(temp_mount_dir, mount_point, &ctx.esp_mount_path)?; - } else if ctx.image_distro().is_azl3() && !used_noprefix { + } else if ctx.image_distro().is_azl3() + && !grub_noprefix + && !ctx.spec.internal_params.get_flag(DISABLE_GRUB_NOPREFIX_CHECK) + { // AZL3 ships two GRUB variants: grub2-efi-binary (prefix-relative // config lookup) and grub2-efi-binary-noprefix (root-device-relative // config lookup). 
Trident's A/B update path requires the noprefix @@ -557,7 +560,7 @@ fn copy_boot_files( esp_dir: &Path, boot_files: Vec, ) -> Result { - let mut used_noprefix = false; + let mut no_prefix = false; // Copy the specified files from temp_mount_path to esp_dir_path for boot_file in boot_files.iter() { let source_path = temp_mount_dir.join(boot_file); @@ -596,11 +599,11 @@ fn copy_boot_files( .context("Failed to convert path to string")?, ) .context("Failed to rename grub-noprefix efi")?; - used_noprefix = true; + no_prefix = true; } } - Ok(used_noprefix) + Ok(no_prefix) } /// Search EFI vendor directories for a specific binary. From 5ad0c6a3dc97fb9db1b557183bd973d169ee0377 Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Wed, 3 Jun 2026 16:46:25 -0700 Subject: [PATCH 08/26] fix: Use ensure! instead of bail for noprefix check Keep the same macro as upstream to minimize diff. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- crates/trident/src/subsystems/esp.rs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/crates/trident/src/subsystems/esp.rs b/crates/trident/src/subsystems/esp.rs index 8bd900bb1..ae90c8512 100644 --- a/crates/trident/src/subsystems/esp.rs +++ b/crates/trident/src/subsystems/esp.rs @@ -5,7 +5,7 @@ use std::{ path::{Path, PathBuf}, }; -use anyhow::{bail, Context, Error}; +use anyhow::{bail, ensure, Context, Error}; use log::{debug, trace, warn}; use reqwest::Url; use tempfile::{NamedTempFile, TempDir}; @@ -290,18 +290,20 @@ fn copy_file_artifacts( // Copy the UKI from the image into the ESP directory uki::stage_uki_on_esp(temp_mount_dir, mount_point, &ctx.esp_mount_path)?; - } else if ctx.image_distro().is_azl3() - && !grub_noprefix - && !ctx.spec.internal_params.get_flag(DISABLE_GRUB_NOPREFIX_CHECK) - { + } else if ctx.image_distro().is_azl3() { // AZL3 ships two GRUB variants: grub2-efi-binary (prefix-relative // config lookup) and grub2-efi-binary-noprefix (root-device-relative // config 
lookup). Trident's A/B update path requires the noprefix // variant. If the image shipped the wrong one, fail early rather // than producing an unbootable machine. - bail!( - "AZL3 image does not contain {GRUB_NOPREFIX_EFI}. \ - Trident requires the grub2-efi-binary-noprefix package on AZL3." + ensure!( + grub_noprefix + || ctx + .spec + .internal_params + .get_flag(DISABLE_GRUB_NOPREFIX_CHECK), + "Cannot locate {GRUB_NOPREFIX_EFI} in the boot image. \ + Verify if the grub2-efi-binary-noprefix package was installed on the booted image.", ); } From 74ead34bc49c17544726c0982e3c845c46950fee Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Wed, 3 Jun 2026 16:48:57 -0700 Subject: [PATCH 09/26] fix: Revert replace_all back to replace in update_search Keep the original if/else if chain with replace (first match). No real-world grub config has multiple search lines. Minimizes diff from upstream. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- crates/osutils/src/grub.rs | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/crates/osutils/src/grub.rs b/crates/osutils/src/grub.rs index c97183616..f55476b82 100644 --- a/crates/osutils/src/grub.rs +++ b/crates/osutils/src/grub.rs @@ -239,43 +239,30 @@ impl GrubConfig { /// 3. AZL4 / Fedora-based form: `search --fs-uuid --set=root ` /// (`--no-floppy` is a Mariner-specific convention; Fedora's grub2 /// scripts don't emit it, and it's redundant on EFI machines.) - /// - /// We rewrite *every* matching line with the corresponding form so that - /// stubs containing more than one variant (rare but possible during - /// distribution transitions) all get the new UUID. We bail only if no - /// regex matched any line. 
pub fn update_search(&mut self, uuid: &Uuid) -> Result<(), Error> { let re = Regex::new(r"(?m)^(\s*)search -n -u [\w-]+ -s$").unwrap(); let re2 = Regex::new(r"(?m)^(\s*)search --no-floppy --fs-uuid --set=root [\w-]+$").unwrap(); let re3 = Regex::new(r"(?m)^(\s*)search --fs-uuid --set=root [\w-]+$").unwrap(); - let mut matched = false; if re.is_match(&self.contents) { self.contents = re - .replace_all(&self.contents, &format!("${{1}}search -n -u {uuid} -s")) + .replace(&self.contents, &format!("${{1}}search -n -u {uuid} -s")) .to_string(); - matched = true; - } - if re2.is_match(&self.contents) { + } else if re2.is_match(&self.contents) { self.contents = re2 - .replace_all( + .replace( &self.contents, &format!("${{1}}search --no-floppy --fs-uuid --set=root {uuid}"), ) .to_string(); - matched = true; - } - if re3.is_match(&self.contents) { + } else if re3.is_match(&self.contents) { self.contents = re3 - .replace_all( + .replace( &self.contents, &format!("${{1}}search --fs-uuid --set=root {uuid}"), ) .to_string(); - matched = true; - } - - if !matched { + } else { bail!( "Unable to find search command in '{}'", &self.path.display() From ed333bf91e76ad1a8fc955ae3221a2e521b4bd4c Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Wed, 3 Jun 2026 16:52:52 -0700 Subject: [PATCH 10/26] fix: Restore original test variable name noprefix Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- crates/trident/src/subsystems/esp.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/trident/src/subsystems/esp.rs b/crates/trident/src/subsystems/esp.rs index ae90c8512..1ba98ea41 100644 --- a/crates/trident/src/subsystems/esp.rs +++ b/crates/trident/src/subsystems/esp.rs @@ -1393,11 +1393,11 @@ mod tests { // Call helper func to create mock boot files in temp_mount_dir create_boot_files(temp_mount_dir.path(), &file_names, "test-content"); // Call helper func to copy boot files from temp_mount_dir to esp_dir - let used_noprefix = + let 
noprefix = copy_boot_files(temp_mount_dir.path(), esp_dir.path(), file_names.clone()).unwrap(); assert!( - used_noprefix, + noprefix, "grub-noprefix.efi is in the list of files, so it should be detected" ); From 550ff11ba90a5876bdbf4443f983b1249df4f806 Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Wed, 3 Jun 2026 17:52:24 -0700 Subject: [PATCH 11/26] fix: Remove mixed-forms test incompatible with if/else if chain Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- crates/osutils/src/grub.rs | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/crates/osutils/src/grub.rs b/crates/osutils/src/grub.rs index f55476b82..352064bee 100644 --- a/crates/osutils/src/grub.rs +++ b/crates/osutils/src/grub.rs @@ -1015,34 +1015,6 @@ mod tests { assert!(!grub_config.contents.contains("--no-floppy")); } - #[test] - fn test_update_search_mixed_forms() { - // Validates that all three regex paths fire independently. While a - // single grub stub typically contains one search form, cross-version - // A/B updates (e.g. AZL3->AZL4) may leave different formats across - // the boot and ESP grub configs over the machine's lifecycle. - let mut grub_config = GrubConfig { - path: PathBuf::new(), - contents: indoc::indoc! 
{ r#" - search --no-floppy --fs-uuid --set=root oldoldold-cafe-babe-0000-aaaabbbbcccc - search --fs-uuid --set=root oldoldold-cafe-babe-0000-aaaabbbbcccc - "# } - .to_owned(), - linux_command_line: None, - }; - - let new_uuid = Uuid::parse_str("9e6a9d2c-b7fe-4359-ac45-18b505e29d8c").unwrap(); - grub_config.update_search(&new_uuid).unwrap(); - - assert!(!grub_config.contents.contains("oldoldold")); - assert!(grub_config.contents.contains(&format!( - "search --no-floppy --fs-uuid --set=root {new_uuid}" - ))); - assert!(grub_config - .contents - .contains(&format!("search --fs-uuid --set=root {new_uuid}"))); - } - #[test] fn test_update_rootdevice() { // Define original GRUB config contents on target machine From afb7a2679878eb96590b7ad6ca2c826dc8ab22a5 Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Fri, 5 Jun 2026 11:40:50 -0700 Subject: [PATCH 12/26] engineering: Add BLS entry support for grub boot arg extraction AZL4 (Fedora-based) uses Boot Loader Spec entries instead of inline linux commands in grub.cfg. When grub.cfg contains blscfg and no inline linux lines, fall back to reading boot args from /boot/loader/entries/*.conf. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- crates/osmodifier/src/grub_cfg.rs | 189 +++++++++++++++++++++++++++++- 1 file changed, 188 insertions(+), 1 deletion(-) diff --git a/crates/osmodifier/src/grub_cfg.rs b/crates/osmodifier/src/grub_cfg.rs index ade45dca9..4cd48dfbd 100644 --- a/crates/osmodifier/src/grub_cfg.rs +++ b/crates/osmodifier/src/grub_cfg.rs @@ -18,6 +18,10 @@ use crate::OsModifierContext; /// Possible grub.cfg locations, tried in order. const GRUB_CFG_PATHS: &[&str] = &["/boot/grub2/grub.cfg", "/boot/grub/grub.cfg"]; +/// BLS (Boot Loader Spec) entry directory. Fedora-based distros (including +/// AZL4) store kernel boot entries here instead of inline in grub.cfg. +const BLS_ENTRIES_DIR: &str = "/boot/loader/entries"; + /// Extract boot arguments from the generated grub.cfg. 
/// /// Returns a tuple of (args_to_sync, optional_root_device). @@ -37,7 +41,14 @@ pub fn extract_boot_args_from_grub_cfg( // Find the non-recovery linux command lines. // Go expects exactly one; error otherwise. - let linux_lines = find_non_recovery_linux_lines(&content)?; + let linux_lines = match find_non_recovery_linux_lines(&content) { + Ok(lines) => lines, + Err(_) if content.contains("blscfg") => { + debug!("grub.cfg uses BLS (blscfg); reading boot args from BLS entries"); + extract_options_from_bls_entries(ctx)? + } + Err(e) => return Err(e), + }; if linux_lines.len() != 1 { bail!( "expected 1 non-recovery linux line, found {}", @@ -94,6 +105,58 @@ fn find_grub_cfg(ctx: &OsModifierContext) -> Result { bail!("Could not find grub.cfg at any of: {:?}", GRUB_CFG_PATHS) } +/// Read boot arguments from BLS (Boot Loader Spec) entries. +/// +/// Scans `{root}/boot/loader/entries/*.conf`, skips entries whose title +/// contains "rescue" or "recovery" (case-insensitive), and returns the +/// `options` line from the first valid entry (sorted lexically, matching +/// grub's ordering). +fn extract_options_from_bls_entries(ctx: &OsModifierContext) -> Result, Error> { + let entries_dir = ctx.path(BLS_ENTRIES_DIR); + let mut conf_files: Vec = fs::read_dir(&entries_dir) + .with_context(|| format!("Failed to read BLS entries dir '{}'", entries_dir.display()))? 
+ .filter_map(|e| e.ok()) + .map(|e| e.path()) + .filter(|p| p.extension().map_or(false, |ext| ext == "conf")) + .collect(); + + conf_files.sort(); + + for conf_path in &conf_files { + let content = fs::read_to_string(conf_path) + .with_context(|| format!("Failed to read BLS entry '{}'", conf_path.display()))?; + + let mut title = None; + let mut options = None; + + for line in content.lines() { + if let Some(value) = line.strip_prefix("title") { + title = Some(value.trim().to_string()); + } else if let Some(value) = line.strip_prefix("options") { + options = Some(value.trim().to_string()); + } + } + + // Skip recovery/rescue entries. + if let Some(ref t) = title { + let lower = t.to_lowercase(); + if lower.contains("rescue") || lower.contains("recovery") { + trace!("Skipping BLS rescue/recovery entry: {}", conf_path.display()); + continue; + } + } + + if let Some(opts) = options { + debug!("Using BLS entry '{}': options = {opts}", conf_path.display()); + // Return as a synthetic "linux" line: prepend a dummy kernel path + // so the downstream parser (which skips the first token) works. + return Ok(vec![format!("/boot/vmlinuz {opts}")]); + } + } + + bail!("no non-recovery BLS entry found in '{}'", entries_dir.display()) +} + /// Return the first whitespace-delimited word from a line, or None if the /// line is empty / whitespace-only. fn first_word(line: &str) -> Option<&str> { @@ -757,4 +820,128 @@ mod tests { assert_eq!(count_braces("menuentry 'title {x}' {"), (1, 0)); assert_eq!(count_braces(r#"menuentry "title {x}" {"#), (1, 0)); } + + // ======================= BLS entry support ======================= + + #[test] + fn test_extract_bls_fallback() { + let tmp = tempdir().unwrap(); + + // Write a BLS-style grub.cfg (contains blscfg, no inline linux lines) + let grub_dir = tmp.path().join("boot/grub2"); + std::fs::create_dir_all(&grub_dir).unwrap(); + std::fs::write( + grub_dir.join("grub.cfg"), + indoc::indoc! 
{r#" + set timeout=5 + load_env -f /boot/grub2/grubenv + blscfg + "#}, + ) + .unwrap(); + + // Write a BLS entry + let bls_dir = tmp.path().join("boot/loader/entries"); + std::fs::create_dir_all(&bls_dir).unwrap(); + std::fs::write( + bls_dir.join("azl4.conf"), + indoc::indoc! {r#" + title Azure Linux 4.0 (6.6.60) + version 6.6.60 + linux /boot/vmlinuz-6.6.60 + initrd /boot/initramfs-6.6.60.img + options root=/dev/sda2 ro selinux=1 rd.overlayfs=lower,upper,work,/dev/sda5 + "#}, + ) + .unwrap(); + + let ctx = OsModifierContext { + root: tmp.path().to_path_buf(), + }; + + let (args, root_device) = extract_boot_args_from_grub_cfg(&ctx).unwrap(); + assert_eq!(root_device, Some("/dev/sda2".to_string())); + assert!(args.contains(&"selinux=1".to_string())); + } + + #[test] + fn test_extract_bls_skips_recovery() { + let tmp = tempdir().unwrap(); + + let grub_dir = tmp.path().join("boot/grub2"); + std::fs::create_dir_all(&grub_dir).unwrap(); + std::fs::write( + grub_dir.join("grub.cfg"), + "set timeout=5\nblscfg\n", + ) + .unwrap(); + + let bls_dir = tmp.path().join("boot/loader/entries"); + std::fs::create_dir_all(&bls_dir).unwrap(); + + // Rescue entry (should be skipped) + std::fs::write( + bls_dir.join("rescue.conf"), + indoc::indoc! {r#" + title Azure Linux 4.0 rescue + version 6.6.60 + linux /boot/vmlinuz-6.6.60 + initrd /boot/initramfs-6.6.60.img + options root=/dev/sda2 ro single + "#}, + ) + .unwrap(); + + // Normal entry (should be used) + std::fs::write( + bls_dir.join("zzz-normal.conf"), + indoc::indoc! 
{r#" + title Azure Linux 4.0 (6.6.60) + version 6.6.60 + linux /boot/vmlinuz-6.6.60 + initrd /boot/initramfs-6.6.60.img + options root=/dev/sda2 ro selinux=1 + "#}, + ) + .unwrap(); + + let ctx = OsModifierContext { + root: tmp.path().to_path_buf(), + }; + + let (args, root_device) = extract_boot_args_from_grub_cfg(&ctx).unwrap(); + assert_eq!(root_device, Some("/dev/sda2".to_string())); + assert!(args.contains(&"selinux=1".to_string())); + // "single" from rescue entry should NOT appear + assert!(!args.iter().any(|a| a.contains("single"))); + } + + #[test] + fn test_extract_bls_no_entries() { + let tmp = tempdir().unwrap(); + + let grub_dir = tmp.path().join("boot/grub2"); + std::fs::create_dir_all(&grub_dir).unwrap(); + std::fs::write( + grub_dir.join("grub.cfg"), + "set timeout=5\nblscfg\n", + ) + .unwrap(); + + // Empty BLS entries dir + let bls_dir = tmp.path().join("boot/loader/entries"); + std::fs::create_dir_all(&bls_dir).unwrap(); + + let ctx = OsModifierContext { + root: tmp.path().to_path_buf(), + }; + + let result = extract_boot_args_from_grub_cfg(&ctx); + assert!(result.is_err()); + let err_msg = result.unwrap_err().to_string(); + assert!( + err_msg.contains("no non-recovery BLS entry found"), + "Error should mention no BLS entries, got: {err_msg}" + ); + } } From 75f8095a471428ab2998ca7cb2b370498b5c0014 Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Fri, 5 Jun 2026 12:37:00 -0700 Subject: [PATCH 13/26] fix: Apply rustfmt to BLS support code Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- crates/osmodifier/src/grub_cfg.rs | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/crates/osmodifier/src/grub_cfg.rs b/crates/osmodifier/src/grub_cfg.rs index 4cd48dfbd..19a9f3bd2 100644 --- a/crates/osmodifier/src/grub_cfg.rs +++ b/crates/osmodifier/src/grub_cfg.rs @@ -141,20 +141,29 @@ fn extract_options_from_bls_entries(ctx: &OsModifierContext) -> Result Date: Tue, 2 Jun 2026 17:40:14 -0700 
Subject: [PATCH 14/26] infra: Add AZL4 builder infrastructure and image acquisition Adds AZL4 build pipeline stages with MCR-hosted MIC container, BlobImageManifest class for ACG blob source downloads, and service connection runbook. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .gitattributes | 22 ++ .gitignore | 5 +- .../stages/build_image/build-image-azl4.yml | 79 ++++++ .../build_image/build-image-template-azl4.yml | 165 +++++++++++++ tests/images/SERVICE-CONNECTION-RUNBOOK.md | 225 ++++++++++++++++++ tests/images/builder/__init__.py | 41 +++- tests/images/builder/cli.py | 22 +- tests/images/builder/download.py | 145 ++++++++++- tests/images/builder/run.py | 14 +- tests/images/testimages.py | 59 +++++ 10 files changed, 769 insertions(+), 8 deletions(-) create mode 100644 .gitattributes create mode 100644 .pipelines/templates/stages/build_image/build-image-azl4.yml create mode 100644 .pipelines/templates/stages/build_image/build-image-template-azl4.yml create mode 100644 tests/images/SERVICE-CONNECTION-RUNBOOK.md diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 000000000..0a680fcc4 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,22 @@ +* text=auto eol=lf + +# Anything that gets executed inside an image must keep LF endings; CRLF +# on shebang lines breaks the interpreter lookup with `bad interpreter: +# /bin/bash^M`. +*.sh text eol=lf +*.py text eol=lf +*.service text eol=lf +*.network text eol=lf +*.yaml text eol=lf +*.yml text eol=lf + +# Binary artifacts — never normalize. 
+*.vhdx binary +*.cosi binary +*.qcow2 binary +*.iso binary +*.raw binary +*.png binary +*.jpg binary +*.zst binary +*.patch text eol=lf diff --git a/.gitignore b/.gitignore index e7d3febb7..a8fd85236 100644 --- a/.gitignore +++ b/.gitignore @@ -366,4 +366,7 @@ vendor/ # Virtdeploy files /tools/vm-netlaunch.yaml -/tools/virt-deploy-metadata.json \ No newline at end of file +/tools/virt-deploy-metadata.json +# AZL4 trident binary baked into test image (built locally) +tests/images/trident-vm-testimage/base/trident-bin/ +tests/images/trident-vm-testimage/base/osmodifier-bin/ diff --git a/.pipelines/templates/stages/build_image/build-image-azl4.yml b/.pipelines/templates/stages/build_image/build-image-azl4.yml new file mode 100644 index 000000000..0fae10eb2 --- /dev/null +++ b/.pipelines/templates/stages/build_image/build-image-azl4.yml @@ -0,0 +1,79 @@ +# AZL4 variant of build-image.yml. +# +# Forked from build-image.yml on 2026-05-13. Calls build-image-template-azl4.yml +# (which uses MCR MIC container + blob-sourced base VHDX) instead of the +# external test-images repo template. +# +# TODO(azl4-merge-back): Merge this back into build-image.yml with an +# `azureLinuxVersion` parameter switch once AZL4 has feed-published base VHDXes +# and RPMs. + +parameters: + - name: imageName + type: string + + - name: clones + displayName: "Number of clones to generate" + type: number + default: 2 + + - name: dependsOnTrident + type: boolean + default: true + + - name: dependsOnStage + type: string + default: "" + +stages: + - stage: TridentTestImg_${{ replace(parameters.imageName, '-', '_') }} + displayName: Build ${{ parameters.imageName }} + ${{ if parameters.dependsOnTrident }}: + dependsOn: + # AZL4 doesn't have RPM publication so we depend on the + # trident-binaries artifact (which the GetTridentBinaries stage + # produces and copies to artifacts/binaries/trident). + - GetTridentBinaries_rpms_amd64 + # PrepareSSHKeys produces the shared 'ssh-keys' artifact. 
+ # build-image-template-azl4.yml stages it into the testimage + # tree so qcow2 + cosi builds share the same SSH keypair, + # which lets storm-trident SSH into both A/B sides after + # update. + - PrepareSSHKeys + - ${{ if ne(parameters.dependsOnStage, '') }}: + - ${{ parameters.dependsOnStage }} + ${{ elseif ne(parameters.dependsOnStage, '') }}: + dependsOn: + - PrepareSSHKeys + - ${{ parameters.dependsOnStage }} + + jobs: + - job: BuildTridentTestImgAzl4 + displayName: Build (AZL4 MIC) + # Pinned MIC container build adds ~5 min cold-cache. Bump the timeout + # accordingly. TODO(azl4-release): lower back to 20 min once we use a + # released MIC container. + timeoutInMinutes: 30 + pool: + type: linux + + variables: + ob_outputDirectory: /tmp/output + ob_artifactBaseName: ${{ parameters.imageName }} + + steps: + - template: ../common_tasks/checkout_trident.yml + + - task: DownloadPipelineArtifact@2 + inputs: + buildType: current + artifactName: trident-binaries + targetPath: "$(Build.ArtifactStagingDirectory)/trident-binaries" + displayName: Download Trident binaries + condition: eq('${{ parameters.dependsOnTrident }}', true) + + - template: build-image-template-azl4.yml + parameters: + tridentSourceDirectory: $(TRIDENT_SOURCE_DIR) + imageName: ${{ parameters.imageName }} + clones: ${{ parameters.clones }} diff --git a/.pipelines/templates/stages/build_image/build-image-template-azl4.yml b/.pipelines/templates/stages/build_image/build-image-template-azl4.yml new file mode 100644 index 000000000..77f26a7c4 --- /dev/null +++ b/.pipelines/templates/stages/build_image/build-image-template-azl4.yml @@ -0,0 +1,165 @@ +# AZL4 variant of build-image-template.yml. +# +# Forked from build-image-template.yml on 2026-05-13. The AZL3 path pulls the +# base VHDX from the AzureLinuxArtifacts ADO feed and the Trident RPM from the +# trident-binaries pipeline artifact, then runs `testimages.py build`. None of +# that works for AZL4 today because: +# +# 1. 
There is no AzureLinuxArtifacts feed entry for AZL4 base VHDX. We +# download from the AZL preview gallery's backing storage account +# (azlpubdev2mruiyvi/images-dev) instead. See the BlobImageManifest +# registration in tests/images/testimages.py. +# +# 2. There is no Trident RPM for AZL4. The binary is baked in via +# additionalFiles in tests/images/trident-vm-testimage/base/updateimg-grub-azl4.yaml. +# +# TODO(azl4-merge-back): When AZL4 has feed-published base VHDXes and RPMs, +# fold this template back into build-image-template.yml by adding a +# `azureLinuxVersion: "4.0"` branch. + +parameters: + - name: tridentSourceDirectory + type: string + + - name: imageName + type: string + + - name: clones + type: number + default: 1 + displayName: Number of clones to create + + # The AZL4 base VHDX is sourced from the Azure Linux preview gallery's + # backing storage account. The pipeline service connection at + # $(BLOB_SERVICE_CONNECTION) must have `Storage Blob Data Reader` on + # this account. See tests/images/SERVICE-CONNECTION-RUNBOOK.md. + - name: blobStorageAccount + type: string + default: "azlpubdev2mruiyvi" + + - name: blobContainer + type: string + default: "images-dev" + + - name: blobSubscription + type: string + # Subscription where the storage account lives. The SC's default + # subscription may differ — we explicitly set context before download. + default: "e4ab81f8-030f-4593-a8f2-3ea2c7630a19" + + - name: blobServiceConnection + type: string + # NB: this must be a service connection that exists in the ADO project. + # Trident infra needs to create it manually (Karhu can't); see the PR-5 + # follow-up validation report for the runbook. 
+ default: "trident-azl4-blob-reader" + + - name: micContainerTag + type: string + default: "imagecustomizer:1.4.0-1" + +steps: + - template: ../common_tasks/avoid-pypi-usage.yml + + - template: common/sfi-enforce-isolation-with-etc-hosts.yaml@platform-pipelines + + # Stage the Trident binary that gets baked into the COSI via additionalFiles. + # The trident-binaries artifact comes from the same upstream Trident build + # stage the AZL3 path uses; we just copy the binary rather than installing + # an RPM. + # + # TODO(azl4-rpm): replace this binary copy with an RPM install once the + # trident-service RPM is packaged for AZL4 (same TODO as in + # tests/images/testimages.py registration). + - bash: | + set -euxo pipefail + TRIDENT_BIN_SRC="$(Build.ArtifactStagingDirectory)/trident-binaries" + TRIDENT_BIN_DEST="${{ parameters.tridentSourceDirectory }}/tests/images/trident-vm-testimage/base/trident-bin" + + if [ ! -f "$TRIDENT_BIN_SRC/trident" ]; then + echo "trident binary not found at $TRIDENT_BIN_SRC/trident" + echo "Available artifacts:" + find "$TRIDENT_BIN_SRC" -type f 2>/dev/null | head -20 || true + exit 1 + fi + + mkdir -p "$TRIDENT_BIN_DEST" + cp "$TRIDENT_BIN_SRC/trident" "$TRIDENT_BIN_DEST/trident" + chmod +x "$TRIDENT_BIN_DEST/trident" + file "$TRIDENT_BIN_DEST/trident" + displayName: "Stage Trident binary into testimage tree" + workingDirectory: ${{ parameters.tridentSourceDirectory }} + + # Pull the released MIC container from MCR. AZL4 support is included + # in imagecustomizer >= 1.4.0. + - bash: | + set -euxo pipefail + docker pull "mcr.microsoft.com/azurelinux/${{ parameters.micContainerTag }}" + displayName: "Pull MIC container from MCR" + + # Stage the pipeline-wide SSH key into the testimage tree before + # MIC runs. testimages.py's generate_ssh_keys() generates a new + # keypair UNLESS files/id_rsa.pub already exists at the source path + # — in which case it reuses it. 
By dropping the shared key from the + # PrepareSSHKeys artifact here, both the qcow2 base build and the + # COSI build end up with the same key baked into testuser's + # authorized_keys, so storm-trident's A/B update test can SSH into + # both A-side and B-side after the update reboot. + # + # The matching private key lives at ssh-keys/id_rsa from the + # PrepareSSHKeys stage. storm-trident's rollback stage picks it up + # the same way for AZL3 builds. + - task: DownloadPipelineArtifact@2 + displayName: "Download shared SSH keys" + inputs: + buildType: current + artifactName: "ssh-keys" + targetPath: "$(Build.ArtifactStagingDirectory)/ssh-keys" + + - bash: | + set -euxo pipefail + SSH_PUB_SRC="$(Build.ArtifactStagingDirectory)/ssh-keys/id_rsa.pub" + SSH_PUB_DEST="${{ parameters.tridentSourceDirectory }}/tests/images/trident-vm-testimage/base/files/id_rsa.pub" + if [ ! -f "$SSH_PUB_SRC" ]; then + echo "shared SSH public key not found at $SSH_PUB_SRC" + find "$(Build.ArtifactStagingDirectory)/ssh-keys" -type f + exit 1 + fi + cp "$SSH_PUB_SRC" "$SSH_PUB_DEST" + echo "Staged shared SSH public key:" + cat "$SSH_PUB_DEST" + displayName: "Stage shared SSH key into testimage tree" + workingDirectory: ${{ parameters.tridentSourceDirectory }} + + # Download the AZL4 base VHDX from the preview gallery's backing storage. + # Authenticates via the federated identity attached to the service + # connection — no storage keys handled here. + # + # The SC's default subscription (Polar_ImageTools_Staging) differs from + # the storage account's subscription (ControlTower_Test). We must switch + # context so `az storage blob list` resolves the account correctly. 
+ - task: AzureCLI@2 + displayName: "Download AZL4 base VHDX from blob" + inputs: + azureSubscription: ${{ parameters.blobServiceConnection }} + scriptType: bash + scriptLocation: inlineScript + workingDirectory: ${{ parameters.tridentSourceDirectory }} + inlineScript: | + set -euxo pipefail + az account set --subscription "${{ parameters.blobSubscription }}" + python3 ./tests/images/testimages.py download-image azl4_qemu_guest \ + --blob-storage-account "${{ parameters.blobStorageAccount }}" \ + --blob-container "${{ parameters.blobContainer }}" + ls -la artifacts/azl4_qemu_guest.vhdx + + - bash: | + set -euxo pipefail + python3 ./tests/images/testimages.py build \ + "${{ parameters.imageName }}" \ + --container "${{ parameters.micContainerTag }}" \ + --output-dir "$(ob_outputDirectory)" \ + --no-download \ + --clones ${{ parameters.clones }} + displayName: "Build ${{ parameters.imageName }}" + workingDirectory: ${{ parameters.tridentSourceDirectory }} diff --git a/tests/images/SERVICE-CONNECTION-RUNBOOK.md b/tests/images/SERVICE-CONNECTION-RUNBOOK.md new file mode 100644 index 000000000..2a17d49d2 --- /dev/null +++ b/tests/images/SERVICE-CONNECTION-RUNBOOK.md @@ -0,0 +1,225 @@ +# ADO Service Connection Runbook — UAMI + Workload Identity Federation + +Step-by-step recipe for creating an ADO Azure Resource Manager service +connection authenticated by a User-Assigned Managed Identity (UAMI) via +Workload Identity Federation (WIF). This is the SFI-compliant pattern; no +secrets are stored anywhere. + +Adapted from Brian's wiki [Creating an ADO Service Connection authenticated +with UMI](https://dev.azure.com/mariner-org/mariner/_wiki/wikis/mariner.wiki/5697/Creating-an-ADO-Service-Connection-authenticated-with-UMI), +with the concrete commands and gotchas from setting up the +`trident-azl4-blob-reader` connection on 2026-05-14. 
+ +## What you end up with + +``` +Azure UAMI ─(federated)→ ADO Service Connection ─(used by)→ Pipeline + │ + └─(role assignment)→ Target Azure resource +``` + +The pipeline uses `AzureCLI@2` referencing the SC. ADO mints an OIDC token, +exchanges it for an Azure access token via the UAMI's federated credential, +and the pipeline gets an `az login`'d session with the UAMI's RBAC. + +## Prerequisites + +- **Azure:** Contributor on the resource group where you'll create the UAMI +- **Azure:** User Access Administrator or Owner on the target resource you're + granting access to (for the role assignment) +- **ADO:** Project Administrator on the project where the service connection + will live + +## Step 1 — Create the UAMI (Azure CLI) + +```powershell +$sub = "<subscription-id>" +$rg = "<resource-group>" +$loc = "<location>" # match siblings if reusing an RG +$umi = "<uami-name>" # naming convention: see notes below + +az account set -s $sub + +# Pre-flight: confirm UAMI doesn't already exist +az identity show -g $rg -n $umi 2>$null +# (should return nothing) + +# Create +az identity create -g $rg -n $umi -l $loc ` + --tags purpose=<purpose> owner=<owner> project=<project> +``` + +The output contains `clientId` (use as ADO's Application ID later) and +`principalId` (use as the role-assignment assignee). + +### Naming convention notes + +Match what's already in the RG. Examples from +`maritimus-github-runner` (b3e01d89... sub): + +- `maritimus-github-runner-umi-*` for GitHub Actions identities +- `maritimus-github-storage-ado-*-umi` for ADO pipeline identities + +When in doubt, ask the RG owner before deviating. + +## Step 2 — Grant the UAMI access to the target resource + +For the trident-azl4-blob-reader UAMI, the target was the +`azlpubdev2mruiyvi` storage account (backing the AZL preview gallery), +with `Storage Blob Data Reader` (least privilege — we only need to read +base VHDXes).
+ +```powershell +$objId = az identity show -g $rg -n $umi --query principalId -o tsv +$scope = "/subscriptions/$sub/resourceGroups/$rg/providers/<resource-provider>/<resource-type>/<resource-name>" + +az role assignment create ` + --assignee-object-id $objId ` + --assignee-principal-type ServicePrincipal ` + --role "<role-name>" ` + --scope $scope + +# Verify +az role assignment list --assignee $objId --all -o table +``` + +**Always use least privilege.** Don't pick `Owner` when `Reader` will do. + +## Step 3 — Start service connection in ADO (do NOT click Verify yet) + +In ADO project → Project Settings → Service Connections → New service +connection. + +| Field | Value | +|---|---| +| Connection type | **Azure Resource Manager** | +| Identity type | **App registration or managed identity (manual)** | +| Credential | **Workload Identity Federation** | +| Scope Level | **Subscription** | +| Subscription ID | `<subscription-id>` | +| Subscription Name | `<subscription-name>` | +| **Application (client) ID** | the UAMI's **clientId** from step 1 | +| Tenant ID | `72f988bf-86f1-41af-91ab-2d7cd011db47` (MSIT) | +| Service connection name | `<service-connection-name>` | +| Grant access permission to all pipelines | **uncheck** (see SFI note below) | + +After filling these in but **before saving**, ADO shows you: + +- **Issuer URL** +- **Subject identifier** + +Both are needed for step 4. Keep this ADO tab open. + +### Issuer/Subject gotcha — read them off the form + +⚠️ Do NOT guess these values. They are not the same as `vstoken.dev.azure.com/...` +that older service connections may show. ADO assigns a new pair when you +create the SC, and the issuer is the Entra tenant authority URL +(`https://login.microsoftonline.com/<tenant-id>/v2.0`), not the ADO token +issuer URL. The subject is opaque (looks like +`/eid1/c/pub/t/.../sc/.../`). + +Copy the exact strings from the ADO form into the FIC. Do not transcribe; +copy-paste.
+ +## Step 4 — Add the federated credential to the UAMI + +```powershell +$issuer = "" +$subject = "" + +az identity federated-credential create ` + -g $rg ` + --identity-name $umi ` + --name "" ` + --issuer "$issuer" ` + --subject "$subject" ` + --audiences "api://AzureADTokenExchange" + +# Verify +az identity federated-credential list -g $rg --identity-name $umi -o table +``` + +FIC name should describe the consumer. For ADO connections we use +`ado--` (e.g. `ado-ecf-trident-azl4-blob-reader`). + +## Step 5 — Verify and save in ADO + +Wait ~30 seconds for Entra to propagate the FIC, then return to the ADO +form and click **Verify and save**. + +### Common errors + +**`AADSTS70025: client has no configured federated identity credentials`** +- The FIC hasn't been added yet. Run step 4. + +**`AADSTS700211: No matching federated identity record found for presented +assertion issuer 'https://login.microsoftonline.com//v2.0'`** +- The FIC exists but the issuer or subject doesn't match what ADO is + presenting. Re-read the ADO form carefully (do not transcribe — copy). +- A common mistake is reusing the issuer URL from an unrelated existing + service connection. Each new SC may get its own issuer string. + +**Verify succeeds but pipeline fails with `You do not have the required +permissions...`** +- The role assignment in step 2 either targeted the wrong scope, or + Azure RBAC hasn't propagated yet (wait up to 10 minutes). Re-check that + `az role assignment list --assignee --all` shows the role + on the correct scope. + +## Step 6 — SFI compliance — restrict pipeline permissions + +[SFI-ES2.4.11](https://eng.ms/docs/coreai/devdiv/one-engineering-system-1es/1es-docs/1es-security-configuration/azdo-config-remediation/all-pipeline-access-es-2-4-tsg) +prohibits leaving a service connection accessible to all pipelines. + +After saving: + +1. Open the new service connection in ADO +2. Click **More options (⋮) → Security** +3. 
Under **Pipeline permissions**, click **Restrict permission** +4. Click **+** and add each pipeline that needs the SC by ID/name. Do not + add "all pipelines." + +## When to use the manual cleanup path + +If something goes wrong mid-setup and you need to start over cleanly: + +```powershell +# Remove an FIC that pointed at the wrong issuer/subject +az identity federated-credential delete -g $rg --identity-name $umi --name "" --yes + +# Confirm no stray role assignments +az role assignment list --assignee --all -o table + +# In ADO: delete the SC via Project Settings → Service connections → ⋮ → Delete +# In Azure: only delete the UAMI itself if you're sure nothing else uses it +``` + +The UAMI does no harm by itself — it's a managed identity with role +assignments and FICs. Deleting it cascades to role assignments +automatically; FICs are removed with the parent UAMI. + +## Reference — the trident-azl4-blob-reader connection + +| Field | Value | +|---|---| +| Purpose | Read AZL4 base VHDX from the AZL preview gallery's backing storage for trident CI | +| Storage account | `azlpubdev2mruiyvi` (subscription `e4ab81f8-030f-4593-a8f2-3ea2c7630a19`, RG `azl-acg-preview-publishing`) | +| Gallery source | `azlpubDevGallery2mruiyvi / azure-linux-4-daily-x64` (same subscription/RG) | +| UAMI name | `maritimus-github-storage-ado-trident-reader-umi` | +| UAMI subscription | `b3e01d89-bd55-414f-bbb4-cdfeb2628caa` (`AzureCNMP_CNP_AzureLinux_Polar_ImageTools_Staging`) | +| UAMI resource group | `maritimus-github-runner` | +| UAMI region | `westus2` | +| UAMI clientId | `5eaafbf5-279b-4f16-b797-50bd730dcdb8` | +| UAMI principalId | `97c7c5f1-db58-4e65-8c4a-b6d614a72657` | +| Role granted | `Storage Blob Data Reader` on `azlpubdev2mruiyvi` | +| FIC name | `ado-ecf-trident-azl4-blob-reader` | +| ADO project | `mariner-org/ECF` | +| ADO SC name | `trident-azl4-blob-reader` | +| Pipelines allowed | `[GITHUB]-trident-pr-e2e`, `[GITHUB]-trident-ci`, `[GITHUB]-trident-pr-e2e-azure` | +| 
Created | 2026-05-14 | +| Updated | 2026-06-01 (re-scoped from `maritimusgithubstorage` to `azlpubdev2mruiyvi`) | + +When the `AzureLinuxArtifacts` ADO feed publishes AZL4 base VHDXes, +this connection can be deleted — the standard `BaseImageManifest` +download path will handle it. diff --git a/tests/images/builder/__init__.py b/tests/images/builder/__init__.py index ca82f58db..2881fe851 100644 --- a/tests/images/builder/__init__.py +++ b/tests/images/builder/__init__.py @@ -3,7 +3,7 @@ from dataclasses import dataclass, field, fields from enum import Enum from pathlib import Path -from typing import List, Optional +from typing import List, Optional, Union @dataclass @@ -16,6 +16,9 @@ class BaseImage(Enum): BAREMETAL = BaseImageData("baremetal", Path("artifacts/baremetal.vhdx")) CORE_SELINUX = BaseImageData("core_selinux", Path("artifacts/core_selinux.vhdx")) QEMU_GUEST = BaseImageData("qemu_guest", Path("artifacts/qemu_guest.vhdx")) + AZL4_QEMU_GUEST = BaseImageData( + "azl4_qemu_guest", Path("artifacts/azl4_qemu_guest.vhdx") + ) CORE_ARM64 = BaseImageData("core_arm64", Path("artifacts/core_arm64.vhdx")) MINIMAL = BaseImageData("minimal", Path("artifacts/minimal.vhdx")) MINIMAL_AARCH64 = BaseImageData( @@ -60,6 +63,34 @@ class BaseImageManifest: glob: str = "*.vhdx" +@dataclass +class BlobImageManifest: + """Manifest for a base image fetched from Azure Storage Blob. + + Used for distros that don't yet publish to an ADO universal artifact + feed (e.g., Azure Linux 4.0 alpha builds). The storage account name + and container are NOT baked in here -- they are supplied at + invocation time via the --blob-storage-account / --blob-container + flags (or the BLOB_STORAGE_ACCOUNT / BLOB_CONTAINER env vars) so the + pipeline can parameterize them and rotate the location without a + code change. + + Authentication is via `az` CLI logged-in identity (`--auth-mode + login`). The pipeline running this must have a federated identity + with read access to the storage account. 
+ """ + + image: BaseImage + # Blob name prefix to search under + # (e.g. "azure-linux/core-efi-vhdx-4.0-amd64") + path_prefix: str + # Suffix the final blob name must end with. + # The downloader lists all blobs under path_prefix, filters to ones + # ending with this suffix, and picks the lexically largest (= most + # recent version) to download. + file_suffix: str = "/image.vhdx" + + class OutputFormat(Enum): BAREMETAL_IMAGE = "baremetal-image" COSI = "cosi" @@ -249,7 +280,9 @@ class ArtifactManifest: customizer_version: str customizer_container: str customizer_container_full: str = None - base_images: List[BaseImageManifest] = field(default_factory=list) + base_images: List[Union["BaseImageManifest", "BlobImageManifest"]] = field( + default_factory=list + ) def __post_init__(self): if self.customizer_container_full is None: @@ -264,7 +297,9 @@ def kebab_fields(cls) -> List[str]: """Return a list of fields in kebab-case.""" return [f.name.replace("_", "-") for f in fields(cls)] - def find_base_image(self, img: BaseImage) -> Optional[BaseImageManifest]: + def find_base_image( + self, img: BaseImage + ) -> Optional[Union["BaseImageManifest", "BlobImageManifest"]]: """Find a base image by its name.""" for base_image in self.base_images: if base_image.image == img: diff --git a/tests/images/builder/cli.py b/tests/images/builder/cli.py index 741f0c239..39e8e9aad 100644 --- a/tests/images/builder/cli.py +++ b/tests/images/builder/cli.py @@ -1,6 +1,7 @@ import argparse from enum import Enum import logging +import os from pathlib import Path from typing import List @@ -183,7 +184,8 @@ def setup_parser_download_image( ) -> None: parser_download_img = subparsers.add_parser( SubCommand.DOWNLOAD_IMAGE.value, - help="Download a base image from the Azure DevOps feed", + help="Download a base image (from the Azure DevOps feed, or from " + "Azure Storage Blob for distros without a published feed).", ) parser_download_img.set_defaults(artifacts=artifacts) 
parser_download_img.add_argument( @@ -191,6 +193,22 @@ def setup_parser_download_image( help="The image to download", choices=[c.image.name for c in artifacts.base_images], ) + parser_download_img.add_argument( + "--blob-storage-account", + default=os.environ.get("BLOB_STORAGE_ACCOUNT"), + help="Azure Storage account name to pull blob-sourced base images " + "from. Required when downloading an image whose manifest is a " + "BlobImageManifest. Falls back to the BLOB_STORAGE_ACCOUNT env " + "var. Not used for ADO-feed base images.", + ) + parser_download_img.add_argument( + "--blob-container", + default=os.environ.get("BLOB_CONTAINER"), + help="Azure Storage container name to pull blob-sourced base " + "images from. Required when downloading an image whose manifest " + "is a BlobImageManifest. Falls back to the BLOB_CONTAINER env " + "var. Not used for ADO-feed base images.", + ) def setup_parser_matrix( @@ -285,6 +303,8 @@ def run_cmd( run.download_base_image( artifacts=args.artifacts, name=args.image, + blob_storage_account=args.blob_storage_account, + blob_container=args.blob_container, ) elif subcommand == SubCommand.MATRIX: run.generate_matrix( diff --git a/tests/images/builder/download.py b/tests/images/builder/download.py index 6f9db4c9f..56a1313af 100644 --- a/tests/images/builder/download.py +++ b/tests/images/builder/download.py @@ -1,9 +1,15 @@ +import json +import logging +import os +import re from pathlib import Path import shutil import subprocess import tempfile -from builder import BaseImageManifest +from builder import BaseImageManifest, BlobImageManifest + +log = logging.getLogger(__name__) def download_base_image(image: BaseImageManifest) -> None: @@ -39,3 +45,140 @@ def download_base_image(image: BaseImageManifest) -> None: # Copy the .vhdx file to the target location shutil.copy2(vhdx_files[0], image.image.path) + + +# Constrain blob filename selection to a date-prefixed shape so a stray +# blob with a name that lexically sorts last 
(`zzz-evil/image.vhdx`) +# cannot win selection. Matches `YYYYMMDD/` or `YYYY-MM-DD/`-style +# version prefixes, which is the upstream publisher's convention. +# +# This is defense against a broader governance issue: the storage account +# is owned by another team, so write access is out of Trident's control. +# The regex narrows the attack surface to "names matching this shape" +# while still letting us track the latest published version. Tracked +# longer-term in the AZL4 supply-chain governance discussion. +_BLOB_NAME_VERSION_RE = re.compile(r"/([^/]*\d{4}-?\d{2}-?\d{2}[^/]*)/") + + +def download_blob_image( + image: BlobImageManifest, + storage_account: str, + container: str, +) -> None: + """Download a base image from Azure Storage Blob. + + Lists blobs under `image.path_prefix`, filters to ones whose name + matches a date-prefixed version pattern AND ends with + `image.file_suffix`, picks the lexically largest (= most recent + date), and downloads it atomically to `image.image.path`. + + Requires `az` CLI with a logged-in identity that has read access + to the storage account. Uses `--auth-mode login` so no storage + keys are needed. + """ + if not storage_account or not container: + raise RuntimeError( + f"Blob storage account/container required to download " + f"'{image.image.name}'. Pass --blob-storage-account and " + f"--blob-container, or set BLOB_STORAGE_ACCOUNT and " + f"BLOB_CONTAINER env vars." + ) + + az = shutil.which("az") + if az is None: + raise RuntimeError( + "az CLI not found on PATH; required to fetch blob-sourced " + "base images. Install azure-cli." + ) + + log.info( + f"Listing blobs in '{storage_account}/{container}' under " + f"prefix '{image.path_prefix}/'" + ) + # No `--query` interpolation: do the filtering in Python so caller + # control of `image.file_suffix` (or any other field that might + # become externally settable later) cannot inject JMESPath. 
+ list_proc = subprocess.run( + [ + az, + "storage", + "blob", + "list", + "--auth-mode", + "login", + "--account-name", + storage_account, + "--container-name", + container, + "--prefix", + f"{image.path_prefix}/", + "--query", + "[].name", + "-o", + "json", + ], + check=True, + capture_output=True, + text=True, + ) + all_names = json.loads(list_proc.stdout) + suffix = image.file_suffix + eligible = [ + n for n in all_names if n.endswith(suffix) and _BLOB_NAME_VERSION_RE.search(n) + ] + if not eligible: + raise RuntimeError( + f"No date-versioned blobs ending with '{suffix}' found under " + f"'{image.path_prefix}/' in '{storage_account}/{container}' " + f"(saw {len(all_names)} total blobs under the prefix)" + ) + + latest = sorted(eligible)[-1] + log.info(f"Latest: {latest}") + + image.image.path.parent.mkdir(parents=True, exist_ok=True) + + # Download to a sibling temp file then atomically rename. `az + # storage blob download` writes in place — if the step is killed + # (timeout / OOM / agent reboot) between create and complete, the + # next run sees a truncated VHDX and MIC fails with an opaque + # error. The temp-then-rename pattern guarantees the target either + # has the full bytes or doesn't exist. + target = image.image.path + fd, tmp_path = tempfile.mkstemp( + prefix=target.name + ".", + suffix=".part", + dir=str(target.parent), + ) + os.close(fd) + try: + subprocess.run( + [ + az, + "storage", + "blob", + "download", + "--auth-mode", + "login", + "--account-name", + storage_account, + "--container-name", + container, + "--name", + latest, + "--file", + tmp_path, + "--output", + "none", + ], + check=True, + ) + os.replace(tmp_path, target) + except BaseException: + # On any failure, remove the temp file so we don't leave + # partial-state debris next to the final path. 
+ try: + os.unlink(tmp_path) + except FileNotFoundError: + pass + raise diff --git a/tests/images/builder/run.py b/tests/images/builder/run.py index d465beb2f..8c93bdcb1 100644 --- a/tests/images/builder/run.py +++ b/tests/images/builder/run.py @@ -3,7 +3,7 @@ import json from typing import List, Optional -from builder import ImageConfig, RpmSources, ArtifactManifest +from builder import ArtifactManifest, BlobImageManifest, ImageConfig, RpmSources from .builder import build_image from .convert import convert_image from . import download @@ -148,6 +148,8 @@ def download_base_image( *, artifacts: ArtifactManifest, name: str, + blob_storage_account: Optional[str] = None, + blob_container: Optional[str] = None, ) -> None: image_manifest = next( (img for img in artifacts.base_images if img.image.name == name), None @@ -155,7 +157,15 @@ def download_base_image( if image_manifest is None: raise ValueError(f"Image '{name}' not found in artifacts") log.info(f"Downloading base image '{name}' to '{image_manifest.image.path}'") - download.download_base_image(image_manifest) + + if isinstance(image_manifest, BlobImageManifest): + download.download_blob_image( + image_manifest, + storage_account=blob_storage_account, + container=blob_container, + ) + else: + download.download_base_image(image_manifest) def generate_matrix( diff --git a/tests/images/testimages.py b/tests/images/testimages.py index 9ab341cba..b4d8b5416 100755 --- a/tests/images/testimages.py +++ b/tests/images/testimages.py @@ -7,6 +7,7 @@ ArtifactManifest, BaseImage, BaseImageManifest, + BlobImageManifest, ImageConfig, OutputFormat, SystemArchitecture, @@ -132,6 +133,47 @@ config_file="base/updateimg-grub.yaml", ssh_key="files/id_rsa.pub", ), + ImageConfig( + # AZL4 (Fedora-derived) variant of trident-vm-grub-testimage. + # The base VHDX is pulled from Azure Storage (see + # BlobImageManifest below) since there is no AzureLinuxArtifacts + # ADO feed entry for AZL4 yet. 
The Trident binary is baked in + # via additionalFiles because the trident-service RPM is not + # yet packaged for AZL4. + "trident-vm-grub-testimage-azl4", + base_image=BaseImage.AZL4_QEMU_GUEST, + config="trident-vm-testimage", + config_file="base/updateimg-grub-azl4.yaml", + ssh_key="files/id_rsa.pub", + # No trident-service RPM for AZL4 yet — the binary is delivered + # via additionalFiles. extra_dependencies enforces both binaries + # are in place before the image is built (osmodifier is delivered + # the same way until an AZL4 RPM exists; see + # tests/images/trident-vm-testimage/base/updateimg-grub-azl4.yaml + # for the additionalFiles entries that consume both paths). + requires_trident=False, + extra_dependencies=[ + Path("tests/images/trident-vm-testimage/base/trident-bin/trident"), + Path("tests/images/trident-vm-testimage/base/osmodifier-bin/osmodifier"), + ], + ), + ImageConfig( + # AZL4 BASE qcow2: a bootable disk with the AZL4 OS plus trident + # installed, so storm-trident rollback testing can boot a VM and + # immediately drive A/B updates targeting the .cosi above. + # Mirrors AZL3's `make artifacts/trident-vm-grub-testimage.qcow2` + # path. See baseimg-grub-azl4.yaml for the layout / package set. + "trident-vm-grub-testimage-azl4-base", + base_image=BaseImage.AZL4_QEMU_GUEST, + config="trident-vm-testimage", + config_file="base/baseimg-grub-azl4.yaml", + output_format=OutputFormat.QCOW2, + ssh_key="files/id_rsa.pub", + requires_trident=False, + extra_dependencies=[ + Path("tests/images/trident-vm-testimage/base/trident-bin/trident"), + ], + ), ImageConfig( "trident-vm-grub-verity-testimage", base_image=BaseImage.QEMU_GUEST, @@ -246,6 +288,23 @@ package_name="minimal_vhdx-3.0-stable", version="*", ), + BlobImageManifest( + # Azure Linux 4.0 base VHDX from the AZL preview gallery's + # backing storage. Pinned to a specific daily build — bump + # the version segment in path_prefix to pick up a newer one. 
+ # + # Source gallery: + # azlpubDevGallery2mruiyvi / azure-linux-4-daily-x64 + # subscription e4ab81f8-030f-4593-a8f2-3ea2c7630a19 + # RG azl-acg-preview-publishing + # + # Storage account + container are supplied at runtime via + # --blob-storage-account / --blob-container CLI flags or + # the BLOB_STORAGE_ACCOUNT / BLOB_CONTAINER env vars. + image=BaseImage.AZL4_QEMU_GUEST, + path_prefix="staging/azure-linux-4-daily-x64/4.0.2026051502", + file_suffix=".vhdfixed", + ), ], ) From eae6848b8d41ab8aa29951a77393b863887d46c2 Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Wed, 3 Jun 2026 18:26:53 -0700 Subject: [PATCH 15/26] fix: Tag MCR MIC container with local short name after pull testimages.py runs docker with the short tag (imagecustomizer:1.4.0-1) but docker pull uses the full MCR path. Without a local tag, docker run fails with 'pull access denied'. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../stages/build_image/build-image-template-azl4.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.pipelines/templates/stages/build_image/build-image-template-azl4.yml b/.pipelines/templates/stages/build_image/build-image-template-azl4.yml index 77f26a7c4..31e163b59 100644 --- a/.pipelines/templates/stages/build_image/build-image-template-azl4.yml +++ b/.pipelines/templates/stages/build_image/build-image-template-azl4.yml @@ -91,10 +91,12 @@ steps: workingDirectory: ${{ parameters.tridentSourceDirectory }} # Pull the released MIC container from MCR. AZL4 support is included - # in imagecustomizer >= 1.4.0. + # in imagecustomizer >= 1.4.0. Tag it locally so testimages.py can + # reference it by short name. 
- bash: | set -euxo pipefail docker pull "mcr.microsoft.com/azurelinux/${{ parameters.micContainerTag }}" + docker tag "mcr.microsoft.com/azurelinux/${{ parameters.micContainerTag }}" "${{ parameters.micContainerTag }}" displayName: "Pull MIC container from MCR" # Stage the pipeline-wide SSH key into the testimage tree before From 73835d5fcbf42a823a52b4655103d6875dadb99a Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Mon, 8 Jun 2026 12:47:55 -0700 Subject: [PATCH 16/26] docs: Update TODOs to not assume AzureLinuxArtifacts feed for AZL4 AZL4 base VHDXes may continue to come from blob storage rather than the ADO feed. The trident-service RPM will come from an AZL4 package repo, not ADO. Update comments to reflect this. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../stages/build_image/build-image-azl4.yml | 6 ++++-- .../build_image/build-image-template-azl4.yml | 18 +++++++++--------- tests/images/SERVICE-CONNECTION-RUNBOOK.md | 6 +++--- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/.pipelines/templates/stages/build_image/build-image-azl4.yml b/.pipelines/templates/stages/build_image/build-image-azl4.yml index 0fae10eb2..a2901cd84 100644 --- a/.pipelines/templates/stages/build_image/build-image-azl4.yml +++ b/.pipelines/templates/stages/build_image/build-image-azl4.yml @@ -5,8 +5,10 @@ # external test-images repo template. # # TODO(azl4-merge-back): Merge this back into build-image.yml with an -# `azureLinuxVersion` parameter switch once AZL4 has feed-published base VHDXes -# and RPMs. +# `azureLinuxVersion` parameter switch once AZL4 base VHDX acquisition +# and trident-service RPM packaging are resolved. The base VHDX may +# continue to come from blob storage (not the AzureLinuxArtifacts ADO +# feed); the RPM will come from an AZL4 package repo, not ADO. 
parameters: - name: imageName diff --git a/.pipelines/templates/stages/build_image/build-image-template-azl4.yml b/.pipelines/templates/stages/build_image/build-image-template-azl4.yml index 31e163b59..7b679b084 100644 --- a/.pipelines/templates/stages/build_image/build-image-template-azl4.yml +++ b/.pipelines/templates/stages/build_image/build-image-template-azl4.yml @@ -2,20 +2,20 @@ # # Forked from build-image-template.yml on 2026-05-13. The AZL3 path pulls the # base VHDX from the AzureLinuxArtifacts ADO feed and the Trident RPM from the -# trident-binaries pipeline artifact, then runs `testimages.py build`. None of -# that works for AZL4 today because: +# trident-binaries pipeline artifact, then runs `testimages.py build`. AZL4 +# uses different acquisition paths: # -# 1. There is no AzureLinuxArtifacts feed entry for AZL4 base VHDX. We -# download from the AZL preview gallery's backing storage account -# (azlpubdev2mruiyvi/images-dev) instead. See the BlobImageManifest +# 1. Base VHDX comes from the AZL preview gallery's backing storage +# (azlpubdev2mruiyvi/images-dev). See the BlobImageManifest # registration in tests/images/testimages.py. # -# 2. There is no Trident RPM for AZL4. The binary is baked in via +# 2. There is no Trident RPM for AZL4 yet. The binary is baked in via # additionalFiles in tests/images/trident-vm-testimage/base/updateimg-grub-azl4.yaml. # -# TODO(azl4-merge-back): When AZL4 has feed-published base VHDXes and RPMs, -# fold this template back into build-image-template.yml by adding a -# `azureLinuxVersion: "4.0"` branch. +# TODO(azl4-merge-back): Fold this template back into build-image-template.yml +# once the AZL4 base VHDX and trident-service RPM acquisition paths are +# standardized. The base VHDX may stay as a blob download; the RPM will +# come from an AZL4 package repo. 
parameters: - name: tridentSourceDirectory diff --git a/tests/images/SERVICE-CONNECTION-RUNBOOK.md b/tests/images/SERVICE-CONNECTION-RUNBOOK.md index 2a17d49d2..fe448ae4b 100644 --- a/tests/images/SERVICE-CONNECTION-RUNBOOK.md +++ b/tests/images/SERVICE-CONNECTION-RUNBOOK.md @@ -220,6 +220,6 @@ automatically; FICs are removed with the parent UAMI. | Created | 2026-05-14 | | Updated | 2026-06-01 (re-scoped from `maritimusgithubstorage` to `azlpubdev2mruiyvi`) | -When the `AzureLinuxArtifacts` ADO feed publishes AZL4 base VHDXes, -this connection can be deleted — the standard `BaseImageManifest` -download path will handle it. +When AZL4 base VHDX acquisition is standardized (either via the +`AzureLinuxArtifacts` ADO feed or a permanent blob location), this +connection can be re-evaluated. From 9dabb187ef8042494e7d3b0137dd1907ae6412cb Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Mon, 8 Jun 2026 12:52:41 -0700 Subject: [PATCH 17/26] fix: Remove SERVICE-CONNECTION-RUNBOOK from public repo Contains internal infrastructure details (UAMI names, principal IDs, subscription IDs, FIC configuration) that should not be published to a public GitHub repository. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tests/images/SERVICE-CONNECTION-RUNBOOK.md | 225 --------------------- 1 file changed, 225 deletions(-) delete mode 100644 tests/images/SERVICE-CONNECTION-RUNBOOK.md diff --git a/tests/images/SERVICE-CONNECTION-RUNBOOK.md b/tests/images/SERVICE-CONNECTION-RUNBOOK.md deleted file mode 100644 index fe448ae4b..000000000 --- a/tests/images/SERVICE-CONNECTION-RUNBOOK.md +++ /dev/null @@ -1,225 +0,0 @@ -# ADO Service Connection Runbook — UAMI + Workload Identity Federation - -Step-by-step recipe for creating an ADO Azure Resource Manager service -connection authenticated by a User-Assigned Managed Identity (UAMI) via -Workload Identity Federation (WIF). This is the SFI-compliant pattern; no -secrets are stored anywhere. 
- -Adapted from Brian's wiki [Creating an ADO Service Connection authenticated -with UMI](https://dev.azure.com/mariner-org/mariner/_wiki/wikis/mariner.wiki/5697/Creating-an-ADO-Service-Connection-authenticated-with-UMI), -with the concrete commands and gotchas from setting up the -`trident-azl4-blob-reader` connection on 2026-05-14. - -## What you end up with - -``` -Azure UAMI ─(federated)→ ADO Service Connection ─(used by)→ Pipeline - │ - └─(role assignment)→ Target Azure resource -``` - -The pipeline uses `AzureCLI@2` referencing the SC. ADO mints an OIDC token, -exchanges it for an Azure access token via the UAMI's federated credential, -and the pipeline gets an `az login`'d session with the UAMI's RBAC. - -## Prerequisites - -- **Azure:** Contributor on the resource group where you'll create the UAMI -- **Azure:** User Access Administrator or Owner on the target resource you're - granting access to (for the role assignment) -- **ADO:** Project Administrator on the project where the service connection - will live - -## Step 1 — Create the UAMI (Azure CLI) - -```powershell -$sub = "" -$rg = "" -$loc = "" # match siblings if reusing an RG -$umi = "" # naming convention: see notes below - -az account set -s $sub - -# Pre-flight: confirm UAMI doesn't already exist -az identity show -g $rg -n $umi 2>$null -# (should return nothing) - -# Create -az identity create -g $rg -n $umi -l $loc ` - --tags purpose= owner= project= -``` - -The output contains `clientId` (use as ADO's Application ID later) and -`principalId` (use as the role-assignment assignee). - -### Naming convention notes - -Match what's already in the RG. Examples from -`maritimus-github-runner` (b3e01d89... sub): - -- `maritimus-github-runner-umi-*` for GitHub Actions identities -- `maritimus-github-storage-ado-*-umi` for ADO pipeline identities - -When in doubt, ask the RG owner before deviating. 
- -## Step 2 — Grant the UAMI access to the target resource - -For the trident-azl4-blob-reader UAMI, the target was the -`azlpubdev2mruiyvi` storage account (backing the AZL preview gallery), -with `Storage Blob Data Reader` (least privilege — we only need to read -base VHDXes). - -```powershell -$objId = az identity show -g $rg -n $umi --query principalId -o tsv -$scope = "/subscriptions/$sub/resourceGroups/$rg/providers///" - -az role assignment create ` - --assignee-object-id $objId ` - --assignee-principal-type ServicePrincipal ` - --role "" ` - --scope $scope - -# Verify -az role assignment list --assignee $objId --all -o table -``` - -**Always use least privilege.** Don't pick `Owner` when `Reader` will do. - -## Step 3 — Start service connection in ADO (do NOT click Verify yet) - -In ADO project → Project Settings → Service Connections → New service -connection. - -| Field | Value | -|---|---| -| Connection type | **Azure Resource Manager** | -| Identity type | **App registration or managed identity (manual)** | -| Credential | **Workload Identity Federation** | -| Scope Level | **Subscription** | -| Subscription ID | `` | -| Subscription Name | `` | -| **Application (client) ID** | the UAMI's **clientId** from step 1 | -| Tenant ID | `72f988bf-86f1-41af-91ab-2d7cd011db47` (MSIT) | -| Service connection name | `` | -| Grant access permission to all pipelines | **uncheck** (see SFI note below) | - -After filling these in but **before saving**, ADO shows you: - -- **Issuer URL** -- **Subject identifier** - -Both are needed for step 4. Keep this ADO tab open. - -### Issuer/Subject gotcha — read them off the form - -⚠️ Do NOT guess these values. They are not the same as `vstoken.dev.azure.com/...` -that older service connections may show. ADO assigns a new pair when you -create the SC, and the issuer is the Entra tenant authority URL -(`https://login.microsoftonline.com//v2.0`), not the ADO token -issuer URL. 
The subject is opaque (looks like -`/eid1/c/pub/t/.../sc/.../`). - -Copy the exact strings from the ADO form into the FIC. Do not transcribe; -copy-paste. - -## Step 4 — Add the federated credential to the UAMI - -```powershell -$issuer = "" -$subject = "" - -az identity federated-credential create ` - -g $rg ` - --identity-name $umi ` - --name "" ` - --issuer "$issuer" ` - --subject "$subject" ` - --audiences "api://AzureADTokenExchange" - -# Verify -az identity federated-credential list -g $rg --identity-name $umi -o table -``` - -FIC name should describe the consumer. For ADO connections we use -`ado--` (e.g. `ado-ecf-trident-azl4-blob-reader`). - -## Step 5 — Verify and save in ADO - -Wait ~30 seconds for Entra to propagate the FIC, then return to the ADO -form and click **Verify and save**. - -### Common errors - -**`AADSTS70025: client has no configured federated identity credentials`** -- The FIC hasn't been added yet. Run step 4. - -**`AADSTS700211: No matching federated identity record found for presented -assertion issuer 'https://login.microsoftonline.com//v2.0'`** -- The FIC exists but the issuer or subject doesn't match what ADO is - presenting. Re-read the ADO form carefully (do not transcribe — copy). -- A common mistake is reusing the issuer URL from an unrelated existing - service connection. Each new SC may get its own issuer string. - -**Verify succeeds but pipeline fails with `You do not have the required -permissions...`** -- The role assignment in step 2 either targeted the wrong scope, or - Azure RBAC hasn't propagated yet (wait up to 10 minutes). Re-check that - `az role assignment list --assignee --all` shows the role - on the correct scope. - -## Step 6 — SFI compliance — restrict pipeline permissions - -[SFI-ES2.4.11](https://eng.ms/docs/coreai/devdiv/one-engineering-system-1es/1es-docs/1es-security-configuration/azdo-config-remediation/all-pipeline-access-es-2-4-tsg) -prohibits leaving a service connection accessible to all pipelines. 
- -After saving: - -1. Open the new service connection in ADO -2. Click **More options (⋮) → Security** -3. Under **Pipeline permissions**, click **Restrict permission** -4. Click **+** and add each pipeline that needs the SC by ID/name. Do not - add "all pipelines." - -## When to use the manual cleanup path - -If something goes wrong mid-setup and you need to start over cleanly: - -```powershell -# Remove an FIC that pointed at the wrong issuer/subject -az identity federated-credential delete -g $rg --identity-name $umi --name "" --yes - -# Confirm no stray role assignments -az role assignment list --assignee --all -o table - -# In ADO: delete the SC via Project Settings → Service connections → ⋮ → Delete -# In Azure: only delete the UAMI itself if you're sure nothing else uses it -``` - -The UAMI does no harm by itself — it's a managed identity with role -assignments and FICs. Deleting it cascades to role assignments -automatically; FICs are removed with the parent UAMI. - -## Reference — the trident-azl4-blob-reader connection - -| Field | Value | -|---|---| -| Purpose | Read AZL4 base VHDX from the AZL preview gallery's backing storage for trident CI | -| Storage account | `azlpubdev2mruiyvi` (subscription `e4ab81f8-030f-4593-a8f2-3ea2c7630a19`, RG `azl-acg-preview-publishing`) | -| Gallery source | `azlpubDevGallery2mruiyvi / azure-linux-4-daily-x64` (same subscription/RG) | -| UAMI name | `maritimus-github-storage-ado-trident-reader-umi` | -| UAMI subscription | `b3e01d89-bd55-414f-bbb4-cdfeb2628caa` (`AzureCNMP_CNP_AzureLinux_Polar_ImageTools_Staging`) | -| UAMI resource group | `maritimus-github-runner` | -| UAMI region | `westus2` | -| UAMI clientId | `5eaafbf5-279b-4f16-b797-50bd730dcdb8` | -| UAMI principalId | `97c7c5f1-db58-4e65-8c4a-b6d614a72657` | -| Role granted | `Storage Blob Data Reader` on `azlpubdev2mruiyvi` | -| FIC name | `ado-ecf-trident-azl4-blob-reader` | -| ADO project | `mariner-org/ECF` | -| ADO SC name | `trident-azl4-blob-reader` | -| 
Pipelines allowed | `[GITHUB]-trident-pr-e2e`, `[GITHUB]-trident-ci`, `[GITHUB]-trident-pr-e2e-azure` | -| Created | 2026-05-14 | -| Updated | 2026-06-01 (re-scoped from `maritimusgithubstorage` to `azlpubdev2mruiyvi`) | - -When AZL4 base VHDX acquisition is standardized (either via the -`AzureLinuxArtifacts` ADO feed or a permanent blob location), this -connection can be re-evaluated. From f81d73e3faa1dfd996967fce67a0085a8605cfe3 Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Mon, 8 Jun 2026 12:58:40 -0700 Subject: [PATCH 18/26] docs: Trim verbose CLI help strings in testimages.py Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tests/images/builder/cli.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/tests/images/builder/cli.py b/tests/images/builder/cli.py index 39e8e9aad..784c4c534 100644 --- a/tests/images/builder/cli.py +++ b/tests/images/builder/cli.py @@ -184,30 +184,25 @@ def setup_parser_download_image( ) -> None: parser_download_img = subparsers.add_parser( SubCommand.DOWNLOAD_IMAGE.value, - help="Download a base image (from the Azure DevOps feed, or from " - "Azure Storage Blob for distros without a published feed).", + help="Download a base image.", ) parser_download_img.set_defaults(artifacts=artifacts) parser_download_img.add_argument( "image", - help="The image to download", + help="The image to download.", choices=[c.image.name for c in artifacts.base_images], ) parser_download_img.add_argument( "--blob-storage-account", default=os.environ.get("BLOB_STORAGE_ACCOUNT"), - help="Azure Storage account name to pull blob-sourced base images " - "from. Required when downloading an image whose manifest is a " - "BlobImageManifest. Falls back to the BLOB_STORAGE_ACCOUNT env " - "var. Not used for ADO-feed base images.", + help="Azure Storage account name for blob-sourced images. 
" + "Env: BLOB_STORAGE_ACCOUNT.", ) parser_download_img.add_argument( "--blob-container", default=os.environ.get("BLOB_CONTAINER"), - help="Azure Storage container name to pull blob-sourced base " - "images from. Required when downloading an image whose manifest " - "is a BlobImageManifest. Falls back to the BLOB_CONTAINER env " - "var. Not used for ADO-feed base images.", + help="Azure Storage container name for blob-sourced images. " + "Env: BLOB_CONTAINER.", ) From 28b09d1cd8e4cf5498f191d04a75ed2185a0ea45 Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Tue, 2 Jun 2026 17:40:25 -0700 Subject: [PATCH 19/26] infra: Add AZL4 COSI image config, pipeline stages, and E2E configs Adds AZL4 E2E pipeline parameters, COSI update-image YAML config, test-image helper scripts, and base/rollback trident configurations. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .pipelines/templates/e2e-template.yml | 38 +++++ .../base-azl4/test-selection.yaml | 5 + .../base-azl4/trident-config.yaml | 68 ++++++++ .../rollback-azl4/test-selection.yaml | 3 + .../rollback-azl4/trident-config.yaml | 84 +++++++++ tests/images/trident-vm-testimage/README.md | 46 +++++ .../base/files/hostname-shim.sh | 20 +++ .../base/files/regen-sshd-keys.service | 14 ++ .../base/scripts/enable-regen-sshd-keys.sh | 7 + .../base/scripts/ssh-move-host-keys-azl4.sh | 13 ++ .../base/updateimg-grub-azl4.yaml | 161 ++++++++++++++++++ 11 files changed, 459 insertions(+) create mode 100644 tests/e2e_tests/trident_configurations/base-azl4/test-selection.yaml create mode 100644 tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml create mode 100644 tests/e2e_tests/trident_configurations/rollback-azl4/test-selection.yaml create mode 100644 tests/e2e_tests/trident_configurations/rollback-azl4/trident-config.yaml create mode 100644 tests/images/trident-vm-testimage/base/files/hostname-shim.sh create mode 100644 tests/images/trident-vm-testimage/base/files/regen-sshd-keys.service 
create mode 100755 tests/images/trident-vm-testimage/base/scripts/enable-regen-sshd-keys.sh create mode 100755 tests/images/trident-vm-testimage/base/scripts/ssh-move-host-keys-azl4.sh create mode 100644 tests/images/trident-vm-testimage/base/updateimg-grub-azl4.yaml diff --git a/.pipelines/templates/e2e-template.yml b/.pipelines/templates/e2e-template.yml index a0654303a..1dfd69fb7 100644 --- a/.pipelines/templates/e2e-template.yml +++ b/.pipelines/templates/e2e-template.yml @@ -224,6 +224,44 @@ stages: micVersion: ${{ parameters.micVersion }} dependsOnStage: ${{ parameters.baseImageArtifactStage }} + # Build the AZL4 test image (pinned-MIC path). + # + # TODO(azl4-release): Drop the bespoke build-image-azl4.yml call once AZL4 + # has feed-published base VHDXes, RPMs, and a released MIC container. + # Then this can be a plain build-image.yml call with an azureLinuxVersion + # parameter, matching the other testimage stages. + # + # Gating is the AzL installer ISO gate below plus full-validation, so the + # AZL4 build runs in every stage type that gates a trunk merge. It used + # to run only on pr-e2e / ci / pr-e2e-azure, which silently skipped AZL4 + # in azl-validation / full-validation — exactly the stages that need it. + - ${{ if or(eq(parameters.stageType, 'pr-e2e'), eq(parameters.stageType, 'ci'), eq(parameters.stageType, 'pr-e2e-azure'), eq(parameters.stageType, 'azl-validation'), eq(parameters.stageType, 'full-validation')) }}: + - template: stages/build_image/build-image-azl4.yml + parameters: + imageName: trident-vm-grub-testimage-azl4 + dependsOnStage: ${{ parameters.baseImageArtifactStage }} + + # AZL4 base qcow2 — boot point for the VM offline-init / rollback + # path. Same build template as the COSI above; output_format + # differs (QCOW2 vs COSI) per the testimages.py registration. 
+ - template: stages/build_image/build-image-azl4.yml + parameters: + imageName: trident-vm-grub-testimage-azl4-base + dependsOnStage: ${{ parameters.baseImageArtifactStage }} + + # AZL4 BM-simulated netlaunch test. Uses the AZL3 MOS installer ISO + # (built by TridentTestImg_trident_installer below) plus the AZL4 + # COSI built above. Trident runs from the live MOS environment and + # installs the AZL4 COSI onto a fresh virtdeploy VM disk. This is + # the same flow we proved out manually on karhu-ubuntu. + - template: stages/testing_vm/netlaunch-testing-azl4.yml + + # AZL4 VM offline-init rollback test. The base qcow2 already has + # trident's datastore populated by its first-boot offline-init + # oneshot, so storm-trident can drive A/B update + rollback against + # the AZL4 COSI without the MOS bridge. + - template: stages/testing_rollback/vm-testing-azl4.yml + # Build AzL installer ISO (attended and unattended) - ${{ if or(eq(parameters.stageType, 'pr-e2e'), eq(parameters.stageType, 'ci'), eq(parameters.stageType, 'pr-e2e-azure'), eq(parameters.stageType, 'azl-validation')) }}: - template: stages/azl_installer/azl-installer.yml diff --git a/tests/e2e_tests/trident_configurations/base-azl4/test-selection.yaml b/tests/e2e_tests/trident_configurations/base-azl4/test-selection.yaml new file mode 100644 index 000000000..1789997bd --- /dev/null +++ b/tests/e2e_tests/trident_configurations/base-azl4/test-selection.yaml @@ -0,0 +1,5 @@ +compatible: + - base-azl4 + # Reuse the same pytest assertions as the AZL3 `base` scenario where + # appropriate. Add this scenario explicitly to test markers as we wire + # up pytest coverage. 
diff --git a/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml b/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml new file mode 100644 index 000000000..a3aac68b9 --- /dev/null +++ b/tests/e2e_tests/trident_configurations/base-azl4/trident-config.yaml @@ -0,0 +1,68 @@ +image: + url: http://NETLAUNCH_HOST_ADDRESS/files/regular.cosi + sha384: ignored +# Note: AZL4 doesn't ship grub2-efi-binary-noprefix. We deliberately do +# not set `internalParams.disableGrubNoprefixCheck` here — trident +# auto-detects AZL4 (via `is_azl4_or_later` in +# crates/trident/src/subsystems/esp.rs) and skips the check itself, so +# this scenario exercises the auto-detection path that real customers +# will hit. +storage: + disks: + - id: os + device: /dev/disk/by-path/pci-0000:00:1f.2-ata-2 + partitionTableType: gpt + partitions: + - id: root-a + type: root + size: 8G + - id: root-b + type: root + size: 8G + - id: esp + type: esp + size: 1G + - id: trident + type: linux-generic + size: 1G + - id: disk2 + device: /dev/disk/by-path/pci-0000:00:1f.2-ata-3 + partitionTableType: gpt + partitions: [] + abUpdate: + volumePairs: + - id: root + volumeAId: root-a + volumeBId: root-b + filesystems: + - deviceId: trident + source: new + mountPoint: /var/lib/trident + - deviceId: esp + mountPoint: + path: /boot/efi + options: umask=0077 + - deviceId: root + mountPoint: / +# AZL4 baseline scenario for the pytest E2E framework. Mirrors the AZL3 +# `base/` scenario as closely as possible while staying inside what PR-4's +# native hostname-carry-over fast path can serve. +# +# Why no `os:` section yet: +# The MOS install ISO (built from tests/images/trident-mos/iso.yaml) +# does not include /usr/bin/osmodifier. 
PR-5 bakes osmodifier into the +# target image so post-install Trident operations (update, runtime +# apply) can drive os.users / os.selinux / os.netplan via osmodifier, +# but the install-time validation runs in the MOS environment which +# currently lacks the binary. Until the MOS ISO is rebuilt with +# azurelinux-image-tools-osmodifier installed (a small follow-up), the +# install path must stick to PR-4's hostname-only fast path. +# +# Once the MOS includes osmodifier, this file can grow to mirror `base/` +# more completely (os.users, os.selinux, os.netplan). +# +# Other differences from base/: +# - No swap or /home partitions (kept simple for the first AZL4 scenario; +# swap support is its own follow-up and /home isn't load-bearing here). +# - No postConfigure sudo grant: the testing-user is added to wheel by +# the testimage MIC config and /etc/sudoers.d/wheel grants nopasswd. diff --git a/tests/e2e_tests/trident_configurations/rollback-azl4/test-selection.yaml b/tests/e2e_tests/trident_configurations/rollback-azl4/test-selection.yaml new file mode 100644 index 000000000..cbfa81bbe --- /dev/null +++ b/tests/e2e_tests/trident_configurations/rollback-azl4/test-selection.yaml @@ -0,0 +1,3 @@ +compatible: + - rollback + - rollback-azl4 diff --git a/tests/e2e_tests/trident_configurations/rollback-azl4/trident-config.yaml b/tests/e2e_tests/trident_configurations/rollback-azl4/trident-config.yaml new file mode 100644 index 000000000..e83320906 --- /dev/null +++ b/tests/e2e_tests/trident_configurations/rollback-azl4/trident-config.yaml @@ -0,0 +1,84 @@ +image: + url: http://NETLAUNCH_HOST_ADDRESS/files/regular.cosi + sha384: ignored +storage: + disks: + - id: os + device: /dev/disk/by-path/pci-0000:00:1f.2-ata-2 + partitionTableType: gpt + partitions: + - id: root-a + type: root + size: 8G + - id: root-b + type: root + size: 8G + - id: esp + type: esp + size: 1G + - id: trident + type: linux-generic + size: 1G + - id: disk2 + device: 
/dev/disk/by-path/pci-0000:00:1f.2-ata-3 + partitionTableType: gpt + partitions: [] + abUpdate: + volumePairs: + - id: root + volumeAId: root-a + volumeBId: root-b + filesystems: + - deviceId: trident + source: new + mountPoint: /var/lib/trident + - deviceId: esp + mountPoint: + path: /boot/efi + options: umask=0077 + - deviceId: root + mountPoint: / +os: + additionalFiles: + - destination: /var/lib/trident/local-health-check-file.sh + content: | + echo 'This is a local health check script.' + exit 0 +health: + checks: + - name: invoke-rollback-from-local-script + runOn: + - clean-install + path: /var/lib/trident/local-health-check-file.sh + - name: invoke-rollback-from-script + runOn: + - clean-install + content: | + exit 1 + - name: install-failure-systemd-check + runOn: + - clean-install + systemdServices: + - non-existent-service1.service + - non-existent-service2.service + timeoutSeconds: 15 +# AZL4 variant of the AZL3 `health-checks-install/` scenario. Adapted for the +# PR-4 hostname-only fast path: +# - Empty top-level `users`/`selinux`/`netplan` so install validation does +# not require the OS Modifier binary to be in the MOS install ISO (which +# does not currently include it; once the MOS rebuild lands, both this +# scenario and base-azl4 can grow os.users / os.selinux / os.netplan). +# - `os.additionalFiles` is the one os.* field used because health.checks +# references `path: /var/lib/trident/local-health-check-file.sh`, which +# needs to be on the target filesystem. additionalFiles is processed by +# Trident's storage / file-deploy paths, not by OS Modifier. +# +# Health-check failure expectations (asserted by tests/e2e_tests/rollback_test.py): +# - State transitions to `not-provisioned` (clean-install has no slot to +# roll back to; the install just fails). +# - `/var/lib/trident/trident-health-check-failure-*.log` is created. 
+# - The log contains: +# * `"Failed health check(s)"` +# * `"Script 'invoke-rollback-from-script' failed"` +# * `"Unit non-existent-service1.service could not be found"` +# * `"Unit non-existent-service2.service could not be found"` diff --git a/tests/images/trident-vm-testimage/README.md b/tests/images/trident-vm-testimage/README.md index e527ae04e..7d4379ed5 100644 --- a/tests/images/trident-vm-testimage/README.md +++ b/tests/images/trident-vm-testimage/README.md @@ -35,3 +35,49 @@ To build the update images, run: | ----------- | --------------------------------------- | ----------------------------------- | | Regular | `make trident-vm-grub-testimage` | `artifacts/trident-vm-grub-testimage/*` | | With verity | `make trident-vm-grub-verity-testimage` | `artifacts/trident-vm-grub-testimage/*` | + +## AZL4 variant (`trident-vm-grub-testimage-azl4`) + +A Fedora-derived (Azure Linux 4.0) variant lives alongside the AZL3 image +above. It uses `base/updateimg-grub-azl4.yaml` instead of +`base/updateimg-grub.yaml` and consumes `BaseImage.AZL4_QEMU_GUEST`. + +### Two extra prerequisites for AZL4 + +1. **AZL4 base VHDX.** No prebuilt AZL4 VHDX is available in the ADO + Artifacts feed yet, so build one locally with Image Customizer: + + ```bash + sudo imagecustomizer create \ + --config-file path/to/azl4-qemu-guest.yaml \ + --rpm-source path/to/azl4.repo \ + --tools-file path/to/azl4-tools.tar.gz \ + --build-dir /tmp/azl4-base-build \ + --output-image-file artifacts/azl4_qemu_guest.vhdx \ + --output-image-format vhdx \ + --distro azurelinux --distro-version 4.0 + ``` + + See `wiki/playbooks/trident-azl4-e2e-manual.md` in the karhu repo for + a ready-to-paste base config and the alpha2 repo URL. + + When an AZL4 VHDX lands in the ADO feed, add a `BaseImageManifest` + entry for `AZL4_QEMU_GUEST` in `testimages.py` so `cli download` + fetches it the same way it does the AZL3 bases. + +2. 
**Trident binary baked in.** The AZL4 image bakes + `/usr/bin/trident` via `additionalFiles` because there is no + `trident-service` RPM packaged for AZL4 yet. Drop the built binary + at `base/trident-bin/trident` before invoking the builder: + + ```bash + mkdir -p base/trident-bin + cp /path/to/built/trident base/trident-bin/trident + chmod +x base/trident-bin/trident + ``` + + The binary should be built from a stack including the AZL4 enabling + branches: `azl4-1-grub-native` + `azl4-2-esp-layouts` + + `azl4-3-configure-bls` + `azl4-4-osconfig-hostname`. Once those land + on main, a plain main build suffices. The `base/trident-bin/` + directory is gitignored. diff --git a/tests/images/trident-vm-testimage/base/files/hostname-shim.sh b/tests/images/trident-vm-testimage/base/files/hostname-shim.sh new file mode 100644 index 000000000..b12b3807c --- /dev/null +++ b/tests/images/trident-vm-testimage/base/files/hostname-shim.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# AZL4 doesn't ship a `hostname` binary in `coreutils` (Fedora moved it to +# its own package which AZL4 hasn't picked up yet). The pytest E2E +# framework uses `hostname` as a smoke test of the SSH session in +# tests/e2e_tests/conftest.py, so without this shim every test errors out +# at fixture setup. +# +# Tiny POSIX-only replacement that reads /etc/hostname, plus a passthrough +# for `hostname -s` and `hostname -f` for completeness. +case "$1" in + -s|--short) + cat /etc/hostname | cut -d. 
-f1 + ;; + -f|--fqdn|"") + cat /etc/hostname + ;; + *) + cat /etc/hostname + ;; +esac diff --git a/tests/images/trident-vm-testimage/base/files/regen-sshd-keys.service b/tests/images/trident-vm-testimage/base/files/regen-sshd-keys.service new file mode 100644 index 000000000..0fe938ddc --- /dev/null +++ b/tests/images/trident-vm-testimage/base/files/regen-sshd-keys.service @@ -0,0 +1,14 @@ +[Unit] +Description=Generate sshd host keys in /var/srv on first boot +ConditionPathExists=!/var/srv/etc/ssh/ssh_host_ed25519_key +Before=sshd.service +After=local-fs.target + +[Service] +Type=oneshot +RemainAfterExit=yes +ExecStartPre=/usr/bin/mkdir -p /var/srv/etc/ssh +ExecStart=/usr/bin/ssh-keygen -A -f /var/srv -q + +[Install] +WantedBy=multi-user.target diff --git a/tests/images/trident-vm-testimage/base/scripts/enable-regen-sshd-keys.sh b/tests/images/trident-vm-testimage/base/scripts/enable-regen-sshd-keys.sh new file mode 100755 index 000000000..bdf901cd2 --- /dev/null +++ b/tests/images/trident-vm-testimage/base/scripts/enable-regen-sshd-keys.sh @@ -0,0 +1,7 @@ +#!/bin/bash +# regen-sshd-keys is a one-shot service that generates SSH host keys in +# /var/srv on first boot. Enable it via wants symlink because the generic +# `services.enable` in MIC config is reserved for systemd unit names that +# come from packages, and our unit is delivered via additionalFiles. +ln -sf /etc/systemd/system/regen-sshd-keys.service \ + /etc/systemd/system/multi-user.target.wants/regen-sshd-keys.service diff --git a/tests/images/trident-vm-testimage/base/scripts/ssh-move-host-keys-azl4.sh b/tests/images/trident-vm-testimage/base/scripts/ssh-move-host-keys-azl4.sh new file mode 100755 index 000000000..ede3fdbaa --- /dev/null +++ b/tests/images/trident-vm-testimage/base/scripts/ssh-move-host-keys-azl4.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# AZL4-compatible variant of ssh-move-host-keys.sh. +# +# AZL3 sshd reads the main /etc/ssh/sshd_config and we appended HostKey +# lines to it. 
AZL4 sshd 10.0+ supports drop-ins under /etc/ssh/sshd_config.d/ +# which is the cleaner approach. +SSH_VAR_DIR="/var/srv/etc/ssh" +mkdir -p /etc/ssh/sshd_config.d +cat > /etc/ssh/sshd_config.d/50-trident-host-keys.conf < Date: Mon, 8 Jun 2026 13:21:02 -0700 Subject: [PATCH 20/26] fix: Remove stale osmodifier additionalFile from updateimg osmodifier is now a Rust crate built into the trident binary (PR #638). No separate osmodifier binary needs to be baked into test images. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tests/images/testimages.py | 8 ++------ .../trident-vm-testimage/base/updateimg-grub-azl4.yaml | 9 --------- 2 files changed, 2 insertions(+), 15 deletions(-) diff --git a/tests/images/testimages.py b/tests/images/testimages.py index b4d8b5416..e71f9fda0 100755 --- a/tests/images/testimages.py +++ b/tests/images/testimages.py @@ -146,15 +146,11 @@ config_file="base/updateimg-grub-azl4.yaml", ssh_key="files/id_rsa.pub", # No trident-service RPM for AZL4 yet — the binary is delivered - # via additionalFiles. extra_dependencies enforces both binaries - # are in place before the image is built (osmodifier is delivered - # the same way until an AZL4 RPM exists; see - # tests/images/trident-vm-testimage/base/updateimg-grub-azl4.yaml - # for the additionalFiles entries that consume both paths). + # via additionalFiles. extra_dependencies enforces it is in place + # before the image is built. 
requires_trident=False, extra_dependencies=[ Path("tests/images/trident-vm-testimage/base/trident-bin/trident"), - Path("tests/images/trident-vm-testimage/base/osmodifier-bin/osmodifier"), ], ), ImageConfig( diff --git a/tests/images/trident-vm-testimage/base/updateimg-grub-azl4.yaml b/tests/images/trident-vm-testimage/base/updateimg-grub-azl4.yaml index b03677b65..9cee3c809 100644 --- a/tests/images/trident-vm-testimage/base/updateimg-grub-azl4.yaml +++ b/tests/images/trident-vm-testimage/base/updateimg-grub-azl4.yaml @@ -106,15 +106,6 @@ os: - source: trident-bin/trident destination: /usr/bin/trident permissions: "755" - # Bake the OS Modifier binary into /usr/local/bin/osmodifier. - # AZL4 does not yet ship azurelinux-image-tools-osmodifier as an RPM, - # but Vince Perri's MIC AZL4 branch builds an AZL4-aware binary. - # Drop the build at osmodifier-bin/osmodifier alongside the trident - # binary; this ride-along disappears once the AZL4 RPM is published. - # See the comment above on /usr/local/bin placement. - - source: osmodifier-bin/osmodifier - destination: /usr/bin/osmodifier - permissions: "755" # AZL4 lacks a /usr/bin/hostname binary; the pytest framework smoke- # tests SSH with `hostname`, so we ship a tiny shim. - source: files/hostname-shim.sh From 4147f5a2c442cc6df1526dad8a951d2fc0691ead Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Mon, 8 Jun 2026 13:21:02 -0700 Subject: [PATCH 21/26] tests: Add AZL4 BM-simulated netlaunch test stage Adds AZL4 bare-metal simulated netlaunch pipeline stage and SELinux xattr stripping script for test image prep. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../testing_vm/netlaunch-testing-azl4.yml | 364 ++++++++++++++++++ .../base/scripts/strip-selinux-xattrs.sh | 85 ++++ 2 files changed, 449 insertions(+) create mode 100644 .pipelines/templates/stages/testing_vm/netlaunch-testing-azl4.yml create mode 100644 tests/images/trident-vm-testimage/base/scripts/strip-selinux-xattrs.sh diff --git a/.pipelines/templates/stages/testing_vm/netlaunch-testing-azl4.yml b/.pipelines/templates/stages/testing_vm/netlaunch-testing-azl4.yml new file mode 100644 index 000000000..03dd6cab9 --- /dev/null +++ b/.pipelines/templates/stages/testing_vm/netlaunch-testing-azl4.yml @@ -0,0 +1,364 @@ +# AZL4 BM-simulated netlaunch test stage. +# +# Drives an AZL3 MOS installer ISO + AZL4 COSI through netlaunch to validate +# that the AZL4 COSI (from PR-5) can be installed by Trident onto a fresh +# virtdeploy VM. This is the BM-simulated path: Trident runs from the live +# MOS environment (AZL3), `trident install` partitions the disk and streams +# the AZL4 COSI to it, the target boots into AZL4. +# +# Differences from netlaunch-testing.yml: +# * No test matrix. Hardcoded to the `base-azl4` configuration in +# tests/e2e_tests/trident_configurations/. +# * Host runtimeEnv only. Container variant is a follow-on. +# * No ACR push. The AZL4 COSI is served locally by netlaunch. +# * No SELinux check. AZL4 SELinux integration is its own follow-on. +# * No matrix-driven test execution after install. First iteration only +# validates that the VM provisions and is reachable over SSH. +# +# TODO(azl4-merge-back): Once AZL4 has a published trident-service RPM and +# all the bits below (SELinux, container path, metrics) are wired up for +# AZL4, fold this back into netlaunch-testing.yml as an additional matrix +# entry. + +parameters: + - name: installerISOArtifact + type: string + # AZL3 MOS ISO is the live OS Trident runs from. 
It does not need to + # match the target OS version since the target comes from the COSI. + default: "trident-installer" + + - name: cosiArtifact + type: string + # Artifact published by stages/build_image/build-image-azl4.yml. The + # actual COSI file inside is trident-vm-grub-testimage-azl4.cosi. + default: "trident-vm-grub-testimage-azl4" + + - name: tridentConfiguration + type: string + # Lives at tests/e2e_tests/trident_configurations/base-azl4/. + default: "base-azl4" + + - name: dependsOnStage + type: string + default: "" + +stages: + - stage: NetlaunchTesting_AZL4 + displayName: Netlaunch Testing - AZL4 (BM-simulated) + dependsOn: + - BuildingTools + - PrepareSSHKeys + - TridentTestImg_trident_installer + - TridentTestImg_trident_vm_grub_testimage_azl4 + - ${{ if ne(parameters.dependsOnStage, '') }}: + - ${{ parameters.dependsOnStage }} + + jobs: + - job: NetlaunchAzl4 + displayName: Netlaunch (AZL3 ISO + AZL4 COSI) + timeoutInMinutes: 30 + pool: + type: linux + name: trident-ubuntu-1es-pool-eastus2 + hostArchitecture: amd64 + + variables: + - name: ob_outputDirectory + value: /tmp/deployment_logs_azl4 + - name: ob_artifactBaseName + value: "netlaunch-testing-azl4" + + - name: tridentConfigPath + value: tests/e2e_tests/trident_configurations/${{ parameters.tridentConfiguration }} + + - name: netlaunchPort + value: 4001 + + steps: + - template: ../common_tasks/checkout_trident.yml + + - task: DownloadPipelineArtifact@2 + displayName: "Download AZL3 installer ISO" + inputs: + buildType: current + artifactName: "${{ parameters.installerISOArtifact }}" + targetPath: "$(TRIDENT_SOURCE_DIR)/artifacts/iso" + + - task: DownloadPipelineArtifact@2 + displayName: "Download AZL4 COSI" + inputs: + buildType: current + artifactName: "${{ parameters.cosiArtifact }}" + targetPath: "$(Build.ArtifactStagingDirectory)/cosi-azl4" + + # PrepareSSHKeys produces the shared 'ssh-keys' artifact whose + # id_rsa.pub is baked into the AZL4 COSI at MIC build time (see + # 
build-image-template-azl4.yml). The matching private key + # `ssh-keys/id_rsa` is what we use locally to SSH into the + # post-install AZL4 VM. Until 2026-05-17 we generated a fresh + # per-build keypair inside testimages.py and published the + # private half alongside the COSI, but the qcow2 + cosi builds + # for VM-testing need to share a key (the same VM A/B-updates + # from qcow2 to cosi), so we standardized on the shared artifact. + - task: DownloadPipelineArtifact@2 + displayName: "Download shared SSH key" + inputs: + buildType: current + artifactName: "ssh-keys" + targetPath: "$(Build.ArtifactStagingDirectory)/ssh-keys" + + - task: DownloadPipelineArtifact@2 + displayName: "Download go-tools" + inputs: + buildType: current + artifactName: "go-tools" + patterns: | + netlaunch + netlisten + storm-trident + virtdeploy + targetPath: "$(TRIDENT_SOURCE_DIR)/bin" + + # Install libvirt / qemu / OVMF and configure libvirt access. Without + # this, virt-deploy fails creating bridge interfaces ("Operation not + # permitted") on the OneBranch Ubuntu runner. + - template: netlaunch-prep.yml + + # NOTE: we intentionally do NOT run testing_common/trident-prep.yml. + # That template runs edit_host_config.py, which injects the test + # SSH key into trident-config's os.users section. The AZL4 + # `base-azl4` trident-config omits the os: section entirely + # because the AZL3 MOS installer ISO has no /usr/bin/osmodifier, + # so trident can't drive os.users at install time. Instead we + # use the shared SSH key whose public half was baked into the + # AZL4 COSI at MIC build time (installed below). + + - bash: | + set -euxo pipefail + + chmod +x "$(TRIDENT_SOURCE_DIR)"/bin/{netlaunch,netlisten,storm-trident,virtdeploy} + + # Stage the AZL4 COSI as regular.cosi where netlaunch will + # serve it. The trident-config for base-azl4 references + # http://NETLAUNCH_HOST_ADDRESS/files/regular.cosi. 
+ SERVE_DIR="$(TRIDENT_SOURCE_DIR)/artifacts/test-image" + mkdir -p "$SERVE_DIR" + + # The artifact may name the COSI after the imageName, with or + # without the clone-index suffix, or plain regular.cosi; tolerate + # all three layouts. The `_0.cosi` suffix is what testimages.py + # produces when called with the default --clones >= 1. + COSI_SRC="" + for candidate in \ + "$(Build.ArtifactStagingDirectory)/cosi-azl4/trident-vm-grub-testimage-azl4_0.cosi" \ + "$(Build.ArtifactStagingDirectory)/cosi-azl4/trident-vm-grub-testimage-azl4.cosi" \ + "$(Build.ArtifactStagingDirectory)/cosi-azl4/regular.cosi"; do + if [ -f "$candidate" ]; then + COSI_SRC="$candidate" + break + fi + done + + if [ -z "$COSI_SRC" ]; then + echo "Could not find AZL4 COSI. Artifact contents:" + find "$(Build.ArtifactStagingDirectory)/cosi-azl4" -type f | head -20 + exit 1 + fi + + cp "$COSI_SRC" "$SERVE_DIR/regular.cosi" + ls -alh "$SERVE_DIR" + + # Install the shared SSH private key (from the + # PrepareSSHKeys artifact) as the test framework's + # helpers/key. Its matching public key was baked into the + # AZL4 COSI at MIC build time, so post-install we can SSH + # into the target as testing-user with this key. + KEY_SRC="$(Build.ArtifactStagingDirectory)/ssh-keys/id_rsa" + if [ ! -f "$KEY_SRC" ]; then + echo "Could not find shared SSH key at $KEY_SRC. Artifact contents:" + find "$(Build.ArtifactStagingDirectory)/ssh-keys" -type f + exit 1 + fi + cp "$KEY_SRC" "$(TRIDENT_SOURCE_DIR)/tests/e2e_tests/helpers/key" + chmod 600 "$(TRIDENT_SOURCE_DIR)/tests/e2e_tests/helpers/key" + # Convert the shared SSH key to PEM if it isn't already + # (matches what trident-prep does for AZL3 keys). + # `ssh-keygen -p -P "" -N "" -m PEM -f ...` is a no-op on + # already-PEM keys. `-P ""` explicitly tells ssh-keygen + # that the existing passphrase is empty, so it doesn't + # read stdin if it can't guess; `-N ""` keeps the key + # passphrase-less. 
+ ssh-keygen -p -P "" -N "" -m PEM -f "$(TRIDENT_SOURCE_DIR)/tests/e2e_tests/helpers/key" + ls -alh "$(TRIDENT_SOURCE_DIR)/artifacts/iso" + ls -alh "$(TRIDENT_SOURCE_DIR)/bin" + displayName: "Stage AZL4 COSI as regular.cosi" + + - bash: | + set -eux + # Disable virtlogd rollover so we keep full logs. + echo "max_size = 0" | sudo tee -a /etc/libvirt/virtlogd.conf + sudo systemctl restart virtlogd.socket + + ./tools/virt-deploy create --mem 12 --disks 32,32 + workingDirectory: $(TRIDENT_SOURCE_DIR) + displayName: "Create virt-deploy VM" + + - bash: | + set -euxo pipefail + + TRIDENT_CONFIG="$(TRIDENT_SOURCE_DIR)/$(tridentConfigPath)/trident-config.yaml" + + # Run netlaunch in the background so we can monitor its log + # for the install-success marker. The base-azl4 cosi does + # not yet ship trident systemd units (trident.service, + # tridentd.socket) so the installed AZL4 system never + # phones home post-reboot. netlaunch's ListenLoop always + # waits for at least one phone-home regardless of flags, + # so we treat trident's own "Rebooting system" log line + # (emitted by crates/trident/src/reboot.rs from the install + # success path) as our completion signal and terminate + # netlaunch cleanly. Phone-home wiring belongs with the + # VM-testing work where the trident systemd stack lands. + # + # netlaunch is launched with `setsid` so we can signal the + # whole process group on shutdown — otherwise the child + # HTTP/TFTP server processes get reparented to PID 1 and + # may leak ports / qcow2 file locks to the next job on the + # same agent. + setsid ./bin/netlaunch \ + --iso ./artifacts/iso/${{ parameters.installerISOArtifact }}.iso \ + --config $(TRIDENT_SOURCE_DIR)/tools/vm-netlaunch.yaml \ + --trident "$TRIDENT_CONFIG" \ + --servefolder ./artifacts/test-image \ + --logstream \ + --force-color \ + --full-logstream logstream-full.log \ + --only-print-exit-code \ + --port $(netlaunchPort) > ./clean-install-azl4.log 2>&1 & + NETLAUNCH_PID=$! 
+ NETLAUNCH_PGID="$NETLAUNCH_PID" + echo "netlaunch pid: $NETLAUNCH_PID (pgid $NETLAUNCH_PGID)" + + # Watch for the install-success marker for up to 12 minutes. + # Real install completes in 1-3 minutes once netlaunch + # finishes booting the MOS ISO via HTTP boot. UEFI HTTP + # boot can occasionally need 5+ minutes, so 12 minutes is + # generous. + # + # The marker regex is tightened to trident's own + # log-record prefix to avoid false-positives from any + # kernel / systemd / dracut "Restarting system" line that + # might fire on an error path before trident itself + # actually completes. + REBOOT_RE='trident[^[:space:]]*[[:space:]]+(INFO|WARN)[[:space:]].*Rebooting system' + FATAL_RE='kernel panic|dracut:.*FATAL|Emergency mode|emergency!' + DEADLINE=$((SECONDS + 720)) + INSTALL_OK=0 + while [ $SECONDS -lt $DEADLINE ]; do + if grep -Eq "$FATAL_RE" ./clean-install-azl4.log 2>/dev/null; then + echo "FATAL marker observed before install success — aborting" + break + fi + if ! kill -0 $NETLAUNCH_PID 2>/dev/null; then + echo "netlaunch exited on its own" + if wait $NETLAUNCH_PID; then + INSTALL_OK=1 + fi + break + fi + if grep -Eq "$REBOOT_RE" ./clean-install-azl4.log 2>/dev/null; then + echo "install completed (saw trident 'Rebooting system' marker)" + INSTALL_OK=1 + break + fi + sleep 10 + done + + # Always show the netlaunch log tail for diagnostics + echo "--- netlaunch log tail ---" + tail -50 ./clean-install-azl4.log || true + + if [ $INSTALL_OK -eq 1 ]; then + echo "Killing netlaunch process group (install completed; not waiting for phone-home)" + # SIGTERM the whole group; netlaunch's children include + # an HTTP/TFTP server we need to release the port on. + kill -TERM -"$NETLAUNCH_PGID" 2>/dev/null || true + # Generous grace so --full-logstream finishes flushing + # to logstream-full.log (which the failure-diagnostic + # display-logs step uploads). + for _ in 1 2 3 4 5 6 7 8 9 10; do + if ! 
kill -0 $NETLAUNCH_PID 2>/dev/null; then + break + fi + sleep 1 + done + kill -KILL -"$NETLAUNCH_PGID" 2>/dev/null || true + wait $NETLAUNCH_PID 2>/dev/null || true + exit 0 + fi + + echo "Install marker not observed within timeout (or fatal seen)" + kill -TERM -"$NETLAUNCH_PGID" 2>/dev/null || true + sleep 5 + kill -KILL -"$NETLAUNCH_PGID" 2>/dev/null || true + exit 1 + workingDirectory: $(TRIDENT_SOURCE_DIR) + displayName: "🚀 Run netlaunch (AZL3 ISO installs AZL4 COSI)" + # 14 minutes covers the 12-minute install-success watcher + # plus a couple minutes of slack. + timeoutInMinutes: 14 + + - bash: | + set -eux + sudo ./bin/storm-trident helper wait-for-login -a \ + --vm-name "$(jq -r '.virtualmachines[0].name' $(TRIDENT_SOURCE_DIR)/tools/virt-deploy-metadata.json)" \ + --artifacts-folder "$(ob_outputDirectory)" + timeoutInMinutes: 5 + # `succeeded()` (not `succeededOrFailed()`) so a failed + # SSH-up after a "successful" netlaunch actually fails the + # stage. Combined with the tightened install marker above, + # this closes the structural bias-toward-green where the + # netlaunch wrapper could exit 0 on a false-positive log + # line and let everything downstream gloss over the failure. 
+ condition: succeeded() + workingDirectory: $(TRIDENT_SOURCE_DIR) + displayName: "📄 Wait for target OS to be reachable" + + - bash: | + set -eux + ./bin/storm-trident script capture-screenshot \ + --screenshot-filename "install-azl4.png" \ + --artifacts-folder "$(ob_outputDirectory)" + displayName: "📷 Capture screenshot" + workingDirectory: $(TRIDENT_SOURCE_DIR) + condition: succeededOrFailed() + + - bash: | + set -eux + sudo ./bin/storm-trident helper display-logs -a \ + --serial-log-artifact-file-name "azl4-install-target-os-A-serial.log" \ + --trident-trace-log-file "$(TRIDENT_SOURCE_DIR)/logstream-full.log" \ + --artifacts-folder "$(ob_outputDirectory)" + displayName: "📄 Display install logs" + workingDirectory: $(TRIDENT_SOURCE_DIR) + condition: succeededOrFailed() + + - bash: | + set -eux + sudo virsh shutdown virtdeploy-vm-0 || true + mkdir -p $(ob_outputDirectory) + sudo cp /var/lib/libvirt/images/virtdeploy-pool/virtdeploy-vm-0-0-volume.qcow2 $(ob_outputDirectory)/ || true + sudo zstd -T0 $(ob_outputDirectory)/virtdeploy-vm-0-0-volume.qcow2 || true + sudo cp $(TRIDENT_SOURCE_DIR)/tests/e2e_tests/helpers/key $(ob_outputDirectory) || true + # Owner-only readable. Previously this was `chmod 777` + # which produced a SARIF-flaggable artifact even though + # the key is per-build ephemeral. 
+ sudo chmod 600 $(ob_outputDirectory)/key || true + workingDirectory: $(TRIDENT_SOURCE_DIR) + condition: failed() + displayName: "Publish OS disk on failure" + + - template: ../testing_common/fix-output-directory-for-one-branch-step.yml + parameters: + outputDir: $(ob_outputDirectory) + condition: always() diff --git a/tests/images/trident-vm-testimage/base/scripts/strip-selinux-xattrs.sh b/tests/images/trident-vm-testimage/base/scripts/strip-selinux-xattrs.sh new file mode 100644 index 000000000..aaa8f3844 --- /dev/null +++ b/tests/images/trident-vm-testimage/base/scripts/strip-selinux-xattrs.sh @@ -0,0 +1,85 @@ +#!/bin/bash +# Strip security.selinux xattrs from every file in the cosi. +# +# Background: AZL4's base VHDX is built by the upstream Azure Linux build +# process with SELinux file labels baked in (e.g. system_u:object_r:lib_t:s0). +# Even though this test image sets `selinux: mode: disabled`, MIC does not +# strip the inherited xattrs — `mode: disabled` only controls boot-time +# SELinux state. +# +# These labels become a problem when Trident installs this cosi from the +# AZL3 MOS environment: MOS boots with selinux=1 enforcing=0, loads its +# AZL3 policy, and dracut (running inside the chroot) tries to preserve +# the AZL4 labels via cp -a. The MOS-side SELinux LSM validates the +# context being written and rejects labels not in its policy. dracut +# cascades through hundreds of "cp: setting attribute 'security.selinux': +# Permission denied" errors, eventually fatally on dracut-install's ldd +# step. +# +# Stripping the xattrs at cosi build time sidesteps this entirely: +# - During MIC build, SELinux is not loaded inside the chroot, so +# setfattr -x works without policy interference. +# - During Trident install in MOS, cp -a finds no security.selinux to +# preserve and skips the setxattr call. +# - On first boot of the installed AZL4 OS, files get auto-relabeled if +# SELinux is enabled (which our test config disables anyway). 
+# +# Once AZL4 is the install/target environment for everything (no AZL3 MOS +# bridging it), this script can be removed. + +set -euo pipefail + +echo "Stripping security.selinux xattrs from rootfs..." + +# Walk every regular file, symlink, and directory across all filesystems +# under /. `find -xdev` would skip separately-mounted filesystems like +# `/boot` and `/var` that MIC commonly composes with — and `/boot` +# specifically carries SELinux labels on the kernel image and initramfs, +# which is exactly what dracut touches during AZL3 MOS install of the +# AZL4 cosi. So we walk the whole tree and only prune the virtual +# filesystems where xattrs don't make sense (`/proc`, `/sys`, `/dev`, +# `/run`). +# +# `setfattr` follows symlinks by default; `-h` makes it operate on the +# symlink itself, which is what we want here. +count=0 +fail_count=0 +while IFS= read -r -d '' f; do + # Capture stderr so we can distinguish the benign outcomes — ENODATA ("No such attribute") and + # EOPNOTSUPP ("Operation not supported", filesystem has no xattr support) — from real failures such as EPERM. + err=$(setfattr -h -x security.selinux "$f" 2>&1 >/dev/null) || rc=$? && rc=${rc:-0} + if [ "$rc" -eq 0 ]; then + count=$((count + 1)) + elif echo "$err" | grep -qE "No such attribute|Operation not supported"; then + : # nothing to strip: the xattr is absent or the filesystem cannot hold xattrs + else + fail_count=$((fail_count + 1)) + echo "setfattr failed on '$f': $err" >&2 + fi + rc=0 +done < <(find / \( -path /proc -o -path /sys -o -path /dev -o -path /run \) -prune \ + -o \( -type f -o -type d -o -type l \) -print0) + +echo "Stripped security.selinux from ${count} files/dirs" + +if [ "$fail_count" -gt 0 ]; then + echo "ERROR: setfattr failed (non-ENODATA) on ${fail_count} entries" >&2 + exit 1 +fi + +# Verify the strip actually took effect by scanning a representative set +# of paths (/etc, /usr/lib/systemd, /usr/bin, and /boot if present). 
Any +# residual security.selinux means we missed something — fail loudly +# rather than warning, since the whole point of the script is to leave +# the image bare. +sentinel_dirs=( "/etc" "/usr/lib/systemd" "/usr/bin" ) +if [ -d /boot ]; then + sentinel_dirs+=( "/boot" ) +fi +for d in "${sentinel_dirs[@]}"; do + if getfattr -R -m security.selinux "$d" 2>/dev/null | grep -q security.selinux; then + echo "ERROR: security.selinux xattr still present under '$d'" >&2 + getfattr -R -m security.selinux "$d" 2>/dev/null | head -10 >&2 + exit 1 + fi +done From 8b158d11b378dd6abd8ca87f91e44806610649da Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Tue, 2 Jun 2026 17:40:44 -0700 Subject: [PATCH 22/26] engineering: Add AZL4 qcow2 base image, offline-init, sfdisk hardening Adds sfdisk partition-table helper, extended offline-init for AZL4 qcow2 images, base image COSI config, and test helper scripts. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- crates/osutils/src/sfdisk.rs | 55 +++++ crates/trident/src/init/offline/mod.rs | 146 +++++++++++-- tests/e2e_tests/base_test.py | 3 + .../base/baseimg-grub-azl4.yaml | 195 ++++++++++++++++++ .../scripts/enable-trident-service-azl4.sh | 35 ++++ .../base/scripts/rebuild-initrd-azl4.sh | 62 ++++++ .../base/scripts/update-host-status-azl4.sh | 16 ++ 7 files changed, 498 insertions(+), 14 deletions(-) create mode 100644 tests/images/trident-vm-testimage/base/baseimg-grub-azl4.yaml create mode 100644 tests/images/trident-vm-testimage/base/scripts/enable-trident-service-azl4.sh create mode 100644 tests/images/trident-vm-testimage/base/scripts/rebuild-initrd-azl4.sh create mode 100644 tests/images/trident-vm-testimage/base/scripts/update-host-status-azl4.sh diff --git a/crates/osutils/src/sfdisk.rs b/crates/osutils/src/sfdisk.rs index 81eef21c7..f40276ad9 100644 --- a/crates/osutils/src/sfdisk.rs +++ b/crates/osutils/src/sfdisk.rs @@ -197,6 +197,61 @@ pub fn get_disk_uuid(disk: &Path) -> Result, Error> { 
Ok(Some(uuid)) } +/// Sets the disk-id (GPT header DiskGUID) of the given disk via sfdisk. +/// +/// `uuid` must parse as a valid GUID; this is checked before invoking +/// sfdisk so an accidental flag-like string (e.g. `--foo`) is rejected +/// here rather than mis-interpreted by sfdisk as an option. +/// +/// `--no-reread` + `--no-tell-kernel` are passed because the typical +/// caller is `trident offline-initialize` inside MIC's chroot, where +/// the disk's partitions are bind-mounted into the chroot. Requesting +/// `BLKRRPART` on a disk with mounted partitions returns EBUSY; we +/// only care about updating the on-disk GPT here. +pub fn set_disk_uuid(disk: &Path, uuid: &str) -> Result<(), Error> { + uuid::Uuid::parse_str(uuid) + .with_context(|| format!("'{uuid}' is not a valid GUID for sfdisk --disk-id"))?; + Dependency::Sfdisk + .cmd() + .arg("--no-reread") + .arg("--no-tell-kernel") + .arg("--disk-id") + .arg(disk) + .arg(uuid) + .run_and_check() + .context(format!( + "Failed to set disk-id on {} to {uuid}", + disk.display() + ))?; + Ok(()) +} + +/// Sets the GPT partition UUID for a specific partition by number on the +/// given disk. +/// +/// `uuid` is validated as a GUID first to avoid sfdisk mis-interpreting +/// a flag-like argument. `--no-reread` / `--no-tell-kernel` mirror +/// [`set_disk_uuid`] for safety inside MIC chroots with mounted +/// partitions. 
+pub fn set_part_uuid(disk: &Path, partition_number: usize, uuid: &str) -> Result<(), Error> { + uuid::Uuid::parse_str(uuid) + .with_context(|| format!("'{uuid}' is not a valid GUID for sfdisk --part-uuid"))?; + Dependency::Sfdisk + .cmd() + .arg("--no-reread") + .arg("--no-tell-kernel") + .arg("--part-uuid") + .arg(disk) + .arg(partition_number.to_string()) + .arg(uuid) + .run_and_check() + .context(format!( + "Failed to set partition UUID on {} partition {partition_number} to {uuid}", + disk.display() + ))?; + Ok(()) +} + #[cfg(test)] mod tests { use super::*; diff --git a/crates/trident/src/init/offline/mod.rs b/crates/trident/src/init/offline/mod.rs index cdbeee23d..ef3d03872 100644 --- a/crates/trident/src/init/offline/mod.rs +++ b/crates/trident/src/init/offline/mod.rs @@ -11,7 +11,7 @@ use anyhow::{bail, Error}; use log::{debug, info, trace, warn}; use maplit::hashmap; -use osutils::lsblk; +use osutils::{lsblk, sfdisk}; use sysdefs::partition_types::DiscoverablePartitionType; use trident_api::{ config::{ @@ -256,22 +256,131 @@ fn generate_host_status( .structured(ExecutionEnvironmentMisconfigurationError::PrismChrootEnvironment) .message("Failed to find root device in lsblk output")?; - let disk_uuid = lsblk_device + let disk_uuid = match lsblk_device .ptuuid .clone() .and_then(|ptuuid| ptuuid.as_uuid()) - .structured(ExecutionEnvironmentMisconfigurationError::PrismChrootEnvironment) - .message("No UUID found for root device")?; + { + Some(uuid) => uuid, + None => { + // lsblk didn't surface a PTUUID. This can happen in chroot + // environments (e.g. image-customizer / MIC) where the + // exposed loop device has partition children but the GPT + // disk-id either isn't set on the partition table or isn't + // populated by lsblk's PTUUID column. Fall back to sfdisk + // (which reads the GPT directly), and if that also reports + // no disk-id, mint one and persist it so the resulting + // image carries it forward to runtime. 
+ let disk_dev_path = PathBuf::from("/dev").join(&lsblk_device.name); + warn!( + "PTUUID not reported by lsblk for {}; falling back to sfdisk", + disk_dev_path.display() + ); + let from_sfdisk = sfdisk::get_disk_uuid(&disk_dev_path) + .structured(ExecutionEnvironmentMisconfigurationError::PrismChrootEnvironment) + .message("Failed to read GPT disk-id via sfdisk")? + .and_then(|u| u.as_uuid()); + match from_sfdisk { + Some(uuid) => uuid, + None => { + let new_uuid = uuid::Uuid::new_v4(); + warn!( + "No GPT disk-id present on {}; assigning {}", + disk_dev_path.display(), + new_uuid + ); + sfdisk::set_disk_uuid(&disk_dev_path, &new_uuid.to_string()) + .structured( + ExecutionEnvironmentMisconfigurationError::PrismChrootEnvironment, + ) + .message(format!( + "Failed to assign GPT disk-id on {}", + disk_dev_path.display() + ))?; + new_uuid + } + } + } + }; lsblk_device.children.sort_by_key(|p| p.partn); - for (i, part) in lsblk_device.children.iter().enumerate() { - if part.part_uuid.is_none() { - return Err(TridentError::new( - ExecutionEnvironmentMisconfigurationError::PrismChrootEnvironment, - )) - .message(format!("No part UUID found for partition {}", i + 1)); + // Compute disk_dev_path once for partition-UUID fallback below. + let disk_dev_path = PathBuf::from("/dev").join(&lsblk_device.name); + + // For each partition, ensure we have a usable PARTUUID. Mirror the + // disk-id fallback above: prefer lsblk, then sfdisk, then mint a + // fresh one and persist it via sfdisk. Some chroot environments + // don't surface PARTUUID via lsblk --output-all and may also leave + // the value unset on the underlying GPT. 
+ for (i, part) in lsblk_device.children.iter_mut().enumerate() { + if part.part_uuid.as_ref().and_then(|u| u.as_uuid()).is_some() { + continue; } + let partn = part.partn.unwrap_or((i + 1) as u32) as usize; + warn!( + "PARTUUID not reported by lsblk for partition {} on {}; falling back to sfdisk", + partn, + disk_dev_path.display() + ); + // Re-read the disk via sfdisk -J to find any UUID already present + // on this partition (sfdisk reads the GPT directly). + let sf_info = sfdisk::SfDisk::get_info(&disk_dev_path) + .structured(ExecutionEnvironmentMisconfigurationError::PrismChrootEnvironment) + .message(format!( + "Failed to read GPT info via sfdisk for {}", + disk_dev_path.display() + ))?; + if let Some(existing) = sf_info + .partitions + .iter() + .find(|p| p.number == partn) + .and_then(|p| p.id.as_uuid()) + { + // Use the canonical form sfdisk reported, not a re-rendered + // copy — sfdisk normalizes UUIDs to upper-case on disk and + // downstream /dev/disk/by-partuuid/ lookups must match. + part.part_uuid = Some(existing.to_string().into()); + continue; + } + + let new_uuid = uuid::Uuid::new_v4(); + warn!( + "Partition {} on {} has no PARTUUID; assigning {}", + partn, + disk_dev_path.display(), + new_uuid + ); + sfdisk::set_part_uuid(&disk_dev_path, partn, &new_uuid.to_string()) + .structured(ExecutionEnvironmentMisconfigurationError::PrismChrootEnvironment) + .message(format!( + "Failed to assign PARTUUID on partition {} of {}", + partn, + disk_dev_path.display() + ))?; + + // Re-read to get sfdisk's canonical on-disk form (upper-case) + // rather than stamping our locally-generated lower-case Uuid. + // Avoids a subtle case-mismatch with udev's + // /dev/disk/by-partuuid/ symlinks. + let written_uuid = sfdisk::SfDisk::get_info(&disk_dev_path) + .structured(ExecutionEnvironmentMisconfigurationError::PrismChrootEnvironment) + .message(format!( + "Failed to re-read GPT info via sfdisk for {} after writing partition UUID", + disk_dev_path.display() + ))? 
+ .partitions + .iter() + .find(|p| p.number == partn) + .and_then(|p| p.id.as_uuid()) + .ok_or_else(|| { + TridentError::new(ExecutionEnvironmentMisconfigurationError::PrismChrootEnvironment) + }) + .message(format!( + "sfdisk reported no PARTUUID for partition {} after writing {}", + partn, new_uuid + ))?; + part.part_uuid = Some(written_uuid.to_string().into()); } // Get partition paths created from combining Prism history and lsblk output. @@ -494,12 +603,21 @@ pub fn execute( trace!("Prism history contents:\n{history_file}"); + // Note: `disk` is the *runtime* device path that will be written + // into the datastore (e.g. /dev/sda). At build time inside Prism's + // chroot, this path generally does not exist because the disk is + // exposed as a loop device (the actual build-time device is + // auto-detected below by walking lsblk for the mount at "/"). + // Older code asserted that `disk` exist at build time, but that + // check tested the wrong invariant and broke AZL4 image builds + // where MIC does not bind a /dev/sda node into the chroot. 
let disk_path = Path::new(disk); if !disk_path.exists() { - return Err(TridentError::new( - ExecutionEnvironmentMisconfigurationError::PrismChrootEnvironment, - )) - .message(format!("Prism chroot environment doesn't contain {disk}")); + debug!( + "Runtime disk path {} not present in build environment; \ + this is expected when running inside MIC's chroot.", + disk_path.display() + ); } let history: Vec = diff --git a/tests/e2e_tests/base_test.py b/tests/e2e_tests/base_test.py index 3e6094805..884e2dcec 100644 --- a/tests/e2e_tests/base_test.py +++ b/tests/e2e_tests/base_test.py @@ -421,6 +421,9 @@ def test_users(connection, hostConfiguration): expected_users = list() expected_groups = dict() + if "os" not in hostConfiguration or "users" not in hostConfiguration.get("os", {}): + pytest.skip("No os.users in trident config (user baked into image by MIC)") + for user_info in hostConfiguration["os"]["users"]: expected_users.append(user_info["name"]) if "groups" in user_info: diff --git a/tests/images/trident-vm-testimage/base/baseimg-grub-azl4.yaml b/tests/images/trident-vm-testimage/base/baseimg-grub-azl4.yaml new file mode 100644 index 000000000..c601b34d9 --- /dev/null +++ b/tests/images/trident-vm-testimage/base/baseimg-grub-azl4.yaml @@ -0,0 +1,195 @@ +# Base image config for trident-vm-grub-testimage-azl4. +# +# This builds the BOOTABLE base qcow2 that storm-trident rollback tests +# start the VM from. After this qcow2 boots, trident is installed and ready +# to drive A/B updates to the .cosi produced by updateimg-grub-azl4.yaml. +# +# Layout mirrors AZL3's baseimg-grub.yaml (A/B partitions) but uses +# AZL4-specific package names (dnf5, grub2-efi-x64, shim, etc.) matching +# the updateimg-grub-azl4.yaml flavor. +# +# TODO(azl4-rpm): Drop the trident + osmodifier additionalFiles entries +# once an AZL4 trident-service RPM and azurelinux-image-tools-osmodifier +# RPM are published. Until then we bake the binaries inline. 
+ +storage: + disks: + - partitionTableType: gpt + maxSize: 10G + partitions: + - id: esp + type: esp + # 64M (vs AZL3's 16M) because AZL4 ships larger grub/shim + # binaries (~5MB grubx64.efi) and trident's offline-init + # copies them to both /boot/efi/EFI/AZLA and /AZLB. + size: 64M + + - id: root-a + size: 4G + + - id: root-b + size: 4G + + - id: trident + size: 1G + + - id: srv + size: grow + + bootType: efi + + filesystems: + - deviceId: esp + type: fat32 + mountPoint: + path: /boot/efi + options: umask=0077 + + - deviceId: root-a + type: ext4 + mountPoint: / + + - deviceId: trident + type: ext4 + mountPoint: /var/lib/trident + + - deviceId: srv + type: ext4 + mountPoint: /srv + +os: + bootloader: + resetType: hard-reset + hostname: trident-vm-testimg + + selinux: + mode: disabled + + kernelCommandLine: + # Mirrors AZL3 baseimg-grub.yaml; same console + debug settings so + # serial output works the same on both flavors. `net.ifnames=0` + # keeps interface naming as eth0/eth1/... so the + # `99-dhcp-eth0.network` systemd-networkd config matches the only + # virtio NIC the qemu test VM ships with. + extraCommandLine: + - console=tty0 + - console=tty1 + - console=ttyS0 + - net.ifnames=0 + - rd.debug + - loglevel=6 + - log_buf_len=1M + - systemd.journald.forward_to_console=1 + + packages: + install: + # AZL4 equivalents of the AZL3 set. See updateimg-grub-azl4.yaml + # for the rationale on each substitution. + - curl + - dnf5 + - efibootmgr + - grub2-efi-x64 + - grub2-efi-x64-modules + - grub2-tools + - grub2-tools-efi + - iproute + - iptables-nft + - jq + - lsof + - netplan + - openssh-server + - shim + - sudo + - systemd-networkd + - systemd-resolved + - vim + + services: + enable: + - sshd + - systemd-networkd + - systemd-resolved + # Trident socket-activated daemon. Storm-trident drives all + # update/commit/rollback through `trident grpc-client ...` which + # talks to this socket. + - tridentd.socket + # Oneshot trident commit at boot. 
Marks A/B update commits when + # they complete after reboot. + - trident.service + + additionalFiles: + # TODO(azl4-rpm): replace these binary copies and unit-file copies + # with `packages.install: - trident-service` once the RPM is + # published for AZL4. + - source: trident-bin/trident + destination: /usr/bin/trident + permissions: "755" + # TODO(azl4-osmodifier-rpm): replace with + # `packages.install: - azurelinux-image-tools-osmodifier` + # once the RPM is published. + - source: osmodifier-bin/osmodifier + destination: /usr/bin/osmodifier + permissions: "755" + + # Trident systemd units. AZL3 gets these from the trident-service + # RPM; AZL4 doesn't have that RPM yet so we ship them inline. The + # contents come straight from packaging/systemd/ in this repo so a + # source change requires a re-build of the qcow2 to pick up. + - source: ../../../../packaging/systemd/trident.service + destination: /usr/lib/systemd/system/trident.service + - source: ../../../../packaging/systemd/tridentd.service + destination: /usr/lib/systemd/system/tridentd.service + - source: ../../../../packaging/systemd/tridentd.socket + destination: /usr/lib/systemd/system/tridentd.socket + + # AZL4 lacks a /usr/bin/hostname binary; the pytest framework + # smoke-tests SSH with `hostname`, so we ship a tiny shim. 
+ - source: files/hostname-shim.sh + destination: /usr/local/bin/hostname + permissions: "755" + - source: files/sudoers-wheel + destination: /etc/sudoers.d/wheel + - source: files/99-dhcp-eth0.network + destination: /etc/systemd/network/99-dhcp-eth0.network + - source: files/regen-sshd-keys.service + destination: /etc/systemd/system/regen-sshd-keys.service + + users: + - name: testing-user + sshPublicKeyPaths: + - files/id_rsa.pub + secondaryGroups: + - wheel + +scripts: + postCustomization: + # Mirrors AZL3's baseimg-grub.yaml ordering: post-install runs + # first, then we bake the trident datastore at build time (so first + # boot is fast and storm-trident can immediately drive updates), + # then ssh + network housekeeping, then initrd rebuild + xattr + # strip last. + - path: scripts/post-install.sh + # Bake trident's hoststatus into the datastore at build time. AZL3 + # does this via update-host-status.sh; AZL4 uses the same pattern + # via update-host-status-azl4.sh. Requires trident's offline-init + # to tolerate the absence of /dev/sda inside MIC's chroot (the + # `disk` argument is a runtime label, not a build-time assertion); + # the fix lives in crates/trident/src/init/offline/mod.rs. + - path: scripts/update-host-status-azl4.sh + - path: scripts/enable-trident-service-azl4.sh + - path: scripts/ssh-move-host-keys-azl4.sh + - path: scripts/enable-regen-sshd-keys.sh + # Rebuild initramfs with --no-hostonly + extra SATA drivers so the + # qcow2 boots regardless of which bus the consumer's libvirt config + # picks (storm-trident uses bus=sata; the original boot test on + # karhu-ubuntu used bus=virtio). MUST run BEFORE strip-selinux-xattrs + # because dracut writes new files with the build-time SELinux + # context, and we want those stripped too. + - path: scripts/rebuild-initrd-azl4.sh + # Strip security.selinux xattrs from all files. 
See updateimg-grub- + # azl4.yaml for the parallel write-up; the same MOS-side AZL3 + # SELinux policy rejects AZL4 contexts when any future operation + # tries to preserve them. Keeping the qcow2 label-free is defensive. + # MUST run LAST so it sweeps any files produced by earlier scripts + # (initrd, etc.). + - path: scripts/strip-selinux-xattrs.sh diff --git a/tests/images/trident-vm-testimage/base/scripts/enable-trident-service-azl4.sh b/tests/images/trident-vm-testimage/base/scripts/enable-trident-service-azl4.sh new file mode 100644 index 000000000..29889ea58 --- /dev/null +++ b/tests/images/trident-vm-testimage/base/scripts/enable-trident-service-azl4.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# Defensive enable of trident.service and tridentd.socket. +# +# AZL3 gets these via the trident-service RPM's %systemd_post scriptlet. +# AZL4 doesn't have that RPM yet, so we ship the units via additionalFiles +# and *should* be able to rely on baseimg-grub-azl4.yaml's `services.enable:` +# stanza. In practice, `services.enable` did not create the +# multi-user.target.wants/trident.service symlink in MIC AZL4 builds +# (build 1120959 showed multi-user.target reached but trident.service +# never started post-reboot, leaving servicingState stuck at +# ab-update-finalized). Until we figure out why, manually link the +# units defensively. +# +# tridentd.socket gets the same treatment because (a) if services.enable +# is unreliable for one unit, it's likely unreliable for the other, and +# (b) storm-trident drives every update/commit/rollback through the +# tridentd gRPC socket — a missing /run/trident/trident.sock at boot +# would fail every subsequent storm-trident invocation in the test +# pipeline. 
+set -euxo pipefail + +mkdir -p /etc/systemd/system/multi-user.target.wants +mkdir -p /etc/systemd/system/sockets.target.wants +ln -sf /usr/lib/systemd/system/trident.service \ + /etc/systemd/system/multi-user.target.wants/trident.service +ln -sf /usr/lib/systemd/system/tridentd.socket \ + /etc/systemd/system/sockets.target.wants/tridentd.socket + +# Belt and braces: log the enabled state for diagnostics. systemctl is-enabled +# may fail inside MIC's chroot without a running dbus, so don't gate the +# script on it. +systemctl is-enabled trident.service 2>&1 || true +systemctl is-enabled tridentd.socket 2>&1 || true +ls -l /etc/systemd/system/multi-user.target.wants/trident.service || true +ls -l /etc/systemd/system/sockets.target.wants/tridentd.socket || true diff --git a/tests/images/trident-vm-testimage/base/scripts/rebuild-initrd-azl4.sh b/tests/images/trident-vm-testimage/base/scripts/rebuild-initrd-azl4.sh new file mode 100644 index 000000000..b07b3a8c0 --- /dev/null +++ b/tests/images/trident-vm-testimage/base/scripts/rebuild-initrd-azl4.sh @@ -0,0 +1,62 @@ +#!/bin/bash +# Regenerate initrd with --no-hostonly so all storage drivers are +# included, not just the ones MIC's build environment happens to need. +# +# Why: storm-trident's rollback test (tools/storm/utils/vm/qemu/qemu.go) +# attaches the qcow2 to a virt-install VM with `bus=sata`. MIC builds +# the qcow2 in a virtio-backed environment, so dracut's default +# hostonly mode produces an initramfs with only virtio drivers. On a +# SATA-backed boot, the initramfs can't find the root partition by +# UUID and systemd hangs forever waiting for /dev/disk/by-uuid/. +# +# Rebuilding with --no-hostonly bakes in ahci, ata_piix, sata_sil, etc. +# along with virtio so the same qcow2 boots regardless of the bus type +# the consumer chooses. 
+# +# Runs inside the MIC chroot where /sys and /proc are bind-mounted but +# the host's SELinux is not loaded (MIC strips that), so dracut's +# cp -a doesn't hit the security.selinux setxattr issue that bites in +# AZL3 MOS during install (see strip-selinux-xattrs.sh for the parallel +# write-up). + +set -euo pipefail + +# Find the kernel version installed in this image. We require exactly +# one — `ls | head -1` would silently pick the wrong one if any future +# AZL4 variant ships multiple (kernel + kernel-hyperv, extramodules-*, +# etc.). Fail loudly rather than generate an initramfs for the wrong +# kernel: the failure mode of that misstep is "boot hangs waiting for +# /dev/disk/by-uuid/", which is the exact bug this script is +# meant to prevent. +KVERS=( /usr/lib/modules/* ) +case ${#KVERS[@]} in + 0) + echo "ERROR: no kernel modules dir under /usr/lib/modules" >&2 + exit 1 + ;; + 1) + KVER=$(basename "${KVERS[0]}") + ;; + *) + echo "ERROR: expected exactly one kernel under /usr/lib/modules, found:" >&2 + printf ' %s\n' "${KVERS[@]}" >&2 + exit 1 + ;; +esac +echo "Regenerating initramfs for kernel $KVER with --no-hostonly" + +# `--no-hostonly` includes all storage modules; `--no-hostonly-cmdline` +# prevents dracut from baking the build-host's /proc/cmdline parameters +# into the initramfs (which would fight the qcow2's grub cmdline at +# runtime); `--reproducible` keeps the output bit-stable across builds +# so we can detect spurious regenerations. 
+dracut \ + --no-hostonly \ + --no-hostonly-cmdline \ + --reproducible \ + --add-drivers "ahci ata_piix sata_sil sata_nv sata_via sd_mod" \ + --force \ + --kver "$KVER" + +echo "Regenerated initramfs:" +ls -lh /boot/initramfs-* diff --git a/tests/images/trident-vm-testimage/base/scripts/update-host-status-azl4.sh b/tests/images/trident-vm-testimage/base/scripts/update-host-status-azl4.sh new file mode 100644 index 000000000..a2cfbe27f --- /dev/null +++ b/tests/images/trident-vm-testimage/base/scripts/update-host-status-azl4.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# AZL4 equivalent of AZL3's update-host-status.sh. +# +# Runs inside MIC's chroot at qcow2 build time. Populates the trident +# datastore with the host status derived from Prism's history.json so +# the system boots ready for storm-trident to drive A/B updates -- no +# first-boot bootstrap, no datastore creation at runtime. +# +# Mirrors AZL3's pattern (scripts/update-host-status.sh, called from +# baseimg-grub.yaml). The trident binary in the chroot must understand +# that `--disk /dev/sda` is the runtime label and not a build-time +# existence assertion; see trident PR fixing the spurious check in +# crates/trident/src/init/offline/mod.rs. +set -euxo pipefail + +trident offline-initialize From 9761f99610d094c7efadbd1ef854ca63fd8de2b9 Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Mon, 8 Jun 2026 13:21:03 -0700 Subject: [PATCH 23/26] fix: Remove stale osmodifier additionalFile from baseimg osmodifier is now a Rust crate built into the trident binary (PR #638). No separate osmodifier binary needs to be baked into test images. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../trident-vm-testimage/base/baseimg-grub-azl4.yaml | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/tests/images/trident-vm-testimage/base/baseimg-grub-azl4.yaml b/tests/images/trident-vm-testimage/base/baseimg-grub-azl4.yaml index c601b34d9..6237475c2 100644 --- a/tests/images/trident-vm-testimage/base/baseimg-grub-azl4.yaml +++ b/tests/images/trident-vm-testimage/base/baseimg-grub-azl4.yaml @@ -8,9 +8,9 @@ # AZL4-specific package names (dnf5, grub2-efi-x64, shim, etc.) matching # the updateimg-grub-azl4.yaml flavor. # -# TODO(azl4-rpm): Drop the trident + osmodifier additionalFiles entries -# once an AZL4 trident-service RPM and azurelinux-image-tools-osmodifier -# RPM are published. Until then we bake the binaries inline. +# TODO(azl4-rpm): Drop the trident additionalFiles entries +# once an AZL4 trident-service RPM is published. Until then we bake +# the binary inline. storage: disks: @@ -124,12 +124,6 @@ os: - source: trident-bin/trident destination: /usr/bin/trident permissions: "755" - # TODO(azl4-osmodifier-rpm): replace with - # `packages.install: - azurelinux-image-tools-osmodifier` - # once the RPM is published. - - source: osmodifier-bin/osmodifier - destination: /usr/bin/osmodifier - permissions: "755" # Trident systemd units. AZL3 gets these from the trident-service # RPM; AZL4 doesn't have that RPM yet so we ship them inline. The From c14b94d560f635888910099fc01d97e9ddfddbb5 Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Tue, 2 Jun 2026 17:40:52 -0700 Subject: [PATCH 24/26] infra: Add AZL4 VM rollback test stage via storm-trident Adds AZL4 VM rollback test pipeline stage using storm-trident for automated rollback validation. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../testing_rollback/vm-testing-azl4.yml | 222 ++++++++++++++++++ 1 file changed, 222 insertions(+) create mode 100644 .pipelines/templates/stages/testing_rollback/vm-testing-azl4.yml diff --git a/.pipelines/templates/stages/testing_rollback/vm-testing-azl4.yml b/.pipelines/templates/stages/testing_rollback/vm-testing-azl4.yml new file mode 100644 index 000000000..b9aa7fb91 --- /dev/null +++ b/.pipelines/templates/stages/testing_rollback/vm-testing-azl4.yml @@ -0,0 +1,222 @@ +# AZL4 VM offline-init rollback test stage. +# +# Complement to testing_vm/netlaunch-testing-azl4.yml (BM-simulated install). +# This stage exercises the VM offline-init path: pre-baked AZL4 qcow2 boots +# directly, then storm-trident drives A/B update from the AZL4 COSI and +# tests rollback. +# +# Inputs (both built by other AZL4 stages in this pipeline): +# - trident-vm-grub-testimage-azl4-base.qcow2 (base qcow2 with trident +# systemd units + first-boot offline-init oneshot) +# - trident-vm-grub-testimage-azl4.cosi (update target, same cosi +# the BM-sim stage uses for fresh installs) +# +# Differences from testing_rollback/testing-template.yml: +# * No test matrix. One configuration: AZL4 rollback. +# * No extension testing (--skip-extension-testing). The AZL4 cosi +# doesn't ship the sysext machinery yet. +# * No netplan runtime testing (--skip-netplan-runtime-testing). +# base-azl4 trident-config omits the os: section because the AZL3 +# MOS install path doesn't have osmodifier available; the qcow2 +# base shouldn't need netplan runtime tweaks either. +# * No manual rollback testing (--skip-manual-rollbacks) for first +# iteration; add once basic A/B works. +# * No runtime updates (--skip-runtime-updates) for first iteration. +# +# When AZL4 grows a trident-service RPM, sysext / netplan / runtime +# variants will reuse the AZL3 testing-template.yml as a matrix entry. 
+ +parameters: + - name: baseQcowArtifact + type: string + default: "trident-vm-grub-testimage-azl4-base" + + - name: cosiArtifact + type: string + default: "trident-vm-grub-testimage-azl4" + + - name: dependsOnStage + type: string + default: "" + + - name: verboseLogging + type: boolean + default: true + + - name: pool + type: string + default: "trident-ubuntu-1es-pool-eastus2" + +stages: + - stage: RollbackTesting_AZL4 + displayName: Rollback Testing - AZL4 (VM offline-init) + dependsOn: + - BuildingTools + - PrepareSSHKeys + - TridentTestImg_trident_vm_grub_testimage_azl4 + - TridentTestImg_trident_vm_grub_testimage_azl4_base + - ${{ if ne(parameters.dependsOnStage, '') }}: + - ${{ parameters.dependsOnStage }} + + variables: + - group: servicing_testing_params + - name: SSH_PRIVATE_KEY_PATH + value: "$HOME/.ssh/id_rsa" + - name: SSH_PUBLIC_KEY_PATH + value: "$(SSH_PRIVATE_KEY_PATH).pub" + + jobs: + - job: RollbackTestingAzl4 + displayName: Rollback Testing AZL4 + timeoutInMinutes: 30 + pool: + type: linux + name: ${{ parameters.pool }} + hostArchitecture: amd64 + + variables: + ob_outputDirectory: /tmp/deployment_logs_azl4_rollback + ob_artifactBaseName: "rollback-testing-azl4" + + steps: + - template: ../common_tasks/checkout_trident.yml + + - task: DownloadPipelineArtifact@2 + displayName: "Download AZL4 base qcow2" + inputs: + buildType: current + artifactName: "${{ parameters.baseQcowArtifact }}" + targetPath: "$(Build.ArtifactStagingDirectory)/base-qcow" + + - task: DownloadPipelineArtifact@2 + displayName: "Download AZL4 update COSI" + inputs: + buildType: current + artifactName: "${{ parameters.cosiArtifact }}" + targetPath: "$(Build.ArtifactStagingDirectory)/update-cosi" + + - task: DownloadPipelineArtifact@2 + displayName: "Download SSH keys" + inputs: + buildType: current + artifactName: "ssh-keys" + targetPath: "$(Build.ArtifactStagingDirectory)/ssh" + + - task: DownloadPipelineArtifact@2 + displayName: "Download go-tools" + inputs: + buildType: 
current + artifactName: "go-tools" + patterns: | + netlisten + storm-trident + targetPath: "$(TRIDENT_SOURCE_DIR)/bin" + + - bash: | + set -euxo pipefail + chmod +x $(TRIDENT_SOURCE_DIR)/bin/netlisten + chmod +x $(TRIDENT_SOURCE_DIR)/bin/storm-trident + cp $(Build.ArtifactStagingDirectory)/ssh/id_rsa* ~/.ssh/ + # Targeted permissions on the keys we just staged. Avoid + # `chmod -R 700 ~/.ssh/` because self-hosted agents may + # reuse the directory across jobs and we shouldn't trample + # other tooling's known_hosts / config / id_*. + chmod 700 ~/.ssh/ || true + chmod 600 ~/.ssh/id_rsa + chmod 644 ~/.ssh/id_rsa.pub + mkdir -p $(ob_outputDirectory) + + # Both the qcow2 base build and the COSI build stage the + # shared 'ssh-keys' artifact into their MIC trees (see + # .pipelines/templates/stages/build_image/build-image-template-azl4.yml). + # So the pipeline-wide PrepareSSHKeys id_rsa we just + # copied to ~/.ssh/ matches both A-side and B-side of + # the test VM. No per-build key swap needed. + ls -l ~/.ssh/ + + # storm-trident expects the artifacts laid out under + # one directory. testimages.py output uses a clone-index + # suffix; rename to the conventional names storm-trident + # script prepare-images would produce. + ARTIFACTS=$(Build.ArtifactStagingDirectory)/storm-input + mkdir -p "$ARTIFACTS" + + # storm-trident's qemu deploy looks for a qcow2 matching the + # regex `^trident-vm-.*-testimage.qcow2$` (see + # tools/storm/utils/vm/qemu/qemu.go:34). Our build artifact + # is named trident-vm-grub-testimage-azl4-base.qcow2 which + # doesn't match (-base.qcow2 not -testimage.qcow2 at end). + # Stage it under a name that matches. + QCOW_SRC="" + for c in \ + "$(Build.ArtifactStagingDirectory)/base-qcow/trident-vm-grub-testimage-azl4-base_0.qcow2" \ + "$(Build.ArtifactStagingDirectory)/base-qcow/trident-vm-grub-testimage-azl4-base.qcow2"; do + if [ -f "$c" ]; then QCOW_SRC="$c"; break; fi + done + if [ -z "$QCOW_SRC" ]; then + echo "Could not find AZL4 base qcow2. 
Contents:" + find "$(Build.ArtifactStagingDirectory)/base-qcow" -type f + exit 1 + fi + cp "$QCOW_SRC" "$ARTIFACTS/trident-vm-azl4-base-testimage.qcow2" + + # storm-trident's rollback test looks for any *.cosi in the + # artifacts dir (see tools/storm/rollback/tests/rollback.go:29). + # Any *.cosi name works; stage it under a fixed, suffix-free name. + COSI_SRC="" + for c in \ + "$(Build.ArtifactStagingDirectory)/update-cosi/trident-vm-grub-testimage-azl4_0.cosi" \ + "$(Build.ArtifactStagingDirectory)/update-cosi/trident-vm-grub-testimage-azl4.cosi"; do + if [ -f "$c" ]; then COSI_SRC="$c"; break; fi + done + if [ -z "$COSI_SRC" ]; then + echo "Could not find AZL4 update COSI. Contents:" + find "$(Build.ArtifactStagingDirectory)/update-cosi" -type f + exit 1 + fi + cp "$COSI_SRC" "$ARTIFACTS/trident-vm-azl4-update-testimage.cosi" + + ls -lh "$ARTIFACTS" + displayName: "Stage artifacts for storm-trident" + workingDirectory: $(TRIDENT_SOURCE_DIR) + + - bash: | + set -euxo pipefail + + STORM_DYNAMIC_FLAGS="" + if [ "${{ parameters.verboseLogging }}" == "True" ]; then + STORM_DYNAMIC_FLAGS="$STORM_DYNAMIC_FLAGS --verbose" + fi + + # First-iteration AZL4 skips: see file header for rationale. 
+ STORM_DYNAMIC_FLAGS="$STORM_DYNAMIC_FLAGS \ + --skip-extension-testing \ + --skip-netplan-runtime-testing \ + --skip-manual-rollbacks \ + --skip-runtime-updates" + + sudo ./bin/storm-trident run rollback -a $STORM_DYNAMIC_FLAGS \ + --artifacts-dir $(Build.ArtifactStagingDirectory)/storm-input \ + --output-path $(ob_outputDirectory) \ + --platform qemu \ + --user testing-user \ + --ssh-private-key-path $(SSH_PRIVATE_KEY_PATH) \ + --ssh-public-key-path $(SSH_PUBLIC_KEY_PATH) \ + --force-cleanup + displayName: "🚀 Storm-trident rollback test (AZL4)" + workingDirectory: $(TRIDENT_SOURCE_DIR) + timeoutInMinutes: 20 + + - bash: | + set -eux + sudo zstd -T0 $(Build.ArtifactStagingDirectory)/booted.qcow2 || true + sudo mv $(Build.ArtifactStagingDirectory)/booted.qcow2.zst $(ob_outputDirectory)/ || true + workingDirectory: $(TRIDENT_SOURCE_DIR) + condition: failed() + displayName: "Publish OS disk on failure" + timeoutInMinutes: 5 + + - template: ../testing_common/fix-output-directory-for-one-branch-step.yml + parameters: + outputDir: $(ob_outputDirectory) + condition: always() From 6ec4a8142d1fc10f9cf40101db205b0dc4322d53 Mon Sep 17 00:00:00 2001 From: Brian Telfer Date: Mon, 8 Jun 2026 13:21:03 -0700 Subject: [PATCH 25/26] fix: Use is_some_and instead of map_or for clippy Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- crates/osmodifier/src/grub_cfg.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/osmodifier/src/grub_cfg.rs b/crates/osmodifier/src/grub_cfg.rs index 19a9f3bd2..46dd82c97 100644 --- a/crates/osmodifier/src/grub_cfg.rs +++ b/crates/osmodifier/src/grub_cfg.rs @@ -117,7 +117,7 @@ fn extract_options_from_bls_entries(ctx: &OsModifierContext) -> Result Date: Mon, 8 Jun 2026 13:38:56 -0700 Subject: [PATCH 26/26] docs: Remove PR references and stale osmodifier comments from rollback config Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../rollback-azl4/trident-config.yaml | 16 
++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/tests/e2e_tests/trident_configurations/rollback-azl4/trident-config.yaml b/tests/e2e_tests/trident_configurations/rollback-azl4/trident-config.yaml index e83320906..67a2c574a 100644 --- a/tests/e2e_tests/trident_configurations/rollback-azl4/trident-config.yaml +++ b/tests/e2e_tests/trident_configurations/rollback-azl4/trident-config.yaml @@ -62,16 +62,12 @@ health: - non-existent-service1.service - non-existent-service2.service timeoutSeconds: 15 -# AZL4 variant of the AZL3 `health-checks-install/` scenario. Adapted for the -# PR-4 hostname-only fast path: -# - Empty top-level `users`/`selinux`/`netplan` so install validation does -# not require the OS Modifier binary to be in the MOS install ISO (which -# does not currently include it; once the MOS rebuild lands, both this -# scenario and base-azl4 can grow os.users / os.selinux / os.netplan). -# - `os.additionalFiles` is the one os.* field used because health.checks -# references `path: /var/lib/trident/local-health-check-file.sh`, which -# needs to be on the target filesystem. additionalFiles is processed by -# Trident's storage / file-deploy paths, not by OS Modifier. +# AZL4 variant of the AZL3 `health-checks-install/` scenario. +# - No `users`/`selinux`/`netplan` — these are baked into the test image +# at MIC build time. +# - `os.additionalFiles` is used because health.checks references +# `path: /var/lib/trident/local-health-check-file.sh`, which needs to +# be on the target filesystem. # # Health-check failure expectations (asserted by tests/e2e_tests/rollback_test.py): # - State transitions to `not-provisioned` (clean-install has no slot to