From d828e43fd75aa786500a57ea0e51c9bc1ddd017d Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Fri, 13 Mar 2026 17:31:06 -0400 Subject: [PATCH 01/14] Reconfigurator: Add live test for executing expunged-and-never-created zones --- Cargo.lock | 3 + live-tests/Cargo.toml | 3 + live-tests/tests/common/reconfigurator.rs | 81 +++- .../tests/test_execute_expunged_zone.rs | 393 ++++++++++++++++++ live-tests/tests/test_nexus_add_remove.rs | 6 +- live-tests/tests/test_nexus_handoff.rs | 8 +- 6 files changed, 480 insertions(+), 14 deletions(-) create mode 100644 live-tests/tests/test_execute_expunged_zone.rs diff --git a/Cargo.lock b/Cargo.lock index 8bc4b88fc38..71a4616b5e6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8445,11 +8445,14 @@ dependencies = [ "pq-sys", "reqwest 0.13.2", "serde", + "serde_json", "sled-agent-types", "slog", "slog-error-chain", + "strum 0.27.2", "textwrap 0.16.2", "tokio", + "update-engine", "uuid", ] diff --git a/live-tests/Cargo.toml b/live-tests/Cargo.toml index dca2b884fd2..ce63a17f087 100644 --- a/live-tests/Cargo.toml +++ b/live-tests/Cargo.toml @@ -35,10 +35,13 @@ omicron-test-utils.workspace = true omicron-uuid-kinds.workspace = true reqwest.workspace = true serde.workspace = true +serde_json.workspace = true slog.workspace = true slog-error-chain.workspace = true +strum.workspace = true textwrap.workspace = true tokio.workspace = true +update-engine.workspace = true uuid.workspace = true [lints] diff --git a/live-tests/tests/common/reconfigurator.rs b/live-tests/tests/common/reconfigurator.rs index 2e867b43f53..fa4ea54a33f 100644 --- a/live-tests/tests/common/reconfigurator.rs +++ b/live-tests/tests/common/reconfigurator.rs @@ -28,6 +28,25 @@ use std::time::Duration; pub async fn blueprint_load_target_enabled( log: &slog::Logger, nexus: &nexus_lockstep_client::Client, +) -> Result { + blueprint_load_target_impl(log, nexus, true).await +} + +/// Return the current target blueprint +/// +/// Also validates that it's disabled. If an operator has enabled execution, we +/// don't want to proceed with tests. +pub async fn blueprint_load_target_disabled( + log: &slog::Logger, + nexus: &nexus_lockstep_client::Client, +) -> Result { + blueprint_load_target_impl(log, nexus, false).await +} + +async fn blueprint_load_target_impl( + log: &slog::Logger, + nexus: &nexus_lockstep_client::Client, + expect_enabled: bool, ) -> Result { // Fetch the current target configuration. info!(log, "editing current target blueprint"); @@ -40,9 +59,11 @@ pub async fn blueprint_load_target_enabled( debug!(log, "found current target blueprint"; "blueprint_id" => %target_blueprint.target_id ); + + let expect_inverse = if !expect_enabled { "enabled" } else { "disabled" }; ensure!( - target_blueprint.enabled, - "refusing to operate on a system with target blueprint disabled" + target_blueprint.enabled == expect_enabled, + "refusing to operate on a system with target blueprint {expect_inverse}" ); let blueprint = nexus @@ -78,14 +99,59 @@ pub async fn blueprint_load_target_enabled( /// case, a developer enables the initial target blueprint before running these /// tests and then doesn't need to think about it again for the lifetime of /// their test environment. -pub async fn blueprint_edit_current_target( +pub async fn blueprint_edit_current_target_enabled( + log: &slog::Logger, + nexus: &nexus_lockstep_client::Client, + edit_fn: F, +) -> Result<(Blueprint, Blueprint), anyhow::Error> +where + F: FnOnce(&mut BlueprintBuilder) -> Result<(), anyhow::Error>, +{ + blueprint_edit_current_target_impl(log, nexus, true, edit_fn).await +} + +/// Modify the system by editing the current target blueprint +/// +/// More precisely, this function: +/// +/// - fetches the current target blueprint +/// - creates a new BlueprintBuilder based on it +/// - invokes the caller's `edit_fn`, which may modify the builder however it +/// likes +/// - generates a new blueprint (thus based on the current target) +/// - uploads the new blueprint +/// - sets the new blueprint as the current target +/// - disables the new blueprint +/// +/// ## Errors +/// +/// This function fails if the current target blueprint is not already disabled. +/// Callers of this function expect execution to be - and remain - disabled. If +/// that isn't the case, we don't want to inadvertently proceed. +pub async fn blueprint_edit_current_target_disabled( + log: &slog::Logger, + nexus: &nexus_lockstep_client::Client, + edit_fn: F, +) -> Result<(Blueprint, Blueprint), anyhow::Error> +where + F: FnOnce(&mut BlueprintBuilder) -> Result<(), anyhow::Error>, +{ + blueprint_edit_current_target_impl(log, nexus, false, edit_fn).await +} + +async fn blueprint_edit_current_target_impl( log: &slog::Logger, nexus: &nexus_lockstep_client::Client, - edit_fn: &dyn Fn(&mut BlueprintBuilder) -> Result<(), anyhow::Error>, -) -> Result<(Blueprint, Blueprint), anyhow::Error> { + expect_enabled: bool, + edit_fn: F, +) -> Result<(Blueprint, Blueprint), anyhow::Error> +where + F: FnOnce(&mut BlueprintBuilder) -> Result<(), anyhow::Error>, +{ // Fetch the current target configuration. info!(log, "editing current target blueprint"); - let blueprint1 = blueprint_load_target_enabled(log, nexus).await?; + let blueprint1 = + blueprint_load_target_impl(log, nexus, expect_enabled).await?; // Make a new builder based on that blueprint and use `edit_fn` to edit it. let mut builder = BlueprintBuilder::new_based_on( @@ -113,7 +179,7 @@ pub async fn blueprint_edit_current_target( ); nexus .blueprint_target_set(&BlueprintTargetSet { - enabled: true, + enabled: expect_enabled, target_id: blueprint2.id, }) .await @@ -121,6 +187,7 @@ pub async fn blueprint_edit_current_target( info!(log, "finished editing target blueprint"; "old_target_id" => %blueprint1.id, "new_target_id" => %blueprint2.id, + "enabled" => %expect_enabled, ); Ok((blueprint1, blueprint2)) diff --git a/live-tests/tests/test_execute_expunged_zone.rs b/live-tests/tests/test_execute_expunged_zone.rs new file mode 100644 index 00000000000..3f944841739 --- /dev/null +++ b/live-tests/tests/test_execute_expunged_zone.rs @@ -0,0 +1,393 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +pub mod common; + +use crate::common::reconfigurator::blueprint_load_target_enabled; +use anyhow::Context; +use common::LiveTestContext; +use common::reconfigurator::blueprint_edit_current_target_disabled; +use live_tests_macros::live_test; +use nexus_lockstep_client::types::BlueprintTargetSet; +use nexus_lockstep_client::types::LastResult; +use nexus_types::deployment::BlueprintZoneConfig; +use nexus_types::deployment::ReconfiguratorConfig; +use nexus_types::deployment::ReconfiguratorConfigParam; +use omicron_test_utils::dev::poll::CondCheckError; +use omicron_test_utils::dev::poll::wait_for_condition; +use omicron_uuid_kinds::BlueprintUuid; +use omicron_uuid_kinds::OmicronZoneUuid; +use omicron_uuid_kinds::SledUuid; +use serde::Deserialize; +use sled_agent_types::inventory::ZoneKind; +use slog::Logger; +use slog::info; +use std::collections::BTreeSet; +use std::time::Duration; +use strum::IntoEnumIterator; +use update_engine::EventBuffer; +use update_engine::ExecutionStatus; +use update_engine::NestedError; +use update_engine::NestedSpec; +use update_engine::TerminalKind; +use update_engine::events::EventReport; +use update_engine::events::StepOutcome; + +#[live_test] +async fn test_execute_expunged_zone(lc: &LiveTestContext) { + let log = lc.log(); + + for zone_kind in ZoneKind::iter() { + match zone_kind { + // Skip multinode Clickhouse for now - we don't deploy it and have + // no plans to change this in the near future. + ZoneKind::ClickhouseKeeper | ZoneKind::ClickhouseServer => { + continue; + } + + // We expect all these zone types to exist, and we want to test what + // happens if we execute a zone of this type with the `expunged` + // disposition when we never executed that same zone with the + // `in-service` disposition. + ZoneKind::BoundaryNtp + | ZoneKind::Clickhouse + | ZoneKind::CockroachDb + | ZoneKind::Crucible + | ZoneKind::CruciblePantry + | ZoneKind::ExternalDns + | ZoneKind::InternalDns + | ZoneKind::InternalNtp + | ZoneKind::Nexus + | ZoneKind::Oximeter => { + let log = + log.new(slog::o!("zone-kind" => format!("{zone_kind:?}"))); + info!(log, "starting inner test"); + test_execute_expunged_zone_of_kind(zone_kind, lc).await; + } + } + } +} + +async fn test_execute_expunged_zone_of_kind( + zone_kind: ZoneKind, + lc: &LiveTestContext, +) { + // Test setup + let log = lc.log(); + + let initial_nexus_clients = lc.all_internal_nexus_clients().await.unwrap(); + let nexus = initial_nexus_clients.first().expect("internal Nexus client"); + + // Safety check: We expect the going-in state to be "blueprint execution is + // enabled". If it isn't, bail out and don't attempt to run this test. + blueprint_load_target_enabled(log, nexus) + .await + .expect("blueprint execution should be enabled"); + + // The high-level sequence here is: + // + // 1. Pause blueprint execution and planning + // 2. Expunge a zone of kind `zone_kind` + // 3. Run the planner; it should place a new zone of `zone_kind` + // 4. Expunge that new zone + // 5. Run the planner; it should place another new zone of `zone_kind` + // 6. Enable execution + // + // Execution in step 6 should succeed. It will be executing a blueprint that + // has the following changes since the last time execution ran: + // + // * One zone that had been in-service the last time a blueprint was + // executed is now expunged + // * One zone that did not exist the last time a blueprint was executed + // exists in the blueprint but with the expunged disposition + // * One zone that did not exist the last time a blueprint was executed + // exists and is should be put into service + + // Disable planning and execution. + disable_blueprint_planning(nexus, log).await; + disable_blueprint_execution(nexus, log).await; + + let mut orig_zones_of_interest = None; + blueprint_edit_current_target_disabled(log, nexus, |builder| { + let zones_of_interest = ZonesOfInterest::from_zones( + builder.current_in_service_zones(), + zone_kind, + ); + + let (sled_id, zone_id) = zones_of_interest.pick_any_zone_to_expunge(); + + builder + .sled_expunge_zone(sled_id, zone_id) + .context("expunging zone")?; + + orig_zones_of_interest = Some(zones_of_interest); + + Ok(()) + }) + .await + .expect("edited blueprint to expunge a zone"); + let orig_zones_of_interest = orig_zones_of_interest.unwrap(); + + // Run the planner. + let planned_bp_1 = nexus.blueprint_regenerate().await.expect("ran planner"); + + // It should have added a new zone of the kind we care about. + let plan1_zones_of_interest = + ZonesOfInterest::from_zones(planned_bp_1.in_service_zones(), zone_kind); + let plan1_zones_added = + plan1_zones_of_interest.zones_added_since(&orig_zones_of_interest); + assert_eq!( + plan1_zones_added.len(), + 1, + "expected planner to add exactly 1 new zone, \ + but got {plan1_zones_added:?}" + ); + + // Make that blueprint the target, but keep execution disabled. + nexus + .blueprint_target_set(&BlueprintTargetSet { + enabled: false, + target_id: planned_bp_1.id, + }) + .await + .expect("set new target"); + + // Edit the newly-planned blueprint and expunge the zone it just added. + blueprint_edit_current_target_disabled(log, nexus, |builder| { + let (sled_id, zone_id) = plan1_zones_added.first().copied().unwrap(); + builder + .sled_expunge_zone(sled_id, zone_id) + .context("expunging zone")?; + Ok(()) + }) + .await + .expect("edited blueprint to expunge a zone"); + + // Run the planner again. + let planned_bp_2 = nexus.blueprint_regenerate().await.expect("ran planner"); + + // It should have added a new zone of the kind we care about. + let plan2_zones_of_interest = + ZonesOfInterest::from_zones(planned_bp_2.in_service_zones(), zone_kind); + let plan2_zones_added = + plan2_zones_of_interest.zones_added_since(&orig_zones_of_interest); + assert_eq!( + plan2_zones_added.len(), + 1, + "expected planner to add exactly 1 new zone, \ + but got {plan2_zones_added:?}" + ); + + // Make that blueprint the target, but and finally enable execution. + nexus + .blueprint_target_set(&BlueprintTargetSet { + enabled: true, + target_id: planned_bp_2.id, + }) + .await + .expect("set new target"); + + // Wait until we see execution of this latest blueprint. + wait_for_condition( + || async { + let status = nexus + .bgtask_view("blueprint_executor") + .await + .expect("got status for `blueprint_executor` bgtask") + .into_inner() + .last; + + let details = match status { + LastResult::NeverCompleted => { + return Err(CondCheckError::NotYet); + } + LastResult::Completed(completed) => completed.details, + }; + + #[derive(Deserialize)] + struct BlueprintExecutorStatus { + target_id: BlueprintUuid, + execution_error: Option, + event_report: EventReport, + } + + let details: BlueprintExecutorStatus = + serde_json::from_value(details) + .expect("parsed details as JSON"); + + // Make sure we're looking at the execution of the blueprint we care + // about; otherwise, keep waiting. + if details.target_id != planned_bp_2.id { + return Err(CondCheckError::NotYet); + } + + // Check that execution completely cleanly. + if let Some(err) = details.execution_error { + return Err(CondCheckError::Failed(format!( + "execution error: {err}" + ))); + } + + if let Some(err) = event_report_has_problems(details.event_report) { + return Err(CondCheckError::Failed(format!( + "execution problem: {err}" + ))); + } + + Ok(()) + }, + &Duration::from_secs(1), + &Duration::from_secs(120), + ) + .await + .expect("waited for successful execution"); + + // Log the IDs of the zone in this state. + let (sled_id, zone_id) = plan1_zones_added.first().copied().unwrap(); + info!( + log, + "got successful execution of blueprint with \ + expunged-and-never-in-service zone"; + "sled_id" => %sled_id, + "zone_id" => %zone_id, + ); +} + +async fn disable_blueprint_planning( + nexus: &nexus_lockstep_client::Client, + log: &Logger, +) { + let current_config = nexus + .reconfigurator_config_show_current() + .await + .expect("got current reconfigurator config") + .into_inner(); + + if !current_config.config.planner_enabled { + info!(log, "skipping disable of blueprint planning: already disabled"); + } + + let new_config = ReconfiguratorConfigParam { + version: current_config.version + 1, + config: ReconfiguratorConfig { + planner_enabled: false, + planner_config: current_config.config.planner_config, + tuf_repo_pruner_enabled: current_config + .config + .tuf_repo_pruner_enabled, + }, + }; + + nexus + .reconfigurator_config_set(&new_config) + .await + .expect("applied config to disable planner"); + + info!(log, "disabled blueprint planning"); +} + +async fn disable_blueprint_execution( + nexus: &nexus_lockstep_client::Client, + log: &Logger, +) { + let target_blueprint = nexus + .blueprint_target_view() + .await + .expect("got current target blueprint"); + + if !target_blueprint.enabled { + info!(log, "blueprint execution already disabled"); + return; + } + + nexus + .blueprint_target_set_enabled(&BlueprintTargetSet { + enabled: false, + target_id: target_blueprint.target_id, + }) + .await + .expect("disabled blueprint execution"); + + info!(log, "disabled blueprint execution"); +} + +struct ZonesOfInterest { + in_service_zones: BTreeSet<(SledUuid, OmicronZoneUuid)>, +} + +impl ZonesOfInterest { + fn from_zones<'a>( + zones: impl Iterator, + zone_kind: ZoneKind, + ) -> Self { + let mut in_service_zones = BTreeSet::new(); + + for (sled_id, zone_cfg) in zones { + if zone_cfg.kind() != zone_kind { + continue; + } + + in_service_zones.insert((sled_id, zone_cfg.id)); + } + + Self { in_service_zones } + } + + fn pick_any_zone_to_expunge(&self) -> (SledUuid, OmicronZoneUuid) { + self.in_service_zones + .first() + .copied() + .expect("no zones of relevant kind found") + } + + fn zones_added_since( + &self, + other: &Self, + ) -> BTreeSet<(SledUuid, OmicronZoneUuid)> { + self.in_service_zones + .difference(&other.in_service_zones) + .copied() + .collect() + } +} + +fn event_report_has_problems( + event_report: EventReport, +) -> Option<&'static str> { + let mut buf = EventBuffer::default(); + buf.add_event_report(event_report); + + // Check for outright failure. + let Some(summary) = buf.root_execution_summary() else { + return Some("missing root_execution_summary"); + }; + match summary.execution_status { + ExecutionStatus::Terminal(info) => match info.kind { + TerminalKind::Completed => (), + TerminalKind::Failed => return Some("execution failed"), + TerminalKind::Aborted => return Some("execution aborted"), + }, + ExecutionStatus::NotStarted => return Some("execution not started"), + ExecutionStatus::Running { .. } => { + return Some("execution still running"); + } + } + + // Also look for warnings. + for (_, step_data) in buf.iter_steps_recursive() { + if let Some(reason) = step_data.step_status().completion_reason() { + if let Some(info) = reason.step_completed_info() { + match info.outcome { + StepOutcome::Success { .. } + | StepOutcome::Skipped { .. } => (), + StepOutcome::Warning { .. } => { + return Some("execution has warnings"); + } + } + } + } + } + + None +} diff --git a/live-tests/tests/test_nexus_add_remove.rs b/live-tests/tests/test_nexus_add_remove.rs index 9b9a862cebe..10a58386aa8 100644 --- a/live-tests/tests/test_nexus_add_remove.rs +++ b/live-tests/tests/test_nexus_add_remove.rs @@ -7,7 +7,7 @@ pub mod common; use crate::common::reconfigurator::blueprint_wait_sled_configs_propagated; use anyhow::Context; use common::LiveTestContext; -use common::reconfigurator::blueprint_edit_current_target; +use common::reconfigurator::blueprint_edit_current_target_enabled; use futures::TryStreamExt; use live_tests_macros::live_test; use nexus_lockstep_client::ClientInfo as _; @@ -78,7 +78,7 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) { .all_sled_ids(SledFilter::Commissioned) .collect::>(); let sled_id = *commissioned_sled_ids.first().expect("any sled id"); - let (blueprint1, blueprint2) = blueprint_edit_current_target( + let (blueprint1, blueprint2) = blueprint_edit_current_target_enabled( log, &nexus, &|builder: &mut BlueprintBuilder| { @@ -194,7 +194,7 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) { info!(log, "created demo saga"; "demo_saga" => ?demo_saga); // Now expunge the zone we just created. - let (_blueprint2, blueprint3) = blueprint_edit_current_target( + let (_blueprint2, blueprint3) = blueprint_edit_current_target_enabled( log, &nexus, &|builder: &mut BlueprintBuilder| { diff --git a/live-tests/tests/test_nexus_handoff.rs b/live-tests/tests/test_nexus_handoff.rs index a73a20ff7a7..0da0ca00479 100644 --- a/live-tests/tests/test_nexus_handoff.rs +++ b/live-tests/tests/test_nexus_handoff.rs @@ -8,7 +8,7 @@ use crate::common::reconfigurator::blueprint_load_target_enabled; use crate::common::reconfigurator::blueprint_wait_sled_configs_propagated; use anyhow::Context; use common::LiveTestContext; -use common::reconfigurator::blueprint_edit_current_target; +use common::reconfigurator::blueprint_edit_current_target_enabled; use internal_dns_types::config::DnsRecord; use internal_dns_types::names::ServiceName; use live_tests_macros::live_test; @@ -176,7 +176,7 @@ async fn test_nexus_handoff(lc: &LiveTestContext) { .await .expect("planning input"); let (_blueprint_initial, blueprint_new_nexus) = - blueprint_edit_current_target( + blueprint_edit_current_target_enabled( log, &nexus, &|builder: &mut BlueprintBuilder| { @@ -272,7 +272,7 @@ async fn test_nexus_handoff(lc: &LiveTestContext) { // Now update the target blueprint to trigger a handoff. let (_blueprint_new_nexus, blueprint_handoff) = - blueprint_edit_current_target( + blueprint_edit_current_target_enabled( log, &nexus, &|builder: &mut BlueprintBuilder| { @@ -432,7 +432,7 @@ async fn test_nexus_handoff(lc: &LiveTestContext) { let new_nexus = new_nexus_clients.values().next().expect("one new Nexus client"); let (_blueprint_handoff, blueprint_cleanup) = - blueprint_edit_current_target( + blueprint_edit_current_target_enabled( log, new_nexus, &|builder: &mut BlueprintBuilder| { From 18a74c026f2e26b40b30f4b70f935ddb4308d363 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 16 Mar 2026 16:29:29 -0400 Subject: [PATCH 02/14] live-tests: use /etc/resolv.conf --- live-tests/tests/common/mod.rs | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/live-tests/tests/common/mod.rs b/live-tests/tests/common/mod.rs index a74f81eb4f0..f950b13c88b 100644 --- a/live-tests/tests/common/mod.rs +++ b/live-tests/tests/common/mod.rs @@ -12,7 +12,6 @@ use nexus_config::PostgresConfigWithUrl; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_types::deployment::SledFilter; -use omicron_common::address::Ipv6Subnet; use slog::info; use slog::o; use std::ffi::OsStr; @@ -96,20 +95,14 @@ impl LiveTestContext { } fn create_resolver(log: &slog::Logger) -> Result { - // In principle, we should look at /etc/resolv.conf to find the DNS servers. - // In practice, this usually isn't populated today. See - // oxidecomputer/omicron#2122. - // - // However, the address selected below should work for most existing Omicron - // deployments today. That's because while the base subnet is in principle - // configurable in config-rss.toml, it's very uncommon to change it from the - // default value used here. - let subnet = Ipv6Subnet::new("fd00:1122:3344:0100::".parse().unwrap()); - eprintln!("note: using DNS server for subnet {}", subnet.net()); - internal_dns_resolver::Resolver::new_from_subnet(log.clone(), subnet) - .with_context(|| { - format!("creating DNS resolver for subnet {}", subnet.net()) - }) + // The internal DNS servers are populated in /etc/resolv.conf in the switch + // zone, which is where we expect live tests to run. Notify the user that + // we're going to attempt DNS resolution via the default system path. + eprintln!( + "note: using DNS from system config (typically /etc/resolv.conf)", + ); + internal_dns_resolver::Resolver::new_from_system_conf(log.clone()) + .context("creating DNS resolver from system config") } /// Creates a DataStore pointing at the CockroachDB cluster that's in DNS From 31eaa26e67f1cd78c7b4d1d11a01762a1ce42b06 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 16 Mar 2026 16:43:48 -0400 Subject: [PATCH 03/14] update racklette serials --- live-tests/tests/common/mod.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/live-tests/tests/common/mod.rs b/live-tests/tests/common/mod.rs index f950b13c88b..a22bda45e33 100644 --- a/live-tests/tests/common/mod.rs +++ b/live-tests/tests/common/mod.rs @@ -206,7 +206,7 @@ async fn check_hardware_environment( ) -> Result<(), anyhow::Error> { const ALLOWED_GIMLET_SERIALS: &[&str] = &[ // Serial number lists can be generated with: - // inventron env system list -Hpo serial -F type=gimlet + // inventron env system list -Hpo serial -F type=cosmo -F type=gimlet // test rig: "madrid" "BRM42220081", @@ -215,11 +215,11 @@ async fn check_hardware_environment( "BRM42220004", // test rig: "london" "BRM42220036", - "BRM42220062", + "2CN2M459", "BRM42220030", - "BRM44220007", + "2RGCFG10", // test rig: "dublin" - "BRM42220026", + "2F8JEXDK", "BRM27230037", "BRM23230018", "BRM23230010", @@ -227,7 +227,7 @@ async fn check_hardware_environment( "BRM42220011", "BRM44220007", "BRM42220082", - "BRM06240029", + "271FVPY0", ]; // Refuse to operate in an environment that might contain real Oxide From ed69397dd82824b2b011b2d002d751f12419b255 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 16 Mar 2026 16:49:42 -0400 Subject: [PATCH 04/14] whitespace --- live-tests/README.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/live-tests/README.adoc b/live-tests/README.adoc index 6df94283146..97891d31cd6 100644 --- a/live-tests/README.adoc +++ b/live-tests/README.adoc @@ -6,7 +6,7 @@ This package is not built or tested by default because the tests generally can't == Why a separate test suite? -What makes these tests different from the rest of the test suite is that they require connectivity to the underlay network of the deployed system and they make API calls to various components in that system and they assume that this will behave like a real production system. By contrast, the normal tests instead _set up_ a bunch of components using simulated sled agents and localhost networking, which is great for starting from a predictable state and running tests in parallel, but the simulated sled agents and networking make it impossible to exercise quite a lot of Reconfigurator's functionality. +What makes these tests different from the rest of the test suite is that they require connectivity to the underlay network of the deployed system and they make API calls to various components in that system and they assume that this will behave like a real production system. By contrast, the normal tests instead _set up_ a bunch of components using simulated sled agents and localhost networking, which is great for starting from a predictable state and running tests in parallel, but the simulated sled agents and networking make it impossible to exercise quite a lot of Reconfigurator's functionality. There are also the `end-to-end-tests`. That environment is more realistic than the main test suite, but not faithful enough for many Reconfigurator tests. From 7c0d57b627b179bc199b0c755298dffac5f17a1e Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 16 Mar 2026 16:49:46 -0400 Subject: [PATCH 05/14] update live-test instructions * no need to enable execution by default * DO need to disable planning by default --- live-tests/README.adoc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/live-tests/README.adoc b/live-tests/README.adoc index 97891d31cd6..3042eb4d0c9 100644 --- a/live-tests/README.adoc +++ b/live-tests/README.adoc @@ -22,10 +22,12 @@ First, deploy Omicron using `a4x2` or one of the hardware test rigs. Ensure the system's target blueprint is enabled. The live tests require this to avoid a case where the live tests generate blueprints based on a target blueprint that is not current, and then make a bunch of changes to the system unrelated to the tests. -On a fresh system, you will have to enable the target blueprint yourself: +Ensure the system's blueprint planner is disabled. The live tests will generate various blueprints with potentially-questiable content, and we don't want the planner to run mid-test and attempt to "fix" the state of the system. + +On a fresh system, you will have to disable the blueprint planner yourself: ``` -omdb --destructive nexus blueprints target enable current +omdb --destructive nexus reconfigurator-config set --planner-enabled false ``` This essentially enables reconfigurator, causing it to constantly try to make the system match its target blueprint. You only need to do this once in the lifetime of the system, not every time you re-run the live tests. From 25cde5110aa0c71c48933dce0389f064728e44e3 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 16 Mar 2026 17:12:34 -0400 Subject: [PATCH 06/14] more logging --- live-tests/tests/common/reconfigurator.rs | 2 +- .../tests/test_execute_expunged_zone.rs | 101 +++++++++++++----- 2 files changed, 75 insertions(+), 28 deletions(-) diff --git a/live-tests/tests/common/reconfigurator.rs b/live-tests/tests/common/reconfigurator.rs index fa4ea54a33f..54cca720763 100644 --- a/live-tests/tests/common/reconfigurator.rs +++ b/live-tests/tests/common/reconfigurator.rs @@ -49,7 +49,7 @@ async fn blueprint_load_target_impl( expect_enabled: bool, ) -> Result { // Fetch the current target configuration. - info!(log, "editing current target blueprint"); + info!(log, "loading current target blueprint"); let target_blueprint = nexus .blueprint_target_view() .await diff --git a/live-tests/tests/test_execute_expunged_zone.rs b/live-tests/tests/test_execute_expunged_zone.rs index 3f944841739..f77ca6daf37 100644 --- a/live-tests/tests/test_execute_expunged_zone.rs +++ b/live-tests/tests/test_execute_expunged_zone.rs @@ -23,6 +23,7 @@ use serde::Deserialize; use sled_agent_types::inventory::ZoneKind; use slog::Logger; use slog::info; +use slog_error_chain::InlineErrorChain; use std::collections::BTreeSet; use std::time::Duration; use strum::IntoEnumIterator; @@ -109,28 +110,38 @@ async fn test_execute_expunged_zone_of_kind( disable_blueprint_execution(nexus, log).await; let mut orig_zones_of_interest = None; - blueprint_edit_current_target_disabled(log, nexus, |builder| { - let zones_of_interest = ZonesOfInterest::from_zones( - builder.current_in_service_zones(), - zone_kind, - ); + let (_, edit1) = + blueprint_edit_current_target_disabled(log, nexus, |builder| { + let zones_of_interest = ZonesOfInterest::from_zones( + builder.current_in_service_zones(), + zone_kind, + ); - let (sled_id, zone_id) = zones_of_interest.pick_any_zone_to_expunge(); + let (sled_id, zone_id) = + zones_of_interest.pick_any_zone_to_expunge(); - builder - .sled_expunge_zone(sled_id, zone_id) - .context("expunging zone")?; + builder + .sled_expunge_zone(sled_id, zone_id) + .context("expunging zone")?; - orig_zones_of_interest = Some(zones_of_interest); + orig_zones_of_interest = Some(zones_of_interest); - Ok(()) - }) - .await - .expect("edited blueprint to expunge a zone"); + Ok(()) + }) + .await + .expect("edited blueprint to expunge a zone"); let orig_zones_of_interest = orig_zones_of_interest.unwrap(); + info!( + log, "edited blueprint first time: expunged 1 zone"; + "blueprint_id" => %edit1.id, + ); // Run the planner. let planned_bp_1 = nexus.blueprint_regenerate().await.expect("ran planner"); + info!( + log, "ran planner first time; expected one new zone to be placed"; + "blueprint_id" => %planned_bp_1.id, + ); // It should have added a new zone of the kind we care about. let plan1_zones_of_interest = @@ -154,18 +165,28 @@ async fn test_execute_expunged_zone_of_kind( .expect("set new target"); // Edit the newly-planned blueprint and expunge the zone it just added. - blueprint_edit_current_target_disabled(log, nexus, |builder| { - let (sled_id, zone_id) = plan1_zones_added.first().copied().unwrap(); - builder - .sled_expunge_zone(sled_id, zone_id) - .context("expunging zone")?; - Ok(()) - }) - .await - .expect("edited blueprint to expunge a zone"); + let (_, edit2) = + blueprint_edit_current_target_disabled(log, nexus, |builder| { + let (sled_id, zone_id) = + plan1_zones_added.first().copied().unwrap(); + builder + .sled_expunge_zone(sled_id, zone_id) + .context("expunging zone")?; + Ok(()) + }) + .await + .expect("edited blueprint to expunge a zone"); + info!( + log, "edited blueprint second time: expunged 1 zone"; + "blueprint_id" => %edit2.id, + ); // Run the planner again. let planned_bp_2 = nexus.blueprint_regenerate().await.expect("ran planner"); + info!( + log, "ran planner second time; expected one new zone to be placed"; + "blueprint_id" => %planned_bp_2.id, + ); // It should have added a new zone of the kind we care about. let plan2_zones_of_interest = @@ -179,7 +200,7 @@ async fn test_execute_expunged_zone_of_kind( but got {plan2_zones_added:?}" ); - // Make that blueprint the target, but and finally enable execution. + // Make that blueprint the target, and finally enable execution. nexus .blueprint_target_set(&BlueprintTargetSet { enabled: true, @@ -200,6 +221,10 @@ async fn test_execute_expunged_zone_of_kind( let details = match status { LastResult::NeverCompleted => { + info!( + log, + "still waiting for execution: task never completed" + ); return Err(CondCheckError::NotYet); } LastResult::Completed(completed) => completed.details, @@ -212,13 +237,34 @@ async fn test_execute_expunged_zone_of_kind( event_report: EventReport, } - let details: BlueprintExecutorStatus = - serde_json::from_value(details) - .expect("parsed details as JSON"); + // We won't be able to parse the `details` we expect if the + // `bgtask_view()` we just executed is still returning the status + // from a previous execution attempt of a disabled blueprint, so + // return `NotYet` until we can parse them. + let details: BlueprintExecutorStatus = match serde_json::from_value( + details, + ) { + Ok(details) => details, + Err(err) => { + info!( + log, + "still waiting for execution: failed to parse details"; + InlineErrorChain::new(&err), + ); + return Err(CondCheckError::NotYet); + } + }; // Make sure we're looking at the execution of the blueprint we care // about; otherwise, keep waiting. if details.target_id != planned_bp_2.id { + info!( + log, + "still waiting for execution: executed blueprint ID \ + does not match ID we're waiting for"; + "executed_id" => %details.target_id, + "waiting_for_id" => %planned_bp_2.id, + ); return Err(CondCheckError::NotYet); } @@ -266,6 +312,7 @@ async fn disable_blueprint_planning( if !current_config.config.planner_enabled { info!(log, "skipping disable of blueprint planning: already disabled"); + return; } let new_config = ReconfiguratorConfigParam { From 4ad75be487ba4b85198b1b449e2b509740326ef8 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 16 Mar 2026 18:02:58 -0400 Subject: [PATCH 07/14] test fixes --- .../tests/test_execute_expunged_zone.rs | 86 ++++++++++++++----- 1 file changed, 66 insertions(+), 20 deletions(-) diff --git a/live-tests/tests/test_execute_expunged_zone.rs b/live-tests/tests/test_execute_expunged_zone.rs index f77ca6daf37..ed72fd50811 100644 --- a/live-tests/tests/test_execute_expunged_zone.rs +++ b/live-tests/tests/test_execute_expunged_zone.rs @@ -23,6 +23,7 @@ use serde::Deserialize; use sled_agent_types::inventory::ZoneKind; use slog::Logger; use slog::info; +use slog::warn; use slog_error_chain::InlineErrorChain; use std::collections::BTreeSet; use std::time::Duration; @@ -47,6 +48,13 @@ async fn test_execute_expunged_zone(lc: &LiveTestContext) { continue; } + // Skip internal DNS for now - our test relies on the planner + // immediately replacing a zone we expunge. The planner (correctly!) + // does not do this for internal DNS: it has to wait for inventory + // to reflect that the zone has been expunged, which can't happen + // during our test because we've disabled execution. + ZoneKind::InternalDns => continue, + // We expect all these zone types to exist, and we want to test what // happens if we execute a zone of this type with the `expunged` // disposition when we never executed that same zone with the @@ -57,7 +65,6 @@ async fn test_execute_expunged_zone(lc: &LiveTestContext) { | ZoneKind::Crucible | ZoneKind::CruciblePantry | ZoneKind::ExternalDns - | ZoneKind::InternalDns | ZoneKind::InternalNtp | ZoneKind::Nexus | ZoneKind::Oximeter => { @@ -234,7 +241,7 @@ async fn test_execute_expunged_zone_of_kind( struct BlueprintExecutorStatus { target_id: BlueprintUuid, execution_error: Option, - event_report: EventReport, + event_report: Result, NestedError>, } // We won't be able to parse the `details` we expect if the @@ -271,17 +278,35 @@ async fn test_execute_expunged_zone_of_kind( // Check that execution completely cleanly. if let Some(err) = details.execution_error { return Err(CondCheckError::Failed(format!( - "execution error: {err}" + "execution error: {}", + InlineErrorChain::new(&err), ))); } - if let Some(err) = event_report_has_problems(details.event_report) { - return Err(CondCheckError::Failed(format!( - "execution problem: {err}" - ))); + match details.event_report { + Ok(event_report) => { + match event_report_problems(event_report, log) { + // Success! + None => Ok(()), + // Some warnings when executing new zones are expected + // (e.g., failure to talk to a zone immediately after + // starting it), but we expect those to clear up + // quickly. + Some(EventReportProblems::HadWarnings) => { + Err(CondCheckError::NotYet) + } + Some(EventReportProblems::Fatal(err)) => { + Err(CondCheckError::Failed(format!( + "execution problem: {err}" + ))) + } + } + } + Err(err) => Err(CondCheckError::Failed(format!( + "no available event report: {}", + InlineErrorChain::new(&err), + ))), } - - Ok(()) }, &Duration::from_secs(1), &Duration::from_secs(120), @@ -399,42 +424,63 @@ impl ZonesOfInterest { } } -fn event_report_has_problems( +enum EventReportProblems { + Fatal(&'static str), + HadWarnings, +} + +fn event_report_problems( event_report: EventReport, -) -> Option<&'static str> { + log: &Logger, +) -> Option { let mut buf = EventBuffer::default(); buf.add_event_report(event_report); // Check for outright failure. let Some(summary) = buf.root_execution_summary() else { - return Some("missing root_execution_summary"); + return Some(EventReportProblems::Fatal( + "missing root_execution_summary", + )); }; match summary.execution_status { ExecutionStatus::Terminal(info) => match info.kind { TerminalKind::Completed => (), - TerminalKind::Failed => return Some("execution failed"), - TerminalKind::Aborted => return Some("execution aborted"), + TerminalKind::Failed => { + return Some(EventReportProblems::Fatal("execution failed")); + } + TerminalKind::Aborted => { + return Some(EventReportProblems::Fatal("execution aborted")); + } }, - ExecutionStatus::NotStarted => return Some("execution not started"), + ExecutionStatus::NotStarted => { + return Some(EventReportProblems::Fatal("execution not started")); + } ExecutionStatus::Running { .. } => { - return Some("execution still running"); + return Some(EventReportProblems::Fatal("execution still running")); } } // Also look for warnings. + let mut had_warnings = false; for (_, step_data) in buf.iter_steps_recursive() { if let Some(reason) = step_data.step_status().completion_reason() { if let Some(info) = reason.step_completed_info() { - match info.outcome { + match &info.outcome { StepOutcome::Success { .. } | StepOutcome::Skipped { .. } => (), - StepOutcome::Warning { .. } => { - return Some("execution has warnings"); + StepOutcome::Warning { message, .. } => { + warn!( + log, + "exeucution warning"; + "message" => %message, + "step" => %step_data.step_info().description, + ); + had_warnings = true; } } } } } - None + if had_warnings { Some(EventReportProblems::HadWarnings) } else { None } } From e40ae95ef466b8ceec14e4fc8ade0d0c05ef0308 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 16 Mar 2026 20:42:12 -0400 Subject: [PATCH 08/14] add crdb safety check --- .../tests/test_execute_expunged_zone.rs | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/live-tests/tests/test_execute_expunged_zone.rs b/live-tests/tests/test_execute_expunged_zone.rs index ed72fd50811..22033774b93 100644 --- a/live-tests/tests/test_execute_expunged_zone.rs +++ b/live-tests/tests/test_execute_expunged_zone.rs @@ -14,6 +14,7 @@ use nexus_lockstep_client::types::LastResult; use nexus_types::deployment::BlueprintZoneConfig; use nexus_types::deployment::ReconfiguratorConfig; use nexus_types::deployment::ReconfiguratorConfigParam; +use omicron_common::policy::COCKROACHDB_REDUNDANCY; use omicron_test_utils::dev::poll::CondCheckError; use omicron_test_utils::dev::poll::wait_for_condition; use omicron_uuid_kinds::BlueprintUuid; @@ -39,6 +40,53 @@ use update_engine::events::StepOutcome; #[live_test] async fn test_execute_expunged_zone(lc: &LiveTestContext) { let log = lc.log(); + let opctx = lc.opctx(); + let datastore = lc.datastore(); + + // Safety check: If running this test multiple times, we may leave behind + // underreplicated cockroach ranges. We shouldn't attempt to proceed if + // those haven't been repair yet. Get the latest inventory collection and + // check. + match datastore.inventory_get_latest_collection(opctx).await { + Ok(Some(collection)) => { + let expected_nodes = COCKROACHDB_REDUNDANCY; + let db_status = collection.cockroach_status; + if db_status.len() < expected_nodes { + panic!( + "refusing to run live test: \ + latest inventory only has status from {} cockroach nodes; \ + expected {expected_nodes}", + db_status.len() + ); + } + for (node_id, status) in db_status { + match status.ranges_underreplicated { + Some(0) => (), + _ => panic!( + "refusing to run live test: inventory reports {:?} \ + for ranges underreplicated on CRDB node {node_id}", + status.ranges_underreplicated, + ), + } + match status.liveness_live_nodes { + Some(n) if n == expected_nodes as u64 => (), + _ => panic!( + "refusing to run live test: inventory reports {:?} \ + for live nodes on CRDB node {node_id}", + status.liveness_live_nodes, + ), + } + } + } + Ok(None) => panic!( + "refusing to run live test: no inventory collections exist yet" + ), + Err(err) => panic!( + "refusing to run live test: \ + error fetching inventory collection: {}", + InlineErrorChain::new(&err), + ), + } for zone_kind in ZoneKind::iter() { match zone_kind { From e0228d36f64f1eb7c539a5c4b19fc63c69dd9ab9 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 16 Mar 2026 21:24:50 -0400 Subject: [PATCH 09/14] test refinement --- .../tests/test_execute_expunged_zone.rs | 97 +++++++++---------- 1 file changed, 48 insertions(+), 49 deletions(-) diff --git a/live-tests/tests/test_execute_expunged_zone.rs b/live-tests/tests/test_execute_expunged_zone.rs index 22033774b93..fe19f206a97 100644 --- a/live-tests/tests/test_execute_expunged_zone.rs +++ b/live-tests/tests/test_execute_expunged_zone.rs @@ -267,12 +267,22 @@ async fn test_execute_expunged_zone_of_kind( // Wait until we see execution of this latest blueprint. wait_for_condition( || async { - let status = nexus - .bgtask_view("blueprint_executor") - .await - .expect("got status for `blueprint_executor` bgtask") - .into_inner() - .last; + let status = match nexus.bgtask_view("blueprint_executor").await { + Ok(task_view) => task_view.into_inner().last, + Err(err) => { + // We don't generally expect this to fail, but it could if + // we're testing Nexus or Cockroach (or recently did), since + // either of those being expunged may cause transient errors + // attempting to fetch task status from an arbitrary Nexus + // present in DNS. + warn!( + log, + "failed to get blueprint_executor status from Nexus"; + InlineErrorChain::new(&err), + ); + return Err(CondCheckError::NotYet); + } + }; let details = match status { LastResult::NeverCompleted => { @@ -313,7 +323,7 @@ async fn test_execute_expunged_zone_of_kind( // Make sure we're looking at the execution of the blueprint we care // about; otherwise, keep waiting. if details.target_id != planned_bp_2.id { - info!( + warn!( log, "still waiting for execution: executed blueprint ID \ does not match ID we're waiting for"; @@ -325,29 +335,23 @@ async fn test_execute_expunged_zone_of_kind( // Check that execution completely cleanly. if let Some(err) = details.execution_error { - return Err(CondCheckError::Failed(format!( - "execution error: {}", + warn!( + log, "execution had an error"; InlineErrorChain::new(&err), - ))); + ); + return Err(CondCheckError::NotYet); } match details.event_report { Ok(event_report) => { - match event_report_problems(event_report, log) { - // Success! - None => Ok(()), - // Some warnings when executing new zones are expected - // (e.g., failure to talk to a zone immediately after - // starting it), but we expect those to clear up - // quickly. - Some(EventReportProblems::HadWarnings) => { - Err(CondCheckError::NotYet) - } - Some(EventReportProblems::Fatal(err)) => { - Err(CondCheckError::Failed(format!( - "execution problem: {err}" - ))) - } + // If the event report indicates any problems, we'll try + // again - we expect to see some transient problems and + // warnings, but also expect them to clear up on their own + // pretty quickly. + if event_report_has_problems(event_report, log) { + Err(CondCheckError::NotYet) + } else { + Ok(()) } } Err(err) => Err(CondCheckError::Failed(format!( @@ -357,7 +361,7 @@ async fn test_execute_expunged_zone_of_kind( } }, &Duration::from_secs(1), - &Duration::from_secs(120), + &Duration::from_secs(180), ) .await .expect("waited for successful execution"); @@ -472,39 +476,35 @@ impl ZonesOfInterest { } } -enum EventReportProblems { - Fatal(&'static str), - HadWarnings, -} - -fn event_report_problems( +fn event_report_has_problems( event_report: EventReport, log: &Logger, -) -> Option { +) -> bool { let mut buf = EventBuffer::default(); buf.add_event_report(event_report); // Check for outright failure. let Some(summary) = buf.root_execution_summary() else { - return Some(EventReportProblems::Fatal( - "missing root_execution_summary", - )); + warn!(log, "event report missing root_execution_summary"); + return true; }; match summary.execution_status { ExecutionStatus::Terminal(info) => match info.kind { TerminalKind::Completed => (), - TerminalKind::Failed => { - return Some(EventReportProblems::Fatal("execution failed")); - } - TerminalKind::Aborted => { - return Some(EventReportProblems::Fatal("execution aborted")); + TerminalKind::Failed | TerminalKind::Aborted => { + warn!( + log, "execution ended with unexpected terminal status"; + "stats" => ?info.kind, + ); + return true; } }, - ExecutionStatus::NotStarted => { - return Some(EventReportProblems::Fatal("execution not started")); - } - ExecutionStatus::Running { .. } => { - return Some(EventReportProblems::Fatal("execution still running")); + ExecutionStatus::NotStarted | ExecutionStatus::Running { .. } => { + warn!( + log, "execution has unexpected execution status"; + "stats" => ?summary.execution_status, + ); + return true; } } @@ -518,8 +518,7 @@ fn event_report_problems( | StepOutcome::Skipped { .. } => (), StepOutcome::Warning { message, .. } => { warn!( - log, - "exeucution warning"; + log, "execution warning"; "message" => %message, "step" => %step_data.step_info().description, ); @@ -530,5 +529,5 @@ fn event_report_problems( } } - if had_warnings { Some(EventReportProblems::HadWarnings) } else { None } + had_warnings } From 4889ebcfc2bbc28b67d7e5841249fe04a5dcf44d Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 17 Mar 2026 15:16:11 -0400 Subject: [PATCH 10/14] better error logging in warnings --- .../execution/src/clickhouse.rs | 11 ++++---- nexus/reconfigurator/execution/src/lib.rs | 28 +++++++++---------- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/nexus/reconfigurator/execution/src/clickhouse.rs b/nexus/reconfigurator/execution/src/clickhouse.rs index 2fc1973343d..54c3787ccf5 100644 --- a/nexus/reconfigurator/execution/src/clickhouse.rs +++ b/nexus/reconfigurator/execution/src/clickhouse.rs @@ -5,6 +5,7 @@ //! Deployment of Clickhouse keeper and server nodes via clickhouse-admin running in //! deployed clickhouse zones. +use anyhow::Context; use anyhow::anyhow; use camino::Utf8PathBuf; use clickhouse_admin_keeper_client::Client as ClickhouseKeeperClient; @@ -259,11 +260,11 @@ where let admin_url = format!("http://{admin_addr}"); let log = opctx.log.new(slog::o!("admin_url" => admin_url.clone())); let client = ClickhouseSingleClient::new(&admin_url, log.clone()); - client.init_db().await.map(|_| ()).map_err(|e| { - anyhow!( - "failed to initialize single-node clickhouse database: {e}", - ) - }) + client + .init_db() + .await + .map(|_| ()) + .context("failed to initialize single-node clickhouse database") } else { Ok(()) } diff --git a/nexus/reconfigurator/execution/src/lib.rs b/nexus/reconfigurator/execution/src/lib.rs index bbe585cde57..f197f0f1c0a 100644 --- a/nexus/reconfigurator/execution/src/lib.rs +++ b/nexus/reconfigurator/execution/src/lib.rs @@ -374,7 +374,11 @@ fn register_support_bundle_failure_step<'a>( "support bundle expunge report: {report:?}" )) .build(), - Err(err) => StepWarning::new((), err.to_string()).build(), + Err(err) => StepWarning::new( + (), + InlineErrorChain::new(&err).to_string(), + ) + .build(), }; Ok(res) }, @@ -418,19 +422,12 @@ fn register_deploy_db_metadata_nexus_records_step<'a>( return StepSkipped::new((), "not running as Nexus").into(); }; - match database::deploy_db_metadata_nexus_records( + let result = database::deploy_db_metadata_nexus_records( opctx, &datastore, &blueprint, nexus_id, ) .await - { - Ok(()) => StepSuccess::new(()).into(), - Err(err) => StepWarning::new( - (), - err.context("ensuring db_metadata_nexus_state") - .to_string(), - ) - .into(), - } + .context("ensuring db_metadata_nexus_state"); + Ok(map_err_to_step_warning(result)) }, ) .register(); @@ -685,9 +682,12 @@ fn register_reassign_sagas_step<'a>( needs_saga_recovery, ), ), - Err(error) => ( - StepWarning::new(false, error.to_string()) - .build(), + Err(err) => ( + StepWarning::new( + false, + InlineErrorChain::new(&*err).to_string(), + ) + .build(), SagaReassignmentDone::Indeterminate, ), } From 0f24ae20a77558060fcbf6fad73d6d642ba0ed0f Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 17 Mar 2026 20:03:46 -0400 Subject: [PATCH 11/14] wait for cockroach health after expunging it --- Cargo.lock | 1 + live-tests/Cargo.toml | 1 + live-tests/tests/common/mod.rs | 17 ++ .../tests/test_execute_expunged_zone.rs | 234 ++++++++++++++---- 4 files changed, 210 insertions(+), 43 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 71a4616b5e6..d73bdf8f0ef 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8423,6 +8423,7 @@ version = "0.1.0" dependencies = [ "anyhow", "assert_matches", + "chrono", "dns-service-client", "dropshot", "futures", diff --git a/live-tests/Cargo.toml b/live-tests/Cargo.toml index ce63a17f087..b1a29348e04 100644 --- a/live-tests/Cargo.toml +++ b/live-tests/Cargo.toml @@ -15,6 +15,7 @@ omicron-workspace-hack.workspace = true [dev-dependencies] anyhow.workspace = true assert_matches.workspace = true +chrono.workspace = true dns-service-client.workspace = true dropshot.workspace = true futures.workspace = true diff --git a/live-tests/tests/common/mod.rs b/live-tests/tests/common/mod.rs index a22bda45e33..7044202ab41 100644 --- a/live-tests/tests/common/mod.rs +++ b/live-tests/tests/common/mod.rs @@ -68,6 +68,23 @@ impl LiveTestContext { &self.datastore } + /// Establish a new `DataStore` connection pointed at this deployed system's + /// database + /// + /// Most consumers should prefer `datastore()`, which returns a reference to + /// a `DataStore` constructed when this context was created. This method is + /// useful if a caller needs to reevaluate what Cockroach instances are + /// available in DNS (e.g., due to zone expungement) or needs a `DataStore` + /// instance that is not shared. + pub async fn new_datastore_connection( + &self, + ) -> anyhow::Result<(OpContext, Arc)> { + let log = &self.logctx.log; + let datastore = create_datastore(log, &self.resolver).await?; + let opctx = OpContext::for_tests(log.clone(), datastore.clone()); + Ok((opctx, datastore)) + } + /// Returns a client for a Nexus internal API at the given socket address pub fn specific_internal_nexus_client( &self, diff --git a/live-tests/tests/test_execute_expunged_zone.rs b/live-tests/tests/test_execute_expunged_zone.rs index fe19f206a97..1fc96abcc41 100644 --- a/live-tests/tests/test_execute_expunged_zone.rs +++ b/live-tests/tests/test_execute_expunged_zone.rs @@ -6,19 +6,24 @@ pub mod common; use crate::common::reconfigurator::blueprint_load_target_enabled; use anyhow::Context; +use anyhow::bail; +use chrono::Utc; use common::LiveTestContext; use common::reconfigurator::blueprint_edit_current_target_disabled; +use dns_service_client::ClientInfo; use live_tests_macros::live_test; +use nexus_lockstep_client::types::BackgroundTasksActivateRequest; use nexus_lockstep_client::types::BlueprintTargetSet; use nexus_lockstep_client::types::LastResult; use nexus_types::deployment::BlueprintZoneConfig; +use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::ReconfiguratorConfig; use nexus_types::deployment::ReconfiguratorConfigParam; +use nexus_types::inventory::Collection; use omicron_common::policy::COCKROACHDB_REDUNDANCY; use omicron_test_utils::dev::poll::CondCheckError; use omicron_test_utils::dev::poll::wait_for_condition; use omicron_uuid_kinds::BlueprintUuid; -use omicron_uuid_kinds::OmicronZoneUuid; use omicron_uuid_kinds::SledUuid; use serde::Deserialize; use sled_agent_types::inventory::ZoneKind; @@ -49,33 +54,12 @@ async fn test_execute_expunged_zone(lc: &LiveTestContext) { // check. match datastore.inventory_get_latest_collection(opctx).await { Ok(Some(collection)) => { - let expected_nodes = COCKROACHDB_REDUNDANCY; - let db_status = collection.cockroach_status; - if db_status.len() < expected_nodes { - panic!( - "refusing to run live test: \ - latest inventory only has status from {} cockroach nodes; \ - expected {expected_nodes}", - db_status.len() - ); - } - for (node_id, status) in db_status { - match status.ranges_underreplicated { - Some(0) => (), - _ => panic!( - "refusing to run live test: inventory reports {:?} \ - for ranges underreplicated on CRDB node {node_id}", - status.ranges_underreplicated, - ), - } - match status.liveness_live_nodes { - Some(n) if n == expected_nodes as u64 => (), - _ => panic!( - "refusing to run live test: inventory reports {:?} \ - for live nodes on CRDB node {node_id}", - status.liveness_live_nodes, - ), - } + if let Err(err) = + validate_cockroach_is_healthy_according_to_inventory( + &collection, + ) + { + panic!("refusing to run live test: {err:#}"); } } Ok(None) => panic!( @@ -133,7 +117,8 @@ async fn test_execute_expunged_zone_of_kind( let log = lc.log(); let initial_nexus_clients = lc.all_internal_nexus_clients().await.unwrap(); - let nexus = initial_nexus_clients.first().expect("internal Nexus client"); + let mut nexus = + initial_nexus_clients.first().expect("internal Nexus client"); // Safety check: We expect the going-in state to be "blueprint execution is // enabled". If it isn't, bail out and don't attempt to run this test. @@ -172,13 +157,35 @@ async fn test_execute_expunged_zone_of_kind( zone_kind, ); - let (sled_id, zone_id) = - zones_of_interest.pick_any_zone_to_expunge(); + let (sled_id, zone) = zones_of_interest.pick_any_zone_to_expunge(); builder - .sled_expunge_zone(sled_id, zone_id) + .sled_expunge_zone(sled_id, zone.id) .context("expunging zone")?; + // If we're expunging a Nexus, check and see if we should switch to + // a different Nexus for the remainder of this test. This is pretty + // hacky. + if let BlueprintZoneType::Nexus(nexus_cfg) = &zone.zone_type { + let client_baseurl = nexus.baseurl(); + let expunged_host = + format!("[{}]", nexus_cfg.lockstep_address().ip()); + info!( + log, + "checking current Nexus client's `baseurl` \ + against the Nexus zone we're expunging"; + "client_baseurl" => %client_baseurl, + "expunged_host" => %expunged_host, + ); + + if client_baseurl.contains(&expunged_host) { + nexus = initial_nexus_clients + .get(1) + .expect("at least two Nexus clients available"); + info!(log, "swapping to another Nexus client"); + } + } + orig_zones_of_interest = Some(zones_of_interest); Ok(()) @@ -222,10 +229,9 @@ async fn test_execute_expunged_zone_of_kind( // Edit the newly-planned blueprint and expunge the zone it just added. let (_, edit2) = blueprint_edit_current_target_disabled(log, nexus, |builder| { - let (sled_id, zone_id) = - plan1_zones_added.first().copied().unwrap(); + let (sled_id, zone) = plan1_zones_added.first().unwrap(); builder - .sled_expunge_zone(sled_id, zone_id) + .sled_expunge_zone(*sled_id, zone.id) .context("expunging zone")?; Ok(()) }) @@ -367,14 +373,20 @@ async fn test_execute_expunged_zone_of_kind( .expect("waited for successful execution"); // Log the IDs of the zone in this state. - let (sled_id, zone_id) = plan1_zones_added.first().copied().unwrap(); + let (sled_id, zone) = plan1_zones_added.first().unwrap(); info!( log, "got successful execution of blueprint with \ expunged-and-never-in-service zone"; "sled_id" => %sled_id, - "zone_id" => %zone_id, + "zone_id" => %zone.id, ); + + // If we just expunged a cockroach node, wait for the cluster to be healthy + // again before we return. + if zone_kind == ZoneKind::CockroachDb { + wait_for_cockroach_cluster_to_be_healthy(nexus, lc, log).await; + } } async fn disable_blueprint_planning( @@ -437,7 +449,7 @@ async fn disable_blueprint_execution( } struct ZonesOfInterest { - in_service_zones: BTreeSet<(SledUuid, OmicronZoneUuid)>, + in_service_zones: BTreeSet<(SledUuid, BlueprintZoneConfig)>, } impl ZonesOfInterest { @@ -452,26 +464,26 @@ impl ZonesOfInterest { continue; } - in_service_zones.insert((sled_id, zone_cfg.id)); + in_service_zones.insert((sled_id, zone_cfg.clone())); } Self { in_service_zones } } - fn pick_any_zone_to_expunge(&self) -> (SledUuid, OmicronZoneUuid) { + fn pick_any_zone_to_expunge(&self) -> (SledUuid, BlueprintZoneConfig) { self.in_service_zones .first() - .copied() + .cloned() .expect("no zones of relevant kind found") } fn zones_added_since( &self, other: &Self, - ) -> BTreeSet<(SledUuid, OmicronZoneUuid)> { + ) -> BTreeSet<(SledUuid, BlueprintZoneConfig)> { self.in_service_zones .difference(&other.in_service_zones) - .copied() + .cloned() .collect() } } @@ -531,3 +543,139 @@ fn event_report_has_problems( had_warnings } + +async fn wait_for_cockroach_cluster_to_be_healthy( + nexus: &nexus_lockstep_client::Client, + lc: &LiveTestContext, + log: &Logger, +) { + let wait_start_time = Utc::now(); + let inventory_collector_bgtask = vec!["inventory_collection".to_string()]; + + info!(log, "starting to wait for cockroachdb to be reported healthy"); + wait_for_condition( + || async { + // Attempt to activate inventory collection; ignore any errors here, + // since activation might fail transiently. + match nexus + .bgtask_activate(&BackgroundTasksActivateRequest { + bgtask_names: inventory_collector_bgtask.clone(), + }) + .await + { + Ok(_) => (), + Err(err) => { + warn!( + log, "failed to activate inventory collector bg task"; + InlineErrorChain::new(&err), + ); + } + } + + // Get a new datastore - if we're waiting for cockroach to be + // healthy, it's possible we expunged the cockroach that `lc` cached + // and returns via its `.datastore()` method. Look up cockroach in + // DNS again in every iteration of this check. + let (opctx, datastore) = match lc.new_datastore_connection().await { + Ok((opctx, datastore)) => (opctx, datastore), + Err(err) => { + warn!( + log, "failed to establish new datastore connection"; + InlineErrorChain::new(&*err), + ); + return Err(CondCheckError::NotYet); + } + }; + + // Load the latest inventory collection. + let collection_result = + datastore.inventory_get_latest_collection(&opctx).await; + + // Cleanly terminate this newly-created datastore. + datastore.terminate().await; + + let collection = match collection_result { + Ok(Some(collection)) => collection, + // This should never happen: we know from starting the test + // that at least one inventory collection already existed. + Ok(None) => { + return Err(CondCheckError::Failed( + "no inventory collections exist?!", + )); + } + Err(err) => { + warn!( + log, "failed to load latest inventory collection"; + InlineErrorChain::new(&err), + ); + return Err(CondCheckError::NotYet); + } + }; + + // Wait until the inventory collection is sufficiently new. + if collection.time_started < wait_start_time { + info!( + log, "waiting for new inventory collection"; + "latest-collection-started-at" => %collection.time_started, + "must-be-newer-than" => %wait_start_time, + ); + return Err(CondCheckError::NotYet); + } + + // Is cockroach healthy? + match validate_cockroach_is_healthy_according_to_inventory( + &collection, + ) { + Ok(()) => Ok(()), + Err(err) => { + warn!( + log, "cockroach cluster is not yet healthy"; + InlineErrorChain::new(&*err), + ); + Err(CondCheckError::NotYet) + } + } + }, + &Duration::from_secs(1), + // Even on freshly-set-up systems, it can take more than 10 minutes for + // CRDB to become healthy after an expungement. Is 20 generous enough? + &Duration::from_secs(20 * 60), + ) + .await + .expect("cockroach cluster failed to become healthy sufficently quickly"); +} + +fn validate_cockroach_is_healthy_according_to_inventory( + collection: &Collection, +) -> anyhow::Result<()> { + let expected_nodes = COCKROACHDB_REDUNDANCY; + + let db_status = &collection.cockroach_status; + if db_status.len() < expected_nodes { + bail!( + "latest inventory only has status from {} cockroach nodes; \ + expected {expected_nodes}", + db_status.len() + ); + } + + for (node_id, status) in db_status { + match status.ranges_underreplicated { + Some(0) => (), + _ => bail!( + "inventory reports {:?} for ranges underreplicated on \ + CRDB node {node_id}", + status.ranges_underreplicated, + ), + } + match status.liveness_live_nodes { + Some(n) if n == expected_nodes as u64 => (), + _ => bail!( + "inventory reports {:?} for live nodes on CRDB node {node_id}", + status.liveness_live_nodes, + ), + } + } + + Ok(()) +} From 48695aa8d420f6acb2a543c2ac515a49036a8feb Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 17 Mar 2026 20:16:41 -0400 Subject: [PATCH 12/14] readme fixes --- live-tests/README.adoc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/live-tests/README.adoc b/live-tests/README.adoc index 3042eb4d0c9..c18b52fcd85 100644 --- a/live-tests/README.adoc +++ b/live-tests/README.adoc @@ -22,7 +22,7 @@ First, deploy Omicron using `a4x2` or one of the hardware test rigs. Ensure the system's target blueprint is enabled. The live tests require this to avoid a case where the live tests generate blueprints based on a target blueprint that is not current, and then make a bunch of changes to the system unrelated to the tests. -Ensure the system's blueprint planner is disabled. The live tests will generate various blueprints with potentially-questiable content, and we don't want the planner to run mid-test and attempt to "fix" the state of the system. +Ensure the system's blueprint planner is disabled. Live tests may generate various blueprints with content that won't make sense to the planner, and we don't want the planner to run mid-test and attempt to "fix" the state of the system. On a fresh system, you will have to disable the blueprint planner yourself: @@ -30,8 +30,6 @@ On a fresh system, you will have to disable the blueprint planner yourself: omdb --destructive nexus reconfigurator-config set --planner-enabled false ``` -This essentially enables reconfigurator, causing it to constantly try to make the system match its target blueprint. You only need to do this once in the lifetime of the system, not every time you re-run the live tests. - At this point the system is prepared for testing. In your Omicron workspace, run `cargo xtask live-tests` to build an archive and then follow the instructions: ``` From aa0291303637ccf06834b4926e47e98d1bf21f3e Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 17 Mar 2026 20:20:49 -0400 Subject: [PATCH 13/14] minor cleanup --- live-tests/tests/common/reconfigurator.rs | 2 +- live-tests/tests/test_execute_expunged_zone.rs | 13 ++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/live-tests/tests/common/reconfigurator.rs b/live-tests/tests/common/reconfigurator.rs index 54cca720763..2eff8591193 100644 --- a/live-tests/tests/common/reconfigurator.rs +++ b/live-tests/tests/common/reconfigurator.rs @@ -183,7 +183,7 @@ where target_id: blueprint2.id, }) .await - .expect("setting new target"); + .context("setting new target")?; info!(log, "finished editing target blueprint"; "old_target_id" => %blueprint1.id, "new_target_id" => %blueprint2.id, diff --git a/live-tests/tests/test_execute_expunged_zone.rs b/live-tests/tests/test_execute_expunged_zone.rs index 1fc96abcc41..bf2065ad7f8 100644 --- a/live-tests/tests/test_execute_expunged_zone.rs +++ b/live-tests/tests/test_execute_expunged_zone.rs @@ -10,7 +10,6 @@ use anyhow::bail; use chrono::Utc; use common::LiveTestContext; use common::reconfigurator::blueprint_edit_current_target_disabled; -use dns_service_client::ClientInfo; use live_tests_macros::live_test; use nexus_lockstep_client::types::BackgroundTasksActivateRequest; use nexus_lockstep_client::types::BlueprintTargetSet; @@ -50,7 +49,7 @@ async fn test_execute_expunged_zone(lc: &LiveTestContext) { // Safety check: If running this test multiple times, we may leave behind // underreplicated cockroach ranges. We shouldn't attempt to proceed if - // those haven't been repair yet. Get the latest inventory collection and + // those haven't been repaired yet. Get the latest inventory collection and // check. match datastore.inventory_get_latest_collection(opctx).await { Ok(Some(collection)) => { @@ -143,7 +142,7 @@ async fn test_execute_expunged_zone_of_kind( // * One zone that did not exist the last time a blueprint was executed // exists in the blueprint but with the expunged disposition // * One zone that did not exist the last time a blueprint was executed - // exists and is should be put into service + // exists and should be put into service // Disable planning and execution. disable_blueprint_planning(nexus, log).await; @@ -339,7 +338,7 @@ async fn test_execute_expunged_zone_of_kind( return Err(CondCheckError::NotYet); } - // Check that execution completely cleanly. + // Check that execution completed cleanly. if let Some(err) = details.execution_error { warn!( log, "execution had an error"; @@ -505,8 +504,8 @@ fn event_report_has_problems( TerminalKind::Completed => (), TerminalKind::Failed | TerminalKind::Aborted => { warn!( - log, "execution ended with unexpected terminal status"; - "stats" => ?info.kind, + log, "execution ended with unexpected terminal kind"; + "kind" => ?info.kind, ); return true; } @@ -642,7 +641,7 @@ async fn wait_for_cockroach_cluster_to_be_healthy( &Duration::from_secs(20 * 60), ) .await - .expect("cockroach cluster failed to become healthy sufficently quickly"); + .expect("cockroach cluster failed to become healthy sufficiently quickly"); } fn validate_cockroach_is_healthy_according_to_inventory( From bb4505a21d82d324df13bd26211bf041996a4a29 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 17 Mar 2026 20:33:08 -0400 Subject: [PATCH 14/14] missing import --- live-tests/tests/test_execute_expunged_zone.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/live-tests/tests/test_execute_expunged_zone.rs b/live-tests/tests/test_execute_expunged_zone.rs index bf2065ad7f8..e572d8bbe54 100644 --- a/live-tests/tests/test_execute_expunged_zone.rs +++ b/live-tests/tests/test_execute_expunged_zone.rs @@ -11,6 +11,7 @@ use chrono::Utc; use common::LiveTestContext; use common::reconfigurator::blueprint_edit_current_target_disabled; use live_tests_macros::live_test; +use nexus_lockstep_client::ClientInfo; use nexus_lockstep_client::types::BackgroundTasksActivateRequest; use nexus_lockstep_client::types::BlueprintTargetSet; use nexus_lockstep_client::types::LastResult;