From 5245858d476bcb9dc8b139984b2e09f83fec316f Mon Sep 17 00:00:00 2001 From: Rain Date: Wed, 18 Mar 2026 22:51:22 -0700 Subject: [PATCH] [spr] initial version Created using spr 1.3.6-beta.1 --- Cargo.lock | 12 + Cargo.toml | 3 + dev-tools/ls-apis-shared/Cargo.toml | 13 + .../ls-apis-shared/src/deployment_unit_dag.rs | 43 +++ dev-tools/ls-apis-shared/src/lib.rs | 16 + dev-tools/ls-apis/Cargo.toml | 1 + dev-tools/ls-apis/api-manifest.toml | 10 + dev-tools/ls-apis/src/api_metadata.rs | 39 +- dev-tools/ls-apis/src/bin/ls-apis.rs | 28 +- dev-tools/ls-apis/src/system_apis.rs | 70 +++- .../tests/output/deployment-unit-dag.toml | 50 +++ dev-tools/ls-apis/tests/test_dependencies.rs | 12 + nexus/inventory/src/builder.rs | 11 +- nexus/reconfigurator/planning/Cargo.toml | 2 + nexus/reconfigurator/planning/src/system.rs | 33 +- .../tests/integration_tests/planner.rs | 341 ++++++++++++++++++ 16 files changed, 646 insertions(+), 38 deletions(-) create mode 100644 dev-tools/ls-apis-shared/Cargo.toml create mode 100644 dev-tools/ls-apis-shared/src/deployment_unit_dag.rs create mode 100644 dev-tools/ls-apis-shared/src/lib.rs create mode 100644 dev-tools/ls-apis/tests/output/deployment-unit-dag.toml diff --git a/Cargo.lock b/Cargo.lock index 8bc4b88fc38..8f3a543726d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6326,6 +6326,15 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" +[[package]] +name = "ls-apis-shared" +version = "0.1.0" +dependencies = [ + "newtype_derive", + "omicron-workspace-hack", + "serde", +] + [[package]] name = "lzma-sys" version = "0.1.20" @@ -7380,6 +7389,7 @@ dependencies = [ "internal-dns-types", "ipnet", "itertools 0.14.0", + "ls-apis-shared", "maplit", "nexus-config", "nexus-inventory", @@ -7408,6 +7418,7 @@ dependencies = [ "test-strategy", "thiserror 2.0.18", "tokio", + "toml 0.8.23", "tufaceous", "tufaceous-artifact", "tufaceous-lib", @@ -8465,6 +8476,7 @@ dependencies = [ "iddqd", "indent_write", "itertools 0.14.0", + "ls-apis-shared", "newtype_derive", "omicron-test-utils", "omicron-workspace-hack", diff --git a/Cargo.toml b/Cargo.toml index bc096d12ee1..712a3ffdb85 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,6 +41,7 @@ members = [ "dev-tools/downloader", "dev-tools/dropshot-apis", "dev-tools/ls-apis", + "dev-tools/ls-apis-shared", "dev-tools/mgs-dev", "dev-tools/omdb", "dev-tools/omicron-dev", @@ -217,6 +218,7 @@ default-members = [ "dev-tools/downloader", "dev-tools/dropshot-apis", "dev-tools/ls-apis", + "dev-tools/ls-apis-shared", "dev-tools/mgs-dev", "dev-tools/omdb", "dev-tools/omicron-dev", @@ -579,6 +581,7 @@ linear-map = "1.2.0" live-tests-macros = { path = "live-tests/macros" } lldpd_client = { git = "https://github.com/oxidecomputer/lldp", package = "lldpd-client" } lldp_protocol = { git = "https://github.com/oxidecomputer/lldp", package = "protocol" } +ls-apis-shared = { path = "dev-tools/ls-apis-shared" } macaddr = { version = "1.0.1", features = ["serde_std"] } maplit = "1.0.2" newtype_derive = "0.1.6" diff --git a/dev-tools/ls-apis-shared/Cargo.toml b/dev-tools/ls-apis-shared/Cargo.toml new file mode 100644 index 00000000000..4fbd6445b2c --- /dev/null +++ b/dev-tools/ls-apis-shared/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "ls-apis-shared" +version = "0.1.0" +edition.workspace = true +license = "MPL-2.0" + +[lints] +workspace = true + +[dependencies] +newtype_derive.workspace = true +omicron-workspace-hack.workspace = true +serde.workspace = true diff --git a/dev-tools/ls-apis-shared/src/deployment_unit_dag.rs b/dev-tools/ls-apis-shared/src/deployment_unit_dag.rs new file mode 100644 index 00000000000..72e7e02998c --- /dev/null +++ b/dev-tools/ls-apis-shared/src/deployment_unit_dag.rs @@ -0,0 +1,43 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use serde::{Deserialize, Serialize}; + +/// The path to omicron-ls-apis relative to the workspace root. +pub const OMICRON_LS_APIS_PATH: &str = "dev-tools/ls-apis"; + +/// The path to the `deployment_unit_dag.toml` file relative to the +/// omicron-ls-apis directory. +pub const DEPLOYMENT_UNIT_DAG_PATH: &str = + "tests/output/deployment-unit-dag.toml"; + +/// Short, machine-friendly identifier for a deployment unit +/// (e.g. "nexus", "dns_server"). +#[derive( + Clone, Deserialize, Serialize, Hash, Ord, PartialOrd, Eq, PartialEq, +)] +#[serde(transparent)] +pub struct DeploymentUnitId(String); +NewtypeDebug! { () pub struct DeploymentUnitId(String); } +NewtypeDeref! { () pub struct DeploymentUnitId(String); } +NewtypeDisplay! { () pub struct DeploymentUnitId(String); } +NewtypeFrom! { () pub struct DeploymentUnitId(String); } + +/// A single directed edge in the deployment unit dependency DAG. +/// +/// `consumer` depends on `producer`: the producer must be fully updated +/// before the consumer starts updating. +#[derive( + Clone, Debug, Deserialize, Serialize, Eq, PartialEq, Ord, PartialOrd, +)] +pub struct DagEdge { + pub consumer: DeploymentUnitId, + pub producer: DeploymentUnitId, +} + +/// Top-level structure of the `deployment_unit_dag.toml` file. +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct DagEdgesFile { + pub edges: Vec, +} diff --git a/dev-tools/ls-apis-shared/src/lib.rs b/dev-tools/ls-apis-shared/src/lib.rs new file mode 100644 index 00000000000..87c7432589a --- /dev/null +++ b/dev-tools/ls-apis-shared/src/lib.rs @@ -0,0 +1,16 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Types shared between `omicron-ls-apis` and consumers of its output +//! (e.g. the reconfigurator planner tests). + +#[macro_use] +extern crate newtype_derive; + +mod deployment_unit_dag; + +pub use deployment_unit_dag::{ + DEPLOYMENT_UNIT_DAG_PATH, DagEdge, DagEdgesFile, DeploymentUnitId, + OMICRON_LS_APIS_PATH, +}; diff --git a/dev-tools/ls-apis/Cargo.toml b/dev-tools/ls-apis/Cargo.toml index b1942608144..7f76d911ffb 100644 --- a/dev-tools/ls-apis/Cargo.toml +++ b/dev-tools/ls-apis/Cargo.toml @@ -15,6 +15,7 @@ clap.workspace = true iddqd.workspace = true indent_write.workspace = true itertools.workspace = true +ls-apis-shared.workspace = true newtype_derive.workspace = true parse-display.workspace = true petgraph.workspace = true diff --git a/dev-tools/ls-apis/api-manifest.toml b/dev-tools/ls-apis/api-manifest.toml index 4caf0db8491..66e2644e503 100644 --- a/dev-tools/ls-apis/api-manifest.toml +++ b/dev-tools/ls-apis/api-manifest.toml @@ -67,6 +67,7 @@ ignored_non_clients = [ # The host OS includes Sled Agent, Propolis, and all the components that get # bundled into the switch zone. [[deployment_units]] +id = "host_os" label = "Host OS" packages = [ "omicron-sled-agent", @@ -83,20 +84,24 @@ packages = [ # Installinator gets packaged into its own host OS image. [[deployment_units]] +id = "installinator" label = "Installinator" packages = [ "installinator" ] # The rest of these get bundled by standard control plane zone images. [[deployment_units]] +id = "crucible" label = "Crucible" packages = [ "crucible-agent", "crucible-downstairs" ] [[deployment_units]] +id = "crucible_pantry" label = "Crucible Pantry" packages = [ "crucible-pantry" ] [[deployment_units]] +id = "cockroach" label = "Cockroach" packages = [ "omicron-cockroach-admin" ] @@ -104,22 +109,27 @@ packages = [ "omicron-cockroach-admin" ] # our purposes, and the tooling doesn't support multiple deployment units # that each expose a particular service. [[deployment_units]] +id = "clickhouse" label = "Clickhouse (single-node) / Clickhouse Server (multi-node) / Clickhouse Keeper (multi-node)" packages = [ "omicron-clickhouse-admin" ] [[deployment_units]] +id = "dns_server" label = "DNS Server" packages = [ "dns-server" ] [[deployment_units]] +id = "nexus" label = "Nexus" packages = [ "omicron-nexus" ] [[deployment_units]] +id = "ntp" label = "NTP" packages = [ "omicron-ntp-admin" ] [[deployment_units]] +id = "oximeter" label = "Oximeter" packages = [ "oximeter-collector" ] diff --git a/dev-tools/ls-apis/src/api_metadata.rs b/dev-tools/ls-apis/src/api_metadata.rs index 0b26f19e750..f786d07151c 100644 --- a/dev-tools/ls-apis/src/api_metadata.rs +++ b/dev-tools/ls-apis/src/api_metadata.rs @@ -14,6 +14,7 @@ use anyhow::{Result, bail}; use iddqd::IdOrdItem; use iddqd::IdOrdMap; use iddqd::id_upcast; +use ls_apis_shared::DeploymentUnitId; use serde::Deserialize; use std::borrow::Borrow; use std::collections::BTreeMap; @@ -27,7 +28,7 @@ use std::collections::BTreeSet; #[serde(try_from = "RawApiMetadata")] pub struct AllApiMetadata { apis: BTreeMap, - deployment_units: BTreeMap, + deployment_units: IdOrdMap, dependency_rules: BTreeMap>, ignored_non_clients: BTreeSet, intra_deployment_unit_only_edges: Vec, @@ -42,10 +43,18 @@ impl AllApiMetadata { /// Iterate over the deployment units defined in the metadata pub fn deployment_units( &self, - ) -> impl Iterator { + ) -> impl Iterator { self.deployment_units.iter() } + /// Look up a deployment unit's info by its ID + pub fn deployment_unit_info( + &self, + id: &DeploymentUnitId, + ) -> Option<&DeploymentUnitInfo> { + self.deployment_units.get(id) + } + /// Iterate over the package names for all the APIs' clients pub fn client_pkgnames(&self) -> impl Iterator { self.apis.keys() @@ -55,7 +64,7 @@ impl AllApiMetadata { pub fn server_components( &self, ) -> impl Iterator { - self.deployment_units.values().flat_map(|d| d.packages.iter()) + self.deployment_units.iter().flat_map(|d| d.packages.iter()) } /// Look up details about an API based on its client package name @@ -166,14 +175,12 @@ impl TryFrom for AllApiMetadata { } } - let mut deployment_units = BTreeMap::new(); + let mut deployment_units = IdOrdMap::new(); for info in raw.deployment_units { - if let Some(previous) = - deployment_units.insert(info.label.clone(), info) - { + if let Err(e) = deployment_units.insert_unique(info) { bail!( - "duplicate deployment unit in API metadata: {}", - &previous.label, + "duplicate deployment unit id in API metadata: {}", + e.new_item().id, ); } } @@ -206,7 +213,7 @@ impl TryFrom for AllApiMetadata { // Validate that IDU-only edges reference only known server components // and APIs. let known_components: BTreeSet<_> = - deployment_units.values().flat_map(|u| u.packages.iter()).collect(); + deployment_units.iter().flat_map(|u| u.packages.iter()).collect(); for edge in &raw.intra_deployment_unit_only_edges { if !known_components.contains(&edge.server) { bail!( @@ -411,12 +418,22 @@ pub enum ApiConsumerStatus { #[derive(Deserialize)] #[serde(deny_unknown_fields)] pub struct DeploymentUnitInfo { - /// human-readable label, also used as primary key + /// short, machine-friendly identifier (e.g. "nexus", "dns_server") + pub id: DeploymentUnitId, + /// human-readable label for display pub label: DeploymentUnitName, /// list of Rust packages that are shipped in this unit pub packages: Vec, } +impl IdOrdItem for DeploymentUnitInfo { + type Key<'a> = &'a DeploymentUnitId; + fn key(&self) -> Self::Key<'_> { + &self.id + } + id_upcast!(); +} + #[derive(Deserialize)] #[serde(deny_unknown_fields)] pub struct DependencyFilterRule { diff --git a/dev-tools/ls-apis/src/bin/ls-apis.rs b/dev-tools/ls-apis/src/bin/ls-apis.rs index 7768c988e16..0e72e108d26 100644 --- a/dev-tools/ls-apis/src/bin/ls-apis.rs +++ b/dev-tools/ls-apis/src/bin/ls-apis.rs @@ -8,6 +8,7 @@ use anyhow::{Context, Result, bail}; use camino::Utf8PathBuf; use clap::{Args, Parser, Subcommand}; use indent_write::indentable::Indentable; +use ls_apis_shared::DagEdgesFile; use omicron_ls_apis::{ AllApiMetadata, ApiConsumerStatus, ApiDependencyFilter, ApiMetadata, FailedConsumerCheck, LoadArgs, ServerComponentName, SystemApis, @@ -38,6 +39,8 @@ enum Cmds { Apis(ShowDepsArgs), /// check the update DAG and propose changes Check, + /// print deployment unit DAG edges as TOML + DagEdges, /// print out APIs exported and consumed by each deployment unit DeploymentUnits(DotArgs), /// print out APIs exported and consumed, by server component @@ -86,11 +89,26 @@ fn main() -> Result<()> { Cmds::Adoc => run_adoc(&apis), Cmds::Apis(args) => run_apis(&apis, args), Cmds::Check => run_check(&apis), + Cmds::DagEdges => run_dag_edges(&apis), Cmds::DeploymentUnits(args) => run_deployment_units(&apis, args), Cmds::Servers(args) => run_servers(&apis, args), } } +fn run_dag_edges(apis: &SystemApis) -> Result<()> { + let edges = apis.deployment_unit_dag_edges()?; + let output = DagEdgesFile { edges }; + let toml_str = toml::to_string_pretty(&output) + .context("serializing DAG edges as TOML")?; + print!( + "# BEGIN @generated server-side deployment unit DAG edges.\n\ + # To regenerate, run `EXPECTORATE=overwrite cargo nextest run -p omicron-ls-apis`.\n\ + \n\ + {toml_str}" + ); + Ok(()) +} + fn run_adoc(apis: &SystemApis) -> Result<()> { println!("// BEGIN auto-generated by Omicron's `cargo xtask ls-apis adoc`"); println!("// DO NOT EDIT."); @@ -212,9 +230,13 @@ fn run_deployment_units(apis: &SystemApis, args: DotArgs) -> Result<()> { OutputFormat::Dot => println!("{}", apis.dot_by_unit(args.filter)?), OutputFormat::Text => { let metadata = apis.api_metadata(); - for unit in apis.deployment_units() { - let server_components = apis.deployment_unit_servers(unit)?; - println!("{}", unit); + for unit_id in apis.deployment_units() { + let info = metadata + .deployment_unit_info(unit_id) + .expect("deployment unit info exists"); + let server_components = + apis.deployment_unit_servers(unit_id)?; + println!("{}", info.label); print_server_components( apis, metadata, diff --git a/dev-tools/ls-apis/src/system_apis.rs b/dev-tools/ls-apis/src/system_apis.rs index dcc43c4b80c..ed7e7faeb02 100644 --- a/dev-tools/ls-apis/src/system_apis.rs +++ b/dev-tools/ls-apis/src/system_apis.rs @@ -6,7 +6,6 @@ //! the Oxide system use crate::ClientPackageName; -use crate::DeploymentUnitName; use crate::LoadArgs; use crate::ServerComponentName; use crate::ServerPackageName; @@ -28,10 +27,13 @@ use iddqd::IdOrdItem; use iddqd::IdOrdMap; use iddqd::id_upcast; use itertools::Itertools; +use ls_apis_shared::DagEdge; +use ls_apis_shared::DeploymentUnitId; use parse_display::{Display, FromStr}; use petgraph::dot::Dot; use petgraph::graph::Graph; use petgraph::graph::NodeIndex; +use petgraph::visit::EdgeRef; use std::collections::BTreeMap; use std::collections::BTreeSet; use std::fmt; @@ -41,10 +43,10 @@ use std::fmt; pub struct SystemApis { /// maps a deployment unit to its list of service components unit_server_components: - BTreeMap>, + BTreeMap>, /// maps a server component to the deployment unit that it's part of /// (reverse of `unit_server_components`) - server_component_units: BTreeMap, + server_component_units: BTreeMap, /// maps a server component to the list of APIs it uses (using the client /// package name as a primary key for the API) @@ -161,12 +163,10 @@ impl SystemApis { // unit, the deployment unit for any component, the servers in each // component, etc. let mut tracker = ServerComponentsTracker::new(&server_packages); - for (deployment_unit, dunit_info) in api_metadata.deployment_units() { + for dunit_info in api_metadata.deployment_units() { for dunit_pkg in &dunit_info.packages { - tracker.found_deployment_unit_package( - deployment_unit, - dunit_pkg, - )?; + tracker + .found_deployment_unit_package(&dunit_info.id, dunit_pkg)?; let (workspace, server_pkg) = workspaces.find_package_workspace(dunit_pkg)?; let dep_path = DepPath::for_pkg(server_pkg.id.clone()); @@ -346,9 +346,7 @@ impl SystemApis { } /// Iterate over the deployment units - pub fn deployment_units( - &self, - ) -> impl Iterator { + pub fn deployment_units(&self) -> impl Iterator { self.unit_server_components.keys() } @@ -356,14 +354,14 @@ impl SystemApis { pub fn server_component_unit( &self, server_component: &ServerComponentName, - ) -> Option<&DeploymentUnitName> { + ) -> Option<&DeploymentUnitId> { self.server_component_units.get(server_component) } /// For one deployment unit, iterate over the servers contained in it pub fn deployment_unit_servers( &self, - unit: &DeploymentUnitName, + unit: &DeploymentUnitId, ) -> Result + use<'_>> { Ok(self .unit_server_components @@ -536,7 +534,41 @@ impl SystemApis { pub fn dot_by_unit(&self, filter: ApiDependencyFilter) -> Result { let (graph, _) = self.make_deployment_unit_graph(filter, EdgeFilter::All)?; - Ok(Dot::new(&graph).to_string()) + // Map IDs to human-readable labels for dot output. + let labeled = graph.map( + |_, id| { + &self + .api_metadata + .deployment_unit_info(id) + .expect("deployment unit info exists") + .label + }, + |_, edge| *edge, + ); + Ok(Dot::new(&labeled).to_string()) + } + + /// Returns the edges of the deployment unit dependency DAG. + /// + /// Each edge represents a consumer that depends on a producer, meaning + /// the producer must be fully updated before the consumer starts + /// updating. + pub fn deployment_unit_dag_edges(&self) -> Result> { + let idu_only_edges = self.validate_idu_only_edges()?; + let (graph, _nodes) = self.make_deployment_unit_graph( + ApiDependencyFilter::Default, + EdgeFilter::DagOnly(&idu_only_edges), + )?; + + let mut edges = BTreeSet::new(); + for edge_ref in graph.edge_references() { + edges.insert(DagEdge { + consumer: graph[edge_ref.source()].clone(), + producer: graph[edge_ref.target()].clone(), + }); + } + + Ok(edges.into_iter().collect()) } // The complex type below is only used in this one place: the return value @@ -547,8 +579,8 @@ impl SystemApis { dependency_filter: ApiDependencyFilter, edge_filter: EdgeFilter<'_>, ) -> Result<( - Graph<&DeploymentUnitName, &ClientPackageName>, - BTreeMap<&DeploymentUnitName, NodeIndex>, + Graph<&DeploymentUnitId, &ClientPackageName>, + BTreeMap<&DeploymentUnitId, NodeIndex>, )> { let mut graph = Graph::new(); let nodes: BTreeMap<_, _> = self @@ -1300,9 +1332,9 @@ struct ServerComponentsTracker<'a> { &'a BTreeMap>, // outputs (structures that we're building up) - server_component_units: BTreeMap, + server_component_units: BTreeMap, unit_server_components: - BTreeMap>, + BTreeMap>, api_producers: BTreeMap, } @@ -1399,7 +1431,7 @@ impl<'a> ServerComponentsTracker<'a> { /// packages (server components) pub fn found_deployment_unit_package( &mut self, - deployment_unit: &DeploymentUnitName, + deployment_unit: &DeploymentUnitId, server_component: &ServerComponentName, ) -> Result<()> { if let Some(previous) = self diff --git a/dev-tools/ls-apis/tests/output/deployment-unit-dag.toml b/dev-tools/ls-apis/tests/output/deployment-unit-dag.toml new file mode 100644 index 00000000000..ef01bd77b1d --- /dev/null +++ b/dev-tools/ls-apis/tests/output/deployment-unit-dag.toml @@ -0,0 +1,50 @@ +# BEGIN @generated server-side deployment unit DAG edges. +# To regenerate, run `EXPECTORATE=overwrite cargo nextest run -p omicron-ls-apis`. + +[[edges]] +consumer = "host_os" +producer = "cockroach" + +[[edges]] +consumer = "host_os" +producer = "dns_server" + +[[edges]] +consumer = "host_os" +producer = "ntp" + +[[edges]] +consumer = "installinator" +producer = "host_os" + +[[edges]] +consumer = "nexus" +producer = "clickhouse" + +[[edges]] +consumer = "nexus" +producer = "cockroach" + +[[edges]] +consumer = "nexus" +producer = "crucible" + +[[edges]] +consumer = "nexus" +producer = "crucible_pantry" + +[[edges]] +consumer = "nexus" +producer = "dns_server" + +[[edges]] +consumer = "nexus" +producer = "host_os" + +[[edges]] +consumer = "nexus" +producer = "ntp" + +[[edges]] +consumer = "nexus" +producer = "oximeter" diff --git a/dev-tools/ls-apis/tests/test_dependencies.rs b/dev-tools/ls-apis/tests/test_dependencies.rs index 3c6a4486e82..c6dd2a2570d 100644 --- a/dev-tools/ls-apis/tests/test_dependencies.rs +++ b/dev-tools/ls-apis/tests/test_dependencies.rs @@ -9,6 +9,7 @@ //! //! This isn't (supposed to be) a test for the `ls-apis` tool itself. +use ls_apis_shared::DEPLOYMENT_UNIT_DAG_PATH; use omicron_test_utils::dev::test_cmds::EXIT_SUCCESS; use omicron_test_utils::dev::test_cmds::assert_exit_code; use omicron_test_utils::dev::test_cmds::path_to_executable; @@ -38,3 +39,14 @@ fn test_api_check() { println!("stderr:\n------\n{}\n-----", stderr_text); expectorate::assert_contents("tests/api_check.out", &stdout_text); } + +#[test] +fn test_deployment_unit_dag_edges() { + let cmd_path = path_to_executable(CMD_LS_APIS); + let exec = subprocess::Exec::cmd(cmd_path).arg("dag-edges"); + let (exit_status, stdout_text, stderr_text) = run_command(exec); + assert_exit_code(exit_status, EXIT_SUCCESS, &stderr_text); + + println!("stderr:\n------\n{}\n-----", stderr_text); + expectorate::assert_contents(DEPLOYMENT_UNIT_DAG_PATH, &stdout_text); +} diff --git a/nexus/inventory/src/builder.rs b/nexus/inventory/src/builder.rs index b5971cace24..7109d9f867a 100644 --- a/nexus/inventory/src/builder.rs +++ b/nexus/inventory/src/builder.rs @@ -708,7 +708,7 @@ impl CollectionBuilder { .context("NTP service reported time multiple times") } - /// Record metrics from a CockroachDB node + /// Record metrics from a CockroachDB node. pub fn found_cockroach_metrics( &mut self, node_id: InternalNodeId, @@ -719,6 +719,15 @@ impl CollectionBuilder { metrics.get_metric_unsigned(CockroachMetric::RangesUnderreplicated); status.liveness_live_nodes = metrics.get_metric_unsigned(CockroachMetric::LivenessLiveNodes); + self.found_cockroach_status(node_id, status); + } + + /// Record pre-built status for a CockroachDB node. + pub fn found_cockroach_status( + &mut self, + node_id: InternalNodeId, + status: CockroachStatus, + ) { self.cockroach_status.insert(node_id, status); } diff --git a/nexus/reconfigurator/planning/Cargo.toml b/nexus/reconfigurator/planning/Cargo.toml index 49714064cd5..69d161de3ce 100644 --- a/nexus/reconfigurator/planning/Cargo.toml +++ b/nexus/reconfigurator/planning/Cargo.toml @@ -62,6 +62,7 @@ expectorate.workspace = true hex-literal.workspace = true hickory-resolver.workspace = true internal-dns-types.workspace = true +ls-apis-shared.workspace = true maplit.workspace = true nexus-reconfigurator-simulation.workspace = true omicron-common = { workspace = true, features = ["testing"] } @@ -70,3 +71,4 @@ proptest.workspace = true reconfigurator-cli.workspace = true semver.workspace = true test-strategy.workspace = true +toml.workspace = true diff --git a/nexus/reconfigurator/planning/src/system.rs b/nexus/reconfigurator/planning/src/system.rs index 7dfc80f07d5..c670874aa22 100644 --- a/nexus/reconfigurator/planning/src/system.rs +++ b/nexus/reconfigurator/planning/src/system.rs @@ -1121,14 +1121,39 @@ impl SystemDescription { ).unwrap(); } - // TODO: We may want to include responses from Boundary NTP - // and CockroachDb zones here too - but neither of those are - // currently part of the example system, so their synthetic - // responses to inventory collection aren't necessary yet. + // Synthesize time-sync responses from NTP zones. + if zone.zone_type.is_ntp() { + builder + .found_ntp_timesync( + nexus_types::inventory::TimeSync { + zone_id: zone.id, + synced: true, + }, + ) + .unwrap(); + } } } } + // Synthesize CockroachDb status: every running CockroachDb zone + // reports healthy metrics. We assign sequential node IDs and report + // the total cluster size as liveness_live_nodes. + let cockroach_zone_count = + builder.ledgered_zones_of_kind(ZoneKind::CockroachDb).count(); + let live_nodes = cockroach_zone_count as u64; + for i in 0..cockroach_zone_count { + builder.found_cockroach_status( + cockroach_admin_types::node::InternalNodeId::new( + (i + 1).to_string(), + ), + nexus_types::inventory::CockroachStatus { + ranges_underreplicated: Some(0), + liveness_live_nodes: Some(live_nodes), + }, + ); + } + for membership in &self.clickhouse_keeper_cluster_membership { builder .found_clickhouse_keeper_cluster_membership(membership.clone()); diff --git a/nexus/reconfigurator/planning/tests/integration_tests/planner.rs b/nexus/reconfigurator/planning/tests/integration_tests/planner.rs index 2298fe7ba60..c78dc0080cf 100644 --- a/nexus/reconfigurator/planning/tests/integration_tests/planner.rs +++ b/nexus/reconfigurator/planning/tests/integration_tests/planner.rs @@ -3,12 +3,17 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. use assert_matches::assert_matches; +use camino::Utf8PathBuf; use chrono::DateTime; use chrono::Utc; use clickhouse_admin_types::keeper::ClickhouseKeeperClusterMembership; use clickhouse_admin_types::keeper::KeeperId; use expectorate::assert_contents; use iddqd::IdOrdMap; +use ls_apis_shared::DEPLOYMENT_UNIT_DAG_PATH; +use ls_apis_shared::DagEdgesFile; +use ls_apis_shared::DeploymentUnitId; +use ls_apis_shared::OMICRON_LS_APIS_PATH; use nexus_reconfigurator_planning::blueprint_editor::ExternalNetworkingAllocator; use nexus_reconfigurator_simulation::BlueprintId; use nexus_reconfigurator_simulation::CollectionId; @@ -65,6 +70,7 @@ use omicron_common::policy::COCKROACHDB_REDUNDANCY; use omicron_common::policy::CRUCIBLE_PANTRY_REDUNDANCY; use omicron_common::policy::INTERNAL_DNS_REDUNDANCY; use omicron_common::policy::NEXUS_REDUNDANCY; +use omicron_common::policy::OXIMETER_REDUNDANCY; use omicron_common::update::ArtifactId; use omicron_test_utils::dev::test_setup_log; use omicron_uuid_kinds::ExternalIpUuid; @@ -83,10 +89,12 @@ use slog_error_chain::InlineErrorChain; use std::collections::BTreeMap; use std::collections::BTreeSet; use std::collections::HashMap; +use std::env; use std::net::IpAddr; use std::net::Ipv4Addr; use std::net::Ipv6Addr; use std::sync::Arc; +use strum::IntoEnumIterator; use tufaceous_artifact::ArtifactHash; use tufaceous_artifact::ArtifactKind; use tufaceous_artifact::ArtifactVersion; @@ -4963,3 +4971,336 @@ fn test_multiple_measurements() { panic!("did not converge after {MAX_PLANNING_ITERATIONS} iterations"); } + +/// Maps a `ZoneKind` to its deployment unit ID (matching the `id` fields in +/// `api-manifest.toml`). This is an exhaustive match so that adding a new +/// zone kind forces an update here. +fn zone_kind_to_deployment_unit(kind: ZoneKind) -> DeploymentUnitId { + let s = match kind { + ZoneKind::BoundaryNtp | ZoneKind::InternalNtp => "ntp", + ZoneKind::Clickhouse + | ZoneKind::ClickhouseKeeper + | ZoneKind::ClickhouseServer => "clickhouse", + ZoneKind::CockroachDb => "cockroach", + ZoneKind::Crucible => "crucible", + ZoneKind::CruciblePantry => "crucible_pantry", + ZoneKind::ExternalDns | ZoneKind::InternalDns => "dns_server", + ZoneKind::Nexus => "nexus", + ZoneKind::Oximeter => "oximeter", + }; + DeploymentUnitId::from(s.to_owned()) +} + +/// Verify that the planner's zone update ordering is a topological sort of the +/// deployment unit dependency DAG. +#[test] +fn test_zone_update_ordering_respects_dependency_dag() { + static TEST_NAME: &str = "zone_update_ordering_respects_dependency_dag"; + let logctx = test_setup_log(TEST_NAME); + + let workspace_root = Utf8PathBuf::from( + env::var("NEXTEST_WORKSPACE_ROOT") + .expect("nextest sets NEXTEST_WORKSPACE_ROOT"), + ); + let dag_path = workspace_root + .join(OMICRON_LS_APIS_PATH) + .join(DEPLOYMENT_UNIT_DAG_PATH); + let dag_contents = std::fs::read_to_string(&dag_path) + .unwrap_or_else(|e| panic!("read {dag_path}: {e}")); + let dag: DagEdgesFile = + toml::from_str(&dag_contents).expect("parsed deployment_unit_dag.toml"); + + // Collect all deployment unit IDs referenced in the DAG. + let dag_unit_ids: BTreeSet<_> = + dag.edges.iter().flat_map(|e| [&e.consumer, &e.producer]).collect(); + + // Deployment units that don't correspond to zones and therefore can't + // be tracked in this test. These are expected to be absent from + // `progress`. + // TODO: track the host OS deployment unit by simulating MGS updates. + let non_zone_units: BTreeSet = + ["host_os", "installinator"] + .into_iter() + .map(|s| DeploymentUnitId::from(s.to_owned())) + .collect(); + for unit in &non_zone_units { + assert!( + dag_unit_ids.contains(unit), + "non_zone_units entry {unit:?} does not appear in the \ + deployment unit DAG ({dag_path}) — is the ID still correct?" + ); + } + + // Set up the example system with all zone types represented, + // including multi-node clickhouse (servers + keepers). + let mut sim = ReconfiguratorCliTestState::new(TEST_NAME, &logctx.log); + sim.load_example_customized(|builder| { + // TODO: this configuration is copied from builder_all_zone_types. + // Should this live in a more central location? + builder + .nsleds(5) + .oximeter_count(OXIMETER_REDUNDANCY) + .cockroachdb_count(COCKROACHDB_REDUNDANCY) + .boundary_ntp_count(2) + .external_dns_count(1)? + .clickhouse_policy(ClickhousePolicy { + version: 0, + mode: ClickhouseMode::Both { + target_servers: 2, + target_keepers: 3, + }, + time_created: Utc::now(), + }) + .with_target_release_0_0_1() + }) + .expect("loaded example system"); + let blueprint1 = sim.assert_latest_blueprint_is_blippy_clean(); + + // Set a new target release with fake images for all zones. + // + // TODO: The SP/host artifacts intentionally match the initial state so the + // planner doesn't issue MGS updates — simulating MGS update completion + // requires some additional infrastructure. + let version = ArtifactVersion::new_static("2.0.0-freeform") + .expect("can't parse artifact version"); + let fake_hash = ArtifactHash([0; 32]); + let target_image_source = BlueprintZoneImageSource::Artifact { + version: BlueprintArtifactVersion::Available { + version: version.clone(), + }, + hash: fake_hash, + }; + let description = TargetReleaseDescription::TufRepo(TufRepoDescription { + repo: TufRepoMeta { + hash: fake_hash, + targets_role_version: 0, + valid_until: Utc::now(), + system_version: Version::new(1, 0, 0), + file_name: String::from(""), + }, + artifacts: create_zone_artifacts_at_version(&version), + }); + + sim.change_description("set new target release", |desc| { + desc.set_target_release(description); + Ok(()) + }) + .unwrap(); + + // For each deployment unit, track the first iteration where the planner + // touched any zone (update or expunge), and the first iteration where all + // in-service zones reached the target image. + #[derive(Debug)] + struct UnitProgress { + first_activity: usize, + all_at_target: Option, + } + + let mut progress = BTreeMap::new(); + + const MAX_PLANNING_ITERATIONS: usize = 100; + + let mut parent = blueprint1; + for i in 1..=MAX_PLANNING_ITERATIONS { + sim_update_collection_from_blueprint(&mut sim, &parent); + + let blueprint = sim.run_planner().expect("planning succeeded"); + + let BlueprintSource::Planner(report) = &blueprint.source else { + panic!("unexpected source: {:?}", blueprint.source); + }; + + // TODO: track the host OS deployment unit by simulating MGS updates. + + // Record the first iteration with zone activity per deployment unit. + let zone_updates = &report.zone_updates; + let active_zones = zone_updates + .updated_zones + .values() + .chain(zone_updates.expunged_zones.values()) + .flatten(); + for z in active_zones { + let unit = zone_kind_to_deployment_unit(z.kind); + progress.entry(unit).or_insert(UnitProgress { + first_activity: i, + all_at_target: None, + }); + } + + // Check which deployment units have all in-service zones at the + // target image in this blueprint. + let mut zones_by_unit: BTreeMap< + DeploymentUnitId, + Vec<&BlueprintZoneConfig>, + > = BTreeMap::new(); + for (_, zone) in blueprint.in_service_zones() { + let unit = zone_kind_to_deployment_unit(zone.zone_type.kind()); + zones_by_unit.entry(unit).or_default().push(zone); + } + for (unit, zones) in &zones_by_unit { + if let Some(p) = progress.get_mut(unit) { + if p.all_at_target.is_none() + && zones + .iter() + .all(|z| z.image_source == target_image_source) + { + p.all_at_target = Some(i); + } + } + } + + // Check for convergence. (This is in a scope so that the borrow can be + // dropped before moving blueprint into parent.) + { + let summary = blueprint.diff_since_blueprint(&parent); + // TODO: should this check live on BlueprintDiffSummary? + if summary.total_zones_added() == 0 + && summary.total_zones_removed() == 0 + && summary.total_zones_modified() == 0 + && summary.before.nexus_generation + == summary.after.nexus_generation + && summary.total_measurements_added() == 0 + && summary.total_measurements_removed() == 0 + { + let not_at_target: Vec<_> = blueprint + .in_service_zones() + .filter(|(_, zone)| { + zone.image_source != target_image_source + }) + .map(|(sled, zone)| { + format!( + " sled {sled}: zone {} ({:?}), \ + image_source: {}", + zone.id, + zone.zone_type.kind(), + zone.image_source, + ) + }) + .collect(); + if !not_at_target.is_empty() { + panic!( + "diff summary claims convergence, \ + but {} zones are not at target:\n{}", + not_at_target.len(), + not_at_target.join("\n"), + ); + } + println!("planning converged after {i} iterations"); + break; + } + } + + parent = blueprint; + assert!( + i < MAX_PLANNING_ITERATIONS, + "did not converge after {MAX_PLANNING_ITERATIONS} iterations" + ); + } + + // Verify that every zone-based deployment unit was actually exercised. This + // catches cases where the example system doesn't include a zone type that + // the DAG covers. + let zone_based_units: BTreeSet<_> = + ZoneKind::iter().map(zone_kind_to_deployment_unit).collect(); + for unit in &zone_based_units { + let p = progress.get(unit).unwrap_or_else(|| { + panic!( + "deployment unit {unit:?} is zone-based but was never \ + updated — is it missing from the example system?" + ) + }); + // Verify that every zone-based unit ID actually appears in the + // DAG. This catches typos in zone_kind_to_deployment_unit's + // string literals. + assert!( + dag_unit_ids.contains(unit), + "zone_kind_to_deployment_unit produced {unit:?}, which does \ + not appear in the deployment unit DAG ({dag_path}) — is the \ + ID correct?" + ); + // Verify that every zone-based unit reached the target image. + // Without this, a unit that was touched but never fully converged + // could go undetected if it only appears as a consumer (never a + // producer) in checked DAG edges. + assert!( + p.all_at_target.is_some(), + "deployment unit {unit:?} was updated (first activity at \ + iteration {}) but never had all zones reach the target image", + p.first_activity, + ); + } + + // TODO: simulate MGS updates and verify that host OS updates happen at the + // right time. + + println!("\ndeployment unit update activity:"); + for (unit, p) in &progress { + println!( + " {unit}: first_activity = {}, all_at_target = {:?}", + p.first_activity, p.all_at_target, + ); + } + + let mut violations = Vec::new(); + for edge in &dag.edges { + // Skip edges involving non-zone deployment units that this test can't + // track yet. Fail if a unit is missing for any other reason. + let consumer = match progress.get(&edge.consumer) { + Some(p) => p, + None => { + assert!( + non_zone_units.contains(&edge.consumer), + "DAG edge consumer {:?} is not in the progress map \ + and is not a known non-zone unit — is this a new \ + deployment unit that needs tracking?", + edge.consumer, + ); + continue; + } + }; + let producer = match progress.get(&edge.producer) { + Some(p) => p, + None => { + assert!( + non_zone_units.contains(&edge.producer), + "DAG edge producer {:?} is not in the progress map \ + and is not a known non-zone unit — is this a new \ + deployment unit that needs tracking?", + edge.producer, + ); + continue; + } + }; + let producer_done = match producer.all_at_target { + Some(v) => v, + None => panic!( + "producer {:?} has zones but never reached target", + edge.producer + ), + }; + + if consumer.first_activity < producer_done { + violations.push(format!( + "consumer {:?} started updating at iteration {}, \ + but producer {:?} was not fully updated until \ + iteration {producer_done}", + edge.consumer, consumer.first_activity, edge.producer, + )); + } + } + + if !violations.is_empty() { + println!("\nDAG ordering violations:"); + for v in &violations { + println!(" - {v}"); + } + panic!( + "zone update ordering violated the dependency DAG \ + ({} violations)", + violations.len() + ); + } + + println!("\nall DAG ordering constraints satisfied"); + logctx.cleanup_successful(); +}