From 79a1f00aa67c28de98837e34f13e98374236f9e1 Mon Sep 17 00:00:00 2001 From: David Frank Date: Wed, 10 Dec 2025 16:22:52 +0100 Subject: [PATCH 1/3] WIP --- Cargo.lock | 2 + .../ic_boundary/src/test_utils.rs | 1 + rs/crypto/temp_crypto/src/lib.rs | 1 + rs/nns/governance/api/src/types.rs | 2 +- .../integration_tests/src/subnet_handler.rs | 2 + rs/orchestrator/Cargo.toml | 2 + .../registry_replicator/src/internal_state.rs | 1 + .../src/catch_up_package_provider.rs | 4 +- rs/orchestrator/src/upgrade.rs | 347 +++++++++++++++++- rs/prep/src/subnet_configuration.rs | 1 + .../def/registry/subnet/v1/subnet.proto | 9 + .../src/gen/registry/registry.subnet.v1.rs | 9 + .../src/gen/state/registry.subnet.v1.rs | 9 + .../src/gen/types/registry.subnet.v1.rs | 9 + rs/registry/admin/bin/main.rs | 29 +- rs/registry/canister/canister/registry.did | 5 + .../canister/canister/registry_test.did | 5 + .../src/mutations/do_create_subnet.rs | 2 + .../do_set_subnet_operational_level.rs | 38 ++ .../do_set_subnet_operational_level/tests.rs | 90 +++++ .../src/mutations/do_update_subnet.rs | 4 + rs/registry/canister/tests/update_subnet.rs | 4 + rs/test_utilities/registry/src/lib.rs | 1 + rs/tests/driver/src/nns.rs | 1 + 24 files changed, 572 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 53ba5633b1f1..6d63b75386fe 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -19137,6 +19137,7 @@ dependencies = [ "hyper-rustls 0.27.7", "hyper-util", "ic-agent 0.40.1", + "ic-certification-test-utils", "ic-config", "ic-consensus-cup-utils", "ic-consensus-dkg", @@ -19150,6 +19151,7 @@ dependencies = [ "ic-crypto-test-utils-tls", "ic-crypto-tls-interfaces", "ic-crypto-tls-interfaces-mocks", + "ic-crypto-tree-hash", "ic-dashboard", "ic-ed25519 0.5.0", "ic-http-endpoints-async-utils", diff --git a/rs/boundary_node/ic_boundary/src/test_utils.rs b/rs/boundary_node/ic_boundary/src/test_utils.rs index b4849917e718..67f413e6ce9d 100644 --- a/rs/boundary_node/ic_boundary/src/test_utils.rs +++ b/rs/boundary_node/ic_boundary/src/test_utils.rs @@ -122,6 +122,7 @@ pub fn test_subnet_record() -> SubnetRecord { ssh_backup_access: vec![], chain_key_config: None, canister_cycles_cost_schedule: CanisterCyclesCostSchedule::Normal as i32, + recalled_replica_version_ids: vec![], } } diff --git a/rs/crypto/temp_crypto/src/lib.rs b/rs/crypto/temp_crypto/src/lib.rs index 55473d6f71f1..912b6c16e6eb 100644 --- a/rs/crypto/temp_crypto/src/lib.rs +++ b/rs/crypto/temp_crypto/src/lib.rs @@ -1117,6 +1117,7 @@ impl EcdsaSubnetConfig { max_parallel_pre_signature_transcripts_in_creation: None, }), canister_cycles_cost_schedule: CanisterCyclesCostSchedule::Normal as i32, + recalled_replica_version_ids: vec![], }, } } diff --git a/rs/nns/governance/api/src/types.rs b/rs/nns/governance/api/src/types.rs index 9ee896be5fcb..ea4e69cfe3b7 100644 --- a/rs/nns/governance/api/src/types.rs +++ b/rs/nns/governance/api/src/types.rs @@ -2979,7 +2979,7 @@ pub struct ListProposalInfoResponse { /// Paging is available if the result set is larger than `MAX_LIST_NEURONS_RESULTS`, /// which is currently 500 neurons. If you are unsure of the number of results in a set, /// you can use the `total_pages_available` field in the response to determine how many -/// additional pages need to be queried. It will be based on your `page_size` parameter. +/// additional pages need to be queried. It will be based on your `page_size` parameter. 
/// When paging through results, it is good to keep in mind that newly inserted neurons /// could be missed if they are inserted between calls to pages, and this could result in missing /// a neuron in the combined responses. diff --git a/rs/nns/integration_tests/src/subnet_handler.rs b/rs/nns/integration_tests/src/subnet_handler.rs index 82285706b56f..698d045ead76 100644 --- a/rs/nns/integration_tests/src/subnet_handler.rs +++ b/rs/nns/integration_tests/src/subnet_handler.rs @@ -52,6 +52,7 @@ fn test_submit_and_accept_update_subnet_proposal() { ssh_backup_access: vec![], chain_key_config: None, canister_cycles_cost_schedule: CanisterCyclesCostSchedule::Normal as i32, + recalled_replica_version_ids: vec![], }; let key = make_subnet_record_key(subnet_id); @@ -170,6 +171,7 @@ fn test_submit_and_accept_update_subnet_proposal() { ssh_backup_access: vec!["pub_key_1".to_string()], chain_key_config: None, canister_cycles_cost_schedule: CanisterCyclesCostSchedule::Normal as i32, + recalled_replica_version_ids: vec![], } ); Ok(()) diff --git a/rs/orchestrator/Cargo.toml b/rs/orchestrator/Cargo.toml index f5bfafcfcff9..2949d71140c1 100644 --- a/rs/orchestrator/Cargo.toml +++ b/rs/orchestrator/Cargo.toml @@ -71,6 +71,7 @@ qrcode = { workspace = true } [dev-dependencies] async-stream = { workspace = true } hyper-util = { workspace = true } +ic-certification-test-utils = { path = "../certification/test-utils" } ic-crypto-temp-crypto = { path = "../crypto/temp_crypto" } ic-crypto-test-utils-canister-threshold-sigs = { path = "../crypto/test_utils/canister_threshold_sigs" } ic-crypto-test-utils-crypto-returning-ok = { path = "../crypto/test_utils/crypto_returning_ok" } @@ -78,6 +79,7 @@ ic-crypto-test-utils-ni-dkg = { path = "../crypto/test_utils/ni-dkg" } ic-crypto-test-utils-reproducible-rng = { path = "../crypto/test_utils/reproducible_rng" } ic-crypto-test-utils-tls = { path = "../crypto/test_utils/tls" } ic-crypto-tls-interfaces-mocks = { path = "../crypto/tls_interfaces/mocks" } +ic-crypto-tree-hash = { path = "../crypto/tree_hash" } ic-registry-client-fake = { path = "../registry/fake" } ic-registry-subnet-type = { path = "../registry/subnet_type" } ic-registry-proto-data-provider = { path = "../registry/proto_data_provider" } diff --git a/rs/orchestrator/registry_replicator/src/internal_state.rs b/rs/orchestrator/registry_replicator/src/internal_state.rs index 526814557e4a..9981e6985f02 100644 --- a/rs/orchestrator/registry_replicator/src/internal_state.rs +++ b/rs/orchestrator/registry_replicator/src/internal_state.rs @@ -577,6 +577,7 @@ mod test { ssh_backup_access: vec![], chain_key_config: None, canister_cycles_cost_schedule: 0, + recalled_replica_version_ids: vec![], } } diff --git a/rs/orchestrator/src/catch_up_package_provider.rs b/rs/orchestrator/src/catch_up_package_provider.rs index fe0a2b3a83bc..5cd6fc3b40c1 100644 --- a/rs/orchestrator/src/catch_up_package_provider.rs +++ b/rs/orchestrator/src/catch_up_package_provider.rs @@ -501,7 +501,7 @@ fn get_cup_proto_height(cup: &pb::CatchUpPackage) -> Option { } #[cfg(test)] -mod tests { +pub(crate) mod tests { use super::*; use crate::{ catch_up_package_provider::CatchUpPackageProvider, registry_helper::RegistryHelper, @@ -690,7 +690,7 @@ mod tests { )) } - fn mock_tls_config() -> MockTlsConfig { + pub(crate) fn mock_tls_config() -> MockTlsConfig { #[derive(Debug)] struct NoVerify; impl ServerCertVerifier for NoVerify { diff --git a/rs/orchestrator/src/upgrade.rs b/rs/orchestrator/src/upgrade.rs index 770726414820..30b74f868e9f 100644 --- 
a/rs/orchestrator/src/upgrade.rs
+++ b/rs/orchestrator/src/upgrade.rs
@@ -160,7 +160,7 @@ impl Upgrade {
     /// Checks for a new release package, and if found, upgrades to this release
     /// package
     pub(crate) async fn check(&mut self) -> OrchestratorResult<OrchestratorControlFlow> {
-        let latest_registry_version = self.registry.get_latest_version();
+        let latest_registry_version = self.replicate_latest_registry_version().await?;
         // Determine the subnet_id using the local CUP.
         let (subnet_id, local_cup_proto, local_cup) = {
             let maybe_proto = self.cup_provider.get_local_cup_proto();
@@ -311,7 +311,22 @@ impl Upgrade {
         let new_replica_version = self
             .registry
             .get_replica_version(subnet_id, cup_registry_version)?;
-        if new_replica_version != self.replica_version {
+        let subnet_record = self
+            .registry
+            .get_subnet_record(subnet_id, latest_registry_version)?;
+        let version_is_recalled = subnet_record
+            .recalled_replica_version_ids
+            .contains(&new_replica_version.to_string());
+
+        if version_is_recalled {
+            warn!(
+                self.logger,
+                "New replica version {} is recalled for subnet {} at latest registry version {}",
+                new_replica_version,
+                subnet_id,
+                latest_registry_version
+            );
+        } else if new_replica_version != self.replica_version {
             info!(
                 self.logger,
                 "Starting version upgrade at CUP registry version {}: {} -> {}",
@@ -344,6 +359,22 @@ impl Upgrade {
         Ok(flow)
     }
 
+    async fn replicate_latest_registry_version(
+        &mut self,
+    ) -> Result<RegistryVersion, OrchestratorError> {
+        let mut latest_registry_version = self.registry.get_latest_version();
+        loop {
+            self.registry_replicator.poll().await.map_err(|err| {
+                OrchestratorError::UpgradeError(format!("Could not poll registry: {err}"))
+            })?;
+            if latest_registry_version == self.registry.get_latest_version() {
+                break;
+            }
+            latest_registry_version = self.registry.get_latest_version();
+        }
+        Ok(latest_registry_version)
+    }
+
     pub fn node_id(&self) -> NodeId {
         self.node_id
     }
@@ -1555,4 +1586,316 @@ mod tests {
             );
         }
     }
+
+    fn make_cup_with_registry_version(
+        h: Height,
+        registry_version: RegistryVersion,
+    ) -> CatchUpPackage {
+        let block = Block::new(
+            CryptoHashOf::from(CryptoHash(Vec::new())),
+            Payload::new(
+                ic_types::crypto::crypto_hash,
+                BlockPayload::Summary(SummaryPayload {
+                    dkg: DkgSummary::fake(),
+                    idkg: Some(idkg::IDkgPayload::empty(
+                        h,
+                        subnet_test_id(1),
+                        Vec::new(),
+                    )),
+                }),
+            ),
+            h,
+            Rank(46),
+            ValidationContext {
+                registry_version,
+                certified_height: Height::from(42),
+                time: UNIX_EPOCH,
+            },
+        );
+
+        CatchUpPackage {
+            content: CatchUpContent::new(
+                HashedBlock::new(ic_types::crypto::crypto_hash, block),
+                HashedRandomBeacon::new(
+                    ic_types::crypto::crypto_hash,
+                    RandomBeacon::fake(RandomBeaconContent::new(
+                        h,
+                        CryptoHashOf::from(CryptoHash(Vec::new())),
+                    )),
+                ),
+                CryptoHashOf::from(CryptoHash(Vec::new())),
+                None,
+            ),
+            signature: ThresholdSignature::fake(),
+        }
+    }
+
+    #[tokio::test]
+    async fn test_recalled_replica_version_prevents_upgrade() {
+        use candid::Encode;
+        use ic_crypto_test_utils_crypto_returning_ok::CryptoReturningOk;
+        use ic_crypto_utils_threshold_sig_der::parse_threshold_sig_key_from_der;
+        use ic_nervous_system_integration_tests::pocket_ic_helpers::install_canister;
+        use ic_nns_constants::REGISTRY_CANISTER_ID;
+        use ic_nns_test_utils::common::{NnsInitPayloadsBuilder, build_test_registry_wasm};
+        use ic_protobuf::registry::node::v1::NodeRecord;
+        use ic_registry_nns_data_provider::registry::RegistryCanister;
+        use ic_registry_transport::{
+            serialize_atomic_mutate_request, deserialize_atomic_mutate_response,
+            pb::v1::{RegistryMutation,
registry_mutation::Type}, + }; + use pocket_ic::PocketIcBuilder; + use ic_protobuf::registry::replica_version::v1::ReplicaVersionRecord; + use ic_protobuf::types::v1 as pb; + use ic_registry_client_fake::FakeRegistryClient; + use ic_registry_keys::{make_node_record_key, make_replica_version_key, make_subnet_record_key}; + use ic_registry_proto_data_provider::ProtoRegistryDataProvider; + use ic_registry_replicator::RegistryReplicator; + use ic_test_utilities_types::ids::node_test_id; + use ic_types::RegistryVersion; + use prost::Message; + use std::time::Duration; + + with_test_replica_logger(|log| async move { + let current_version = ReplicaVersion::try_from("version_1.0.0").unwrap(); + let recalled_version = ReplicaVersion::try_from("version_2.0.0").unwrap(); + let node_id = node_test_id(1); + let subnet_id = subnet_test_id(0); + + let cup_registry_version = RegistryVersion::from(10); + let latest_registry_version = RegistryVersion::from(20); + + // Create a temporary ProtoRegistryDataProvider to build the registry data + let temp_data_provider = Arc::new(ProtoRegistryDataProvider::new()); + + let node_record = NodeRecord { + xnet: Some(ic_protobuf::registry::node::v1::ConnectionEndpoint { + ip_addr: "127.0.0.1".to_string(), + port: 1234, + }), + http: Some(ic_protobuf::registry::node::v1::ConnectionEndpoint { + ip_addr: "127.0.0.1".to_string(), + port: 8080, + }), + ..Default::default() + }; + + temp_data_provider + .add(&make_node_record_key(node_id), cup_registry_version, Some(node_record.clone())) + .unwrap(); + + temp_data_provider + .add(&make_node_record_key(node_id), latest_registry_version, Some(node_record)) + .unwrap(); + + // Add a minimal node record at version 1 (without HTTP endpoint to avoid connection attempts) + temp_data_provider + .add( + &make_node_record_key(node_id), + RegistryVersion::from(1), + Some(NodeRecord { + ..Default::default() + }), + ) + .unwrap(); + + let subnet_record_at_cup_version = SubnetRecord { + membership: vec![node_id.get().to_vec()], + replica_version_id: recalled_version.to_string(), + recalled_replica_version_ids: vec![], + ..Default::default() + }; + + temp_data_provider + .add( + &make_subnet_record_key(subnet_id), + cup_registry_version, + Some(subnet_record_at_cup_version), + ) + .unwrap(); + + temp_data_provider + .add( + &make_replica_version_key(recalled_version.to_string()), + cup_registry_version, + Some(ReplicaVersionRecord::default()), + ) + .unwrap(); + + let subnet_record_at_latest_version = SubnetRecord { + membership: vec![node_id.get().to_vec()], + replica_version_id: recalled_version.to_string(), + recalled_replica_version_ids: vec![recalled_version.to_string()], + ..Default::default() + }; + + temp_data_provider + .add( + &make_subnet_record_key(subnet_id), + latest_registry_version, + Some(subnet_record_at_latest_version), + ) + .unwrap(); + + temp_data_provider + .add( + &make_replica_version_key(recalled_version.to_string()), + latest_registry_version, + Some(ReplicaVersionRecord::default()), + ) + .unwrap(); + + // Add root subnet ID and NNS public key to the temp data provider + use ic_registry_keys::{make_crypto_threshold_signing_pubkey_key, ROOT_SUBNET_ID_KEY}; + use ic_protobuf::registry::crypto::v1::PublicKey as PbPublicKey; + use ic_certification_test_utils::{CertificateBuilder, CertificateData}; + use ic_crypto_tree_hash::Digest; + use ic_types::CanisterId; + + // Add root subnet ID at version 1 + temp_data_provider + .add( + ROOT_SUBNET_ID_KEY, + RegistryVersion::from(1), + 
Some(ic_types::subnet_id_into_protobuf(subnet_id)), + ) + .unwrap(); + + // Add NNS public key at version 2 + let (_, nns_public_key, _) = CertificateBuilder::new(CertificateData::CanisterData { + canister_id: CanisterId::from_u64(0), + certified_data: Digest([1; 32]), + }) + .build(); + temp_data_provider + .add( + &make_crypto_threshold_signing_pubkey_key(subnet_id), + RegistryVersion::from(2), + Some(PbPublicKey::from(nns_public_key)), + ) + .unwrap(); + + let tmp_dir = tempdir().unwrap(); + let local_store_path = tmp_dir.path().join("ic_registry_local_store"); + std::fs::create_dir_all(&local_store_path).unwrap(); + + use ic_registry_local_store::{KeyMutation, LocalStoreImpl, LocalStoreWriter}; + + let local_store = Arc::new(LocalStoreImpl::new(local_store_path.clone())); + + // Populate the local store with all registry data from temp_data_provider + let changelog = temp_data_provider.get_updates_since(RegistryVersion::from(0)).unwrap(); + let mut version_mutations: std::collections::BTreeMap> = std::collections::BTreeMap::new(); + + for record in changelog { + // Populate local store with all data up to latest_registry_version + if record.version <= latest_registry_version { + version_mutations.entry(record.version).or_insert_with(Vec::new).push(KeyMutation { + key: record.key, + value: record.value, + }); + } + } + + // Populate local store with continuous versions from 1 to latest_registry_version + for version in 1..=latest_registry_version.get() { + let version = RegistryVersion::from(version); + let mutations = version_mutations.remove(&version).unwrap_or_else(|| { + use prost::Message; + let node_record = NodeRecord { + xnet: Some(ic_protobuf::registry::node::v1::ConnectionEndpoint { + ip_addr: "127.0.0.1".to_string(), + port: 1234, + }), + http: Some(ic_protobuf::registry::node::v1::ConnectionEndpoint { + ip_addr: "127.0.0.1".to_string(), + port: 8080, + }), + ..Default::default() + }; + vec![KeyMutation { + key: make_node_record_key(node_id), + value: Some(node_record.encode_to_vec()), + }] + }); + local_store.store(version, mutations).unwrap(); + } + + // Create FakeRegistryClient from the local store (which now has all the data) + let registry_client = Arc::new(FakeRegistryClient::new(local_store.clone() as Arc<_>)); + registry_client.update_to_latest_version(); + + let registry = Arc::new(RegistryHelper::new( + node_id, + registry_client.clone() as Arc<_>, + log.clone(), + )); + + let cup_dir = tmp_dir.path().join("cups"); + std::fs::create_dir_all(&cup_dir).unwrap(); + + let cup = make_cup_with_registry_version(Height::from(10), cup_registry_version); + let cup_proto = pb::CatchUpPackage::from(&cup); + let cup_file = cup_dir.join("cup.types.v1.CatchUpPackage.pb"); + std::fs::write(&cup_file, cup_proto.encode_to_vec()).unwrap(); + + let cup_provider = Arc::new(CatchUpPackageProvider::new( + registry.clone(), + cup_dir.clone(), + Arc::new(CryptoReturningOk::default()), + Arc::new(crate::catch_up_package_provider::tests::mock_tls_config()), + log.clone(), + node_id, + )); + + let registry_replicator = Arc::new( + RegistryReplicator::new( + log.clone(), + local_store_path, + Duration::from_secs(10), + vec![], + None, + ) + .await, + ); + + let metrics = Arc::new(OrchestratorMetrics::new(&MetricsRegistry::new())); + let replica_process = Arc::new(Mutex::new(ProcessManager::new(log.clone()))); + let orchestrator_data_dir = tmp_dir.path().join("orchestrator"); + std::fs::create_dir_all(&orchestrator_data_dir).unwrap(); + + let mut upgrade = Upgrade::new( + registry.clone(), + 
                metrics,
+                replica_process,
+                cup_provider,
+                current_version,
+                tmp_dir.path().join("replica_config.json"),
+                node_id,
+                tmp_dir.path().join("ic_binary"),
+                registry_replicator,
+                tmp_dir.path().join("release"),
+                log.clone(),
+                orchestrator_data_dir,
+                None,
+            )
+            .await;
+
+            let result = upgrade.check().await;
+
+            match &result {
+                Ok(OrchestratorControlFlow::Assigned(id)) => println!("Result: Ok(Assigned({}))", id),
+                Ok(OrchestratorControlFlow::Unassigned) => println!("Result: Ok(Unassigned)"),
+                Ok(OrchestratorControlFlow::Leaving(id)) => println!("Result: Ok(Leaving({}))", id),
+                Ok(OrchestratorControlFlow::Stop) => println!("Result: Ok(Stop)"),
+                Err(e) => println!("Result: Err({:?})", e),
+            }
+
+            assert!(
+                matches!(result, Ok(OrchestratorControlFlow::Assigned(_))),
+                "Expected the Assigned flow; the upgrade must not proceed when the new version is recalled"
+            );
+        })
+        .await;
+    }
 }
diff --git a/rs/prep/src/subnet_configuration.rs b/rs/prep/src/subnet_configuration.rs
index 00dc86a29871..3f0627789648 100644
--- a/rs/prep/src/subnet_configuration.rs
+++ b/rs/prep/src/subnet_configuration.rs
@@ -317,6 +317,7 @@ impl SubnetConfig {
             ssh_backup_access: self.ssh_backup_access,
             chain_key_config: self.chain_key_config,
             canister_cycles_cost_schedule: CanisterCyclesCostSchedule::Normal as i32,
+            recalled_replica_version_ids: vec![],
         };
 
         let dkg_dealing_encryption_pubkeys: BTreeMap<_, _> = initialized_nodes
diff --git a/rs/protobuf/def/registry/subnet/v1/subnet.proto b/rs/protobuf/def/registry/subnet/v1/subnet.proto
index e6ece52f6f9d..05f1fb07c0fa 100644
--- a/rs/protobuf/def/registry/subnet/v1/subnet.proto
+++ b/rs/protobuf/def/registry/subnet/v1/subnet.proto
@@ -78,6 +78,15 @@ message SubnetRecord {
   // means to behave according to the `subnet_type` field.
   CanisterCyclesCostSchedule canister_cycles_cost_schedule = 30;
 
+  // List of replica version IDs that are recalled/blocked for this subnet.
+  // Nodes in this subnet will not upgrade to any version in this list.
+  // If the replica_version_id of a subnet points to a broken GuestOS and the subnet is stalled,
+  // even if we manage to roll back the GuestOS locally, the GuestOS would automatically try
+  // to upgrade to the broken GuestOS again. We can use this field to prevent that.
+  // Note that while nodes determine the target replica version from the registry version referenced by the CUP,
+  // they read recalled_replica_version_ids from the latest locally available registry version.
+  repeated string recalled_replica_version_ids = 31;
+
   reserved 1, 2, 4, 6, 13, 20, 21, 22, 27;
   reserved "ic_version_id";
   reserved "initial_dkg_transcript";
diff --git a/rs/protobuf/src/gen/registry/registry.subnet.v1.rs b/rs/protobuf/src/gen/registry/registry.subnet.v1.rs
index 8aa2537eb55f..6aab5039a960 100644
--- a/rs/protobuf/src/gen/registry/registry.subnet.v1.rs
+++ b/rs/protobuf/src/gen/registry/registry.subnet.v1.rs
@@ -74,6 +74,15 @@ pub struct SubnetRecord {
     /// means to behave according to the `subnet_type` field.
     #[prost(enumeration = "CanisterCyclesCostSchedule", tag = "30")]
     pub canister_cycles_cost_schedule: i32,
+    /// List of replica version IDs that are recalled/blocked for this subnet.
+    /// Nodes in this subnet will not upgrade to any version in this list.
+    /// If the replica_version_id of a subnet points to a broken GuestOS and the subnet is stalled,
+    /// even if we manage to roll back the GuestOS locally, the GuestOS would automatically try
+    /// to upgrade to the broken GuestOS again. We can use this field to prevent that.
+    /// Note that while nodes determine the target replica version from the registry version referenced by the CUP,
+    /// they read recalled_replica_version_ids from the latest locally available registry version.
+    #[prost(string, repeated, tag = "31")]
+    pub recalled_replica_version_ids: ::prost::alloc::vec::Vec<::prost::alloc::string::String>,
 }
 #[derive(serde::Serialize, serde::Deserialize, Clone, PartialEq, ::prost::Message)]
 pub struct EcdsaInitialization {
diff --git a/rs/protobuf/src/gen/state/registry.subnet.v1.rs b/rs/protobuf/src/gen/state/registry.subnet.v1.rs
index acedbe31e77c..2ecaac2d2ec4 100644
--- a/rs/protobuf/src/gen/state/registry.subnet.v1.rs
+++ b/rs/protobuf/src/gen/state/registry.subnet.v1.rs
@@ -74,6 +74,15 @@ pub struct SubnetRecord {
     /// means to behave according to the `subnet_type` field.
     #[prost(enumeration = "CanisterCyclesCostSchedule", tag = "30")]
     pub canister_cycles_cost_schedule: i32,
+    /// List of replica version IDs that are recalled/blocked for this subnet.
+    /// Nodes in this subnet will not upgrade to any version in this list.
+    /// If the replica_version_id of a subnet points to a broken GuestOS and the subnet is stalled,
+    /// even if we manage to roll back the GuestOS locally, the GuestOS would automatically try
+    /// to upgrade to the broken GuestOS again. We can use this field to prevent that.
+    /// Note that while nodes determine the target replica version from the registry version referenced by the CUP,
+    /// they read recalled_replica_version_ids from the latest locally available registry version.
+    #[prost(string, repeated, tag = "31")]
+    pub recalled_replica_version_ids: ::prost::alloc::vec::Vec<::prost::alloc::string::String>,
 }
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct EcdsaInitialization {
diff --git a/rs/protobuf/src/gen/types/registry.subnet.v1.rs b/rs/protobuf/src/gen/types/registry.subnet.v1.rs
index acedbe31e77c..2ecaac2d2ec4 100644
--- a/rs/protobuf/src/gen/types/registry.subnet.v1.rs
+++ b/rs/protobuf/src/gen/types/registry.subnet.v1.rs
@@ -74,6 +74,15 @@ pub struct SubnetRecord {
     /// means to behave according to the `subnet_type` field.
     #[prost(enumeration = "CanisterCyclesCostSchedule", tag = "30")]
     pub canister_cycles_cost_schedule: i32,
+    /// List of replica version IDs that are recalled/blocked for this subnet.
+    /// Nodes in this subnet will not upgrade to any version in this list.
+    /// If the replica_version_id of a subnet points to a broken GuestOS and the subnet is stalled,
+    /// even if we manage to roll back the GuestOS locally, the GuestOS would automatically try
+    /// to upgrade to the broken GuestOS again. We can use this field to prevent that.
+    /// Note that while nodes determine the target replica version from the registry version referenced by the CUP,
+    /// they read recalled_replica_version_ids from the latest locally available registry version.
+    #[prost(string, repeated, tag = "31")]
+    pub recalled_replica_version_ids: ::prost::alloc::vec::Vec<::prost::alloc::string::String>,
 }
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct EcdsaInitialization {
diff --git a/rs/registry/admin/bin/main.rs b/rs/registry/admin/bin/main.rs
index be09562cb02c..2f8da7796802 100644
--- a/rs/registry/admin/bin/main.rs
+++ b/rs/registry/admin/bin/main.rs
@@ -846,6 +846,16 @@ struct ProposeToTakeSubnetOfflineForRepairsCmd {
     /// so most likely, this won't be an issue.
     #[clap(long, value_parser, num_args(1..))]
     pub ssh_node_state_write_access: Vec,
+
+    /// List of replica version IDs to recall. These versions will be marked as
+    /// recalled for this subnet, preventing them from being used.
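+    ///
+    /// Cannot be combined with `--recall-current-replica-version`.
+    ///
+    /// Hypothetical invocation (placeholder values; the exact subcommand name and
+    /// flag spelling depend on how this command is wired into ic-admin):
+    /// `ic-admin propose-to-take-subnet-offline-for-repairs --subnet <SUBNET> --recalled-replica-version-ids <VERSION_A> <VERSION_B>`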
+ #[clap(long, num_args(1..))] + pub recalled_replica_version_ids: Option>, + + /// If set, recall the current replica version of the subnet. This prevents + /// the subnet from being upgraded to this version again. + #[clap(long)] + pub recall_current_replica_version: bool, } impl ProposalTitle for ProposeToTakeSubnetOfflineForRepairsCmd { @@ -858,7 +868,22 @@ impl ProposalTitle for ProposeToTakeSubnetOfflineForRepairsCmd { #[async_trait] impl ProposalPayload for ProposeToTakeSubnetOfflineForRepairsCmd { - async fn payload(&self, _agent: &Agent) -> SetSubnetOperationalLevelPayload { + async fn payload(&self, agent: &Agent) -> SetSubnetOperationalLevelPayload { + if self.recalled_replica_version_ids.is_some() && self.recall_current_replica_version { + panic!( + "Cannot specify both --recalled-replica-version-ids and --recall-current-replica-version" + ); + } + + let recalled_replica_version_ids = if self.recall_current_replica_version { + let registry_canister = RegistryCanister::new_with_agent(agent.clone()); + let subnet_id = SubnetId::from(self.subnet); + let subnet_record = get_subnet_record(®istry_canister, subnet_id).await; + Some(vec![subnet_record.replica_version_id]) + } else { + self.recalled_replica_version_ids.clone() + }; + let ssh_node_state_write_access = self .ssh_node_state_write_access .clone() @@ -871,6 +896,7 @@ impl ProposalPayload for ProposeToTakeSubnetOf operational_level: Some(operational_level::DOWN_FOR_REPAIRS), ssh_readonly_access: Some(self.ssh_readonly_access.clone()), ssh_node_state_write_access: Some(ssh_node_state_write_access), + recalled_replica_version_ids, } } } @@ -975,6 +1001,7 @@ impl ProposalPayload operational_level: Some(operational_level::NORMAL), ssh_readonly_access: Some(vec![]), ssh_node_state_write_access: Some(ssh_node_state_write_access), + recalled_replica_version_ids: Some(vec![]), } } } diff --git a/rs/registry/canister/canister/registry.did b/rs/registry/canister/canister/registry.did index 2083b77fcfdd..d12509b1249f 100644 --- a/rs/registry/canister/canister/registry.did +++ b/rs/registry/canister/canister/registry.did @@ -489,6 +489,11 @@ type SetSubnetOperationalLevelPayload = record { // Similar to ssh_read_only_access, except that this targets one node at a // time, not all the nodes in the subnet. ssh_node_state_write_access : opt vec NodeSshAccess; + + // Replica version IDs to recall for this subnet. If specified, these versions + // will be added to the subnet's recalled_replica_version_ids list, preventing + // the subnet from upgrading to these versions. + recalled_replica_version_ids : opt vec text; }; type NodeSshAccess = record { diff --git a/rs/registry/canister/canister/registry_test.did b/rs/registry/canister/canister/registry_test.did index a88836069ba4..966ae94bfe12 100644 --- a/rs/registry/canister/canister/registry_test.did +++ b/rs/registry/canister/canister/registry_test.did @@ -489,6 +489,11 @@ type SetSubnetOperationalLevelPayload = record { // Similar to ssh_read_only_access, except that this targets one node at a // time, not all the nodes in the subnet. ssh_node_state_write_access : opt vec NodeSshAccess; + + // Replica version IDs to recall for this subnet. If specified, these versions + // will be added to the subnet's recalled_replica_version_ids list, preventing + // the subnet from upgrading to these versions. 
+ recalled_replica_version_ids : opt vec text; }; type NodeSshAccess = record { diff --git a/rs/registry/canister/src/mutations/do_create_subnet.rs b/rs/registry/canister/src/mutations/do_create_subnet.rs index 6a2c3aafed72..582a60f322d4 100644 --- a/rs/registry/canister/src/mutations/do_create_subnet.rs +++ b/rs/registry/canister/src/mutations/do_create_subnet.rs @@ -545,6 +545,8 @@ impl From for SubnetRecord { .map(CanisterCyclesCostSchedulePb::from) .unwrap_or(CanisterCyclesCostSchedulePb::Normal) as i32, + + recalled_replica_version_ids: vec![], } } } diff --git a/rs/registry/canister/src/mutations/do_set_subnet_operational_level.rs b/rs/registry/canister/src/mutations/do_set_subnet_operational_level.rs index 959e0fea9e6e..ed6dc78f8604 100644 --- a/rs/registry/canister/src/mutations/do_set_subnet_operational_level.rs +++ b/rs/registry/canister/src/mutations/do_set_subnet_operational_level.rs @@ -50,6 +50,7 @@ impl Registry { operational_level, ssh_readonly_access, ssh_node_state_write_access, + recalled_replica_version_ids, } = payload; let mut mutations: Vec = vec![]; @@ -61,6 +62,7 @@ impl Registry { self.get_subnet_or_panic(subnet_id), operational_level, ssh_readonly_access, + recalled_replica_version_ids, )); } @@ -82,10 +84,16 @@ impl Registry { operational_level, ssh_readonly_access, ssh_node_state_write_access, + recalled_replica_version_ids, } = payload; match subnet_id { None => { + if recalled_replica_version_ids.is_some() { + return Err( + "recalled_replica_version_ids specified, but not subnet_id.".to_string() + ); + } if operational_level.is_some() { return Err("operational_level specified, but not subnet_id.".to_string()); } @@ -106,6 +114,7 @@ impl Registry { validate_operational_level(*operational_level)?; validate_ssh_readonly_access(ssh_readonly_access)?; validate_ssh_node_state_write_access(ssh_node_state_write_access)?; + validate_recalled_replica_version_ids(recalled_replica_version_ids)?; Ok(()) } @@ -185,6 +194,22 @@ fn validate_node_ssh_access(node_ssh_access: &NodeSshAccess) -> Result<(), Strin Ok(()) } +fn validate_recalled_replica_version_ids( + recalled_replica_version_ids: &Option>, +) -> Result<(), String> { + if let Some(version_ids) = recalled_replica_version_ids { + for version_id in version_ids { + if version_id.is_empty() { + return Err( + "recalled_replica_version_ids cannot contain empty strings.".to_string() + ); + } + } + } + + Ok(()) +} + /// Returns mutation(s) (possibly 0) to subnet_record to effect /// operational_level and ssh_readonly_access. 
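+/// Newly recalled replica version IDs (if any are provided) are also appended to the
+/// subnet's recalled_replica_version_ids; versions that are already present are skipped.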
fn modify_subnet_record_for_set_subnet_operational_level( @@ -192,6 +217,7 @@ fn modify_subnet_record_for_set_subnet_operational_level( mut subnet_record: SubnetRecord, operational_level: Option, ssh_readonly_access: Option>, + recalled_replica_version_ids: Option>, ) -> RegistryMutation { if let Some(operational_level) = operational_level { let is_halted = match operational_level { @@ -207,6 +233,17 @@ fn modify_subnet_record_for_set_subnet_operational_level( subnet_record.ssh_readonly_access = ssh_readonly_access; } + if let Some(version_ids) = recalled_replica_version_ids { + for version_id in version_ids { + if !subnet_record + .recalled_replica_version_ids + .contains(&version_id) + { + subnet_record.recalled_replica_version_ids.push(version_id); + } + } + } + update( make_subnet_record_key(subnet_id).into_bytes(), subnet_record.encode_to_vec(), @@ -254,6 +291,7 @@ pub struct SetSubnetOperationalLevelPayload { pub operational_level: Option, pub ssh_readonly_access: Option>, pub ssh_node_state_write_access: Option>, + pub recalled_replica_version_ids: Option>, } #[derive(Clone, Eq, PartialEq, Debug, CandidType, Serialize, Deserialize)] diff --git a/rs/registry/canister/src/mutations/do_set_subnet_operational_level/tests.rs b/rs/registry/canister/src/mutations/do_set_subnet_operational_level/tests.rs index 0616ded07794..86add216e57d 100644 --- a/rs/registry/canister/src/mutations/do_set_subnet_operational_level/tests.rs +++ b/rs/registry/canister/src/mutations/do_set_subnet_operational_level/tests.rs @@ -77,6 +77,7 @@ fn test_set_subnet_operational_level() { node_id: Some(*NODE_ID), public_keys: Some(vec!["fake node state write public key".to_string()]), }]), + recalled_replica_version_ids: None, }); // Step 3A: Verify results. @@ -111,6 +112,7 @@ fn test_set_subnet_operational_level() { node_id: Some(*NODE_ID), public_keys: Some(vec![]), }]), + recalled_replica_version_ids: None, }); // Step 3B: Verify results. In particular, everything is now back to the way it was. @@ -225,6 +227,7 @@ lazy_static! { operational_level: Some(operational_level::NORMAL), ssh_readonly_access: Some(vec!["hello".to_string(), "world".to_string()]), ssh_node_state_write_access: Some(GENERAL_SSH_NODE_STATE_WRITE_ACCESS.clone()), + recalled_replica_version_ids: None, }; } @@ -245,6 +248,7 @@ fn test_validate_payload_no_subnet_ok() { node_id: Some(*NODE_ID), public_keys: Some(vec!["fake node state write public key".to_string()]), }]), + recalled_replica_version_ids: None, }); // Step 3: Verify results. @@ -278,6 +282,7 @@ fn test_validate_payload_no_node_ok() { ssh_readonly_access: Some(vec!["fake read-only public key".to_string()]), ssh_node_state_write_access: None, + recalled_replica_version_ids: None, }); // Step 3: Verify results. @@ -313,6 +318,7 @@ fn test_validate_payload_empty() { ssh_readonly_access: None, ssh_node_state_write_access: None, + recalled_replica_version_ids: None, }); // Step 3: Verify results. @@ -331,6 +337,7 @@ fn test_validate_payload_no_subnet_but_operational_level() { node_id: Some(*NODE_ID), public_keys: Some(vec!["fake node state write public key".to_string()]), }]), + recalled_replica_version_ids: None, }); // Step 3: Verify results. 
@@ -352,6 +359,7 @@ fn test_validate_payload_no_subnet_but_ssh_readonly_access() {
             node_id: Some(*NODE_ID),
             public_keys: Some(vec!["fake node state write public key".to_string()]),
         }]),
+        recalled_replica_version_ids: None,
     });
 
     match result {
@@ -359,3 +367,79 @@ fn test_validate_payload_no_subnet_but_ssh_readonly_access() {
         Err(err) => assert!(err.contains("ssh_readonly_access")),
     }
 }
+
+#[test]
+fn test_recall_replica_versions() {
+    let (mut registry, _node_id, _node_record, _subnet_record) = _FIXTURE.clone();
+
+    let version_id_1 = "test-version-1".to_string();
+    let version_id_2 = "test-version-2".to_string();
+    let version_id_3 = "test-version-3".to_string();
+
+    registry.do_set_subnet_operational_level(SetSubnetOperationalLevelPayload {
+        subnet_id: Some(*SUBNET_ID),
+        operational_level: None,
+        ssh_readonly_access: None,
+        ssh_node_state_write_access: None,
+        recalled_replica_version_ids: Some(vec![version_id_1.clone()]),
+    });
+
+    registry.do_set_subnet_operational_level(SetSubnetOperationalLevelPayload {
+        subnet_id: Some(*SUBNET_ID),
+        operational_level: None,
+        ssh_readonly_access: None,
+        ssh_node_state_write_access: None,
+        recalled_replica_version_ids: Some(vec![version_id_2.clone(), version_id_3.clone()]),
+    });
+
+    // version_id_1 again - the duplicate must not be added a second time.
+    registry.do_set_subnet_operational_level(SetSubnetOperationalLevelPayload {
+        subnet_id: Some(*SUBNET_ID),
+        operational_level: Some(operational_level::DOWN_FOR_REPAIRS),
+        ssh_readonly_access: None,
+        ssh_node_state_write_access: None,
+        recalled_replica_version_ids: Some(vec![version_id_1.clone()]),
+    });
+
+    let subnet_record = registry.get_subnet_or_panic(*SUBNET_ID);
+    assert_eq!(
+        subnet_record.recalled_replica_version_ids,
+        vec![version_id_1, version_id_2, version_id_3]
+    );
+}
+
+#[test]
+fn test_validate_recalled_replica_version_ids_without_subnet_id() {
+    let result =
+        REGISTRY.validate_set_subnet_operational_level(&SetSubnetOperationalLevelPayload {
+            subnet_id: None,
+            operational_level: None,
+            ssh_readonly_access: None,
+            ssh_node_state_write_access: None,
+            recalled_replica_version_ids: Some(vec!["test-version".to_string()]),
+        });
+
+    assert!(
+        result
+            .expect_err("Err not returned")
+            .contains("recalled_replica_version_ids specified, but not subnet_id")
+    );
+}
+
+#[test]
+fn test_validate_recalled_replica_version_ids_empty() {
+    let result =
+        REGISTRY.validate_set_subnet_operational_level(&SetSubnetOperationalLevelPayload {
+            subnet_id: Some(*SUBNET_ID),
+            operational_level: None,
+            ssh_readonly_access: None,
+            ssh_node_state_write_access: None,
+            recalled_replica_version_ids: Some(vec!["".to_string()]),
+        });
+
+    assert!(
+        result
+            .expect_err("Err not returned")
+            .contains("recalled_replica_version_ids cannot contain empty strings")
+    );
+}
diff --git a/rs/registry/canister/src/mutations/do_update_subnet.rs b/rs/registry/canister/src/mutations/do_update_subnet.rs
index 0501330f6ed3..fae34c1bc798 100644
--- a/rs/registry/canister/src/mutations/do_update_subnet.rs
+++ b/rs/registry/canister/src/mutations/do_update_subnet.rs
@@ -566,6 +566,7 @@ mod tests {
             ssh_backup_access: vec![],
             chain_key_config: None,
             canister_cycles_cost_schedule: CanisterCyclesCostSchedule::Normal as i32,
+            recalled_replica_version_ids: vec![],
         };
 
         let key_id =
EcdsaKeyId { @@ -658,6 +659,7 @@ mod tests { ssh_readonly_access: vec!["pub_key_0".to_string()], ssh_backup_access: vec!["pub_key_1".to_string()], canister_cycles_cost_schedule: CanisterCyclesCostSchedule::Normal as i32, + recalled_replica_version_ids: vec![], } ); } @@ -684,6 +686,7 @@ mod tests { ssh_backup_access: vec![], chain_key_config: None, canister_cycles_cost_schedule: CanisterCyclesCostSchedule::Normal as i32, + recalled_replica_version_ids: vec![], }; let payload = UpdateSubnetPayload { @@ -744,6 +747,7 @@ mod tests { ssh_backup_access: vec![], chain_key_config: None, canister_cycles_cost_schedule: CanisterCyclesCostSchedule::Normal as i32, + recalled_replica_version_ids: vec![], } ); } diff --git a/rs/registry/canister/tests/update_subnet.rs b/rs/registry/canister/tests/update_subnet.rs index 114f9cbd83c8..0c1a57ceabde 100644 --- a/rs/registry/canister/tests/update_subnet.rs +++ b/rs/registry/canister/tests/update_subnet.rs @@ -153,6 +153,7 @@ fn test_a_canister_other_than_the_governance_canister_cannot_update_a_subnets_co ssh_backup_access: vec![], chain_key_config: None, canister_cycles_cost_schedule: CanisterCyclesCostSchedule::Normal as i32, + recalled_replica_version_ids: vec![], }; // An attacker got a canister that is trying to pass for the governance @@ -277,6 +278,7 @@ fn test_the_governance_canister_can_update_a_subnets_configuration() { chain_key_config: None, canister_cycles_cost_schedule: CanisterCyclesCostSchedule::Normal as i32, + recalled_replica_version_ids: vec![], } .encode_to_vec(), )], @@ -368,6 +370,7 @@ fn test_the_governance_canister_can_update_a_subnets_configuration() { ssh_backup_access: vec!["pub_key_1".to_string()], chain_key_config: None, canister_cycles_cost_schedule: CanisterCyclesCostSchedule::Normal as i32, + recalled_replica_version_ids: vec![], } ); @@ -451,6 +454,7 @@ fn test_subnets_configuration_chain_key_fields_are_updated_correctly(key_id: Mas ssh_backup_access: vec![], chain_key_config: None, canister_cycles_cost_schedule: CanisterCyclesCostSchedule::Normal as i32, + recalled_replica_version_ids: vec![], }; // Just create the registry canister and wait until the subnet_handler ID is diff --git a/rs/test_utilities/registry/src/lib.rs b/rs/test_utilities/registry/src/lib.rs index 91b59a3c31c4..a419f0f612b4 100644 --- a/rs/test_utilities/registry/src/lib.rs +++ b/rs/test_utilities/registry/src/lib.rs @@ -234,6 +234,7 @@ pub fn test_subnet_record() -> SubnetRecord { ssh_backup_access: vec![], chain_key_config: None, canister_cycles_cost_schedule: CanisterCyclesCostSchedule::Normal as i32, + recalled_replica_version_ids: vec![], } } diff --git a/rs/tests/driver/src/nns.rs b/rs/tests/driver/src/nns.rs index 94c20bc33e60..ce8a4c3a719e 100644 --- a/rs/tests/driver/src/nns.rs +++ b/rs/tests/driver/src/nns.rs @@ -51,6 +51,7 @@ use registry_canister::mutations::{ do_create_subnet::{CanisterCyclesCostSchedule, CreateSubnetPayload}, do_deploy_guestos_to_all_subnet_nodes::DeployGuestosToAllSubnetNodesPayload, do_deploy_guestos_to_all_unassigned_nodes::DeployGuestosToAllUnassignedNodesPayload, + do_recall_subnet_replica_version::RecallSubnetReplicaVersionPayload, do_remove_nodes_from_subnet::RemoveNodesFromSubnetPayload, do_revise_elected_replica_versions::ReviseElectedGuestosVersionsPayload, do_update_api_boundary_nodes_version::UpdateApiBoundaryNodesVersionPayload, From 9aa76e32c3fd5576cb258d5c0daa7d6ba19d30f5 Mon Sep 17 00:00:00 2001 From: David Frank Date: Wed, 10 Dec 2025 16:59:17 +0100 Subject: [PATCH 2/3] testing --- Cargo.lock | 2 + 
rs/orchestrator/Cargo.toml | 1 + .../registry_replicator/Cargo.toml | 1 + .../registry_replicator/src/lib.rs | 37 ++ .../registry_replicator/src/mock.rs | 96 ++++ rs/orchestrator/src/upgrade.rs | 531 +++++++++++------- 6 files changed, 462 insertions(+), 206 deletions(-) create mode 100644 rs/orchestrator/registry_replicator/src/mock.rs diff --git a/Cargo.lock b/Cargo.lock index 6d63b75386fe..027daa8e7cfd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -13229,6 +13229,7 @@ dependencies = [ name = "ic-registry-replicator" version = "0.9.0" dependencies = [ + "async-trait", "candid", "clap 4.5.27", "ic-certification-test-utils", @@ -19166,6 +19167,7 @@ dependencies = [ "ic-nns-constants", "ic-protobuf", "ic-registry-canister-api", + "ic-registry-client", "ic-registry-client-fake", "ic-registry-client-helpers", "ic-registry-keys", diff --git a/rs/orchestrator/Cargo.toml b/rs/orchestrator/Cargo.toml index 2949d71140c1..95925e825891 100644 --- a/rs/orchestrator/Cargo.toml +++ b/rs/orchestrator/Cargo.toml @@ -80,6 +80,7 @@ ic-crypto-test-utils-reproducible-rng = { path = "../crypto/test_utils/reproduci ic-crypto-test-utils-tls = { path = "../crypto/test_utils/tls" } ic-crypto-tls-interfaces-mocks = { path = "../crypto/tls_interfaces/mocks" } ic-crypto-tree-hash = { path = "../crypto/tree_hash" } +ic-registry-client = { path = "../registry/client" } ic-registry-client-fake = { path = "../registry/fake" } ic-registry-subnet-type = { path = "../registry/subnet_type" } ic-registry-proto-data-provider = { path = "../registry/proto_data_provider" } diff --git a/rs/orchestrator/registry_replicator/Cargo.toml b/rs/orchestrator/registry_replicator/Cargo.toml index 6ba133b41df1..6d72a6be2aa0 100644 --- a/rs/orchestrator/registry_replicator/Cargo.toml +++ b/rs/orchestrator/registry_replicator/Cargo.toml @@ -7,6 +7,7 @@ description.workspace = true documentation.workspace = true [dependencies] +async-trait = { workspace = true } clap = { workspace = true } ic-config = { path = "../../config" } ic-crypto-utils-threshold-sig-der = { path = "../../crypto/utils/threshold_sig_der" } diff --git a/rs/orchestrator/registry_replicator/src/lib.rs b/rs/orchestrator/registry_replicator/src/lib.rs index 24674c8a9ca9..27b4fa9ef3fd 100644 --- a/rs/orchestrator/registry_replicator/src/lib.rs +++ b/rs/orchestrator/registry_replicator/src/lib.rs @@ -63,6 +63,24 @@ use url::Url; pub mod args; mod internal_state; pub mod metrics; +pub mod mock; + +/// Trait for registry replication functionality. +/// This allows for mocking the registry replicator in tests. +#[async_trait::async_trait] +pub trait RegistryReplicatorTrait: Send + Sync { + /// Polls the registry once, fetching and applying updates. + async fn poll(&self) -> Result<(), String>; + + /// Returns the registry client used by this replicator. + fn get_registry_client(&self) -> Arc; + + /// Returns the local store used by this replicator. + fn get_local_store(&self) -> Arc; + + /// Stops polling and sets the local registry data to what is contained in the provided local store. + async fn stop_polling_and_set_local_registry_data(&self, new_local_store: &dyn LocalStore); +} trait PollableRegistryClient: RegistryClient { /// Polls the registry once, updating its cache by polling the latest local store changes. 
@@ -486,6 +504,25 @@ impl Drop for RegistryReplicator { } } +#[async_trait::async_trait] +impl RegistryReplicatorTrait for RegistryReplicator { + async fn poll(&self) -> Result<(), String> { + self.poll().await + } + + fn get_registry_client(&self) -> Arc { + self.get_registry_client() + } + + fn get_local_store(&self) -> Arc { + self.get_local_store() + } + + async fn stop_polling_and_set_local_registry_data(&self, new_local_store: &dyn LocalStore) { + self.stop_polling_and_set_local_registry_data(new_local_store).await + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/rs/orchestrator/registry_replicator/src/mock.rs b/rs/orchestrator/registry_replicator/src/mock.rs new file mode 100644 index 000000000000..e85fa951fd4d --- /dev/null +++ b/rs/orchestrator/registry_replicator/src/mock.rs @@ -0,0 +1,96 @@ +//! Mock implementation of RegistryReplicatorTrait for testing. + +use crate::RegistryReplicatorTrait; +use ic_interfaces_registry::{RegistryClient, RegistryDataProvider}; +use ic_registry_client::client::RegistryClientImpl; +use ic_registry_local_store::{KeyMutation, LocalStore, LocalStoreImpl, LocalStoreWriter}; +use ic_types::RegistryVersion; +use std::sync::Arc; + +/// Mock registry replicator that syncs data from a "remote" registry data provider +/// to a local store and registry client. +pub struct MockRegistryReplicator { + /// The "remote" registry data provider that simulates the NNS registry + remote_data_provider: Arc, + /// The local store that gets updated during replication + local_store: Arc, + /// The registry client that reads from the local store + registry_client: Arc, +} + +impl MockRegistryReplicator { + /// Creates a new mock registry replicator. + /// + /// # Arguments + /// * `remote_data_provider` - The "remote" registry that will be replicated from + /// * `local_store` - The local store that will be updated during replication + /// * `registry_client` - The registry client that reads from the local store + pub fn new( + remote_data_provider: Arc, + local_store: Arc, + registry_client: Arc, + ) -> Self { + Self { + remote_data_provider, + local_store, + registry_client, + } + } +} + +#[async_trait::async_trait] +impl RegistryReplicatorTrait for MockRegistryReplicator { + /// Simulates polling the remote registry by fetching updates from the remote + /// data provider and applying them to the local store. 
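+    ///
+    /// Note: unlike the production replicator, a single call applies every version
+    /// currently available from the remote data provider, so one `poll()` brings the
+    /// local store fully up to date.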
+ async fn poll(&self) -> Result<(), String> { + // Get the current latest version in the local store + let local_latest_version = self.registry_client.get_latest_version(); + + // Fetch all updates from the remote data provider since the local version + let updates = self + .remote_data_provider + .get_updates_since(local_latest_version) + .map_err(|e| format!("Failed to get updates from remote: {:?}", e))?; + + // Group updates by version + let mut version_mutations: std::collections::BTreeMap> = + std::collections::BTreeMap::new(); + + for record in updates { + version_mutations + .entry(record.version) + .or_insert_with(Vec::new) + .push(KeyMutation { + key: record.key, + value: record.value, + }); + } + + // Apply updates to the local store + for (version, mutations) in version_mutations { + self.local_store + .store(version, mutations) + .map_err(|e| format!("Failed to store updates: {:?}", e))?; + } + + // Update the registry client to see the new data + self.registry_client + .poll_once() + .map_err(|e| format!("Failed to update registry client: {:?}", e))?; + + Ok(()) + } + + fn get_registry_client(&self) -> Arc { + self.registry_client.clone() + } + + fn get_local_store(&self) -> Arc { + self.local_store.clone() + } + + async fn stop_polling_and_set_local_registry_data(&self, _new_local_store: &dyn LocalStore) { + // Mock implementation does nothing - not needed for testing + } +} + diff --git a/rs/orchestrator/src/upgrade.rs b/rs/orchestrator/src/upgrade.rs index 30b74f868e9f..0d617ebfda48 100644 --- a/rs/orchestrator/src/upgrade.rs +++ b/rs/orchestrator/src/upgrade.rs @@ -20,7 +20,7 @@ use ic_management_canister_types_private::MasterPublicKeyId; use ic_protobuf::proxy::try_from_option_field; use ic_registry_client_helpers::{node::NodeRegistry, subnet::SubnetRegistry}; use ic_registry_local_store::LocalStoreImpl; -use ic_registry_replicator::RegistryReplicator; +use ic_registry_replicator::RegistryReplicatorTrait; use ic_types::{ Height, NodeId, RegistryVersion, ReplicaVersion, SubnetId, consensus::{CatchUpPackage, HasHeight}, @@ -91,7 +91,7 @@ pub(crate) struct Upgrade { replica_config_file: PathBuf, pub ic_binary_dir: PathBuf, pub image_path: PathBuf, - registry_replicator: Arc, + registry_replicator: Arc, pub logger: ReplicaLogger, node_id: NodeId, disk_encryption_key_exchange_agent: Option, @@ -111,7 +111,7 @@ impl Upgrade { replica_config_file: PathBuf, node_id: NodeId, ic_binary_dir: PathBuf, - registry_replicator: Arc, + registry_replicator: Arc, release_content_dir: PathBuf, logger: ReplicaLogger, orchestrator_data_directory: PathBuf, @@ -1591,12 +1591,15 @@ mod tests { h: Height, registry_version: RegistryVersion, ) -> CatchUpPackage { + let mut dkg_summary = DkgSummary::fake(); + dkg_summary.registry_version = registry_version; + let block = Block::new( CryptoHashOf::from(CryptoHash(Vec::new())), Payload::new( ic_types::crypto::crypto_hash, BlockPayload::Summary(SummaryPayload { - dkg: DkgSummary::fake(), + dkg: dkg_summary, idkg: Some(idkg::IDkgPayload::empty( h, subnet_test_id(1), @@ -1630,212 +1633,178 @@ mod tests { } } - #[tokio::test] - async fn test_recalled_replica_version_prevents_upgrade() { - use candid::Encode; - use ic_crypto_test_utils_crypto_returning_ok::CryptoReturningOk; - use ic_crypto_utils_threshold_sig_der::parse_threshold_sig_key_from_der; - use ic_nervous_system_integration_tests::pocket_ic_helpers::install_canister; - use ic_nns_constants::REGISTRY_CANISTER_ID; - use ic_nns_test_utils::common::{NnsInitPayloadsBuilder, build_test_registry_wasm}; 
- use ic_protobuf::registry::node::v1::NodeRecord; - use ic_registry_nns_data_provider::registry::RegistryCanister; - use ic_registry_transport::{ - serialize_atomic_mutate_request, deserialize_atomic_mutate_response, - pb::v1::{RegistryMutation, registry_mutation::Type}, + /// Helper function to add a subnet record to a registry data provider. + fn add_subnet_record_to_provider( + data_provider: &ic_registry_proto_data_provider::ProtoRegistryDataProvider, + subnet_id: SubnetId, + version: RegistryVersion, + membership: Vec, + replica_version_id: &str, + recalled_replica_version_ids: Vec, + ) { + use ic_protobuf::registry::subnet::v1::SubnetRecord; + use ic_registry_keys::make_subnet_record_key; + + let subnet_record = SubnetRecord { + membership: membership.iter().map(|id| id.get().to_vec()).collect(), + replica_version_id: replica_version_id.to_string(), + recalled_replica_version_ids, + ..Default::default() }; - use pocket_ic::PocketIcBuilder; - use ic_protobuf::registry::replica_version::v1::ReplicaVersionRecord; - use ic_protobuf::types::v1 as pb; - use ic_registry_client_fake::FakeRegistryClient; - use ic_registry_keys::{make_node_record_key, make_replica_version_key, make_subnet_record_key}; - use ic_registry_proto_data_provider::ProtoRegistryDataProvider; - use ic_registry_replicator::RegistryReplicator; - use ic_test_utilities_types::ids::node_test_id; - use ic_types::RegistryVersion; - use prost::Message; - use std::time::Duration; - - with_test_replica_logger(|log| async move { - let current_version = ReplicaVersion::try_from("version_1.0.0").unwrap(); - let recalled_version = ReplicaVersion::try_from("version_2.0.0").unwrap(); - let node_id = node_test_id(1); - let subnet_id = subnet_test_id(0); - - let cup_registry_version = RegistryVersion::from(10); - let latest_registry_version = RegistryVersion::from(20); - - // Create a temporary ProtoRegistryDataProvider to build the registry data - let temp_data_provider = Arc::new(ProtoRegistryDataProvider::new()); - - let node_record = NodeRecord { - xnet: Some(ic_protobuf::registry::node::v1::ConnectionEndpoint { - ip_addr: "127.0.0.1".to_string(), - port: 1234, - }), - http: Some(ic_protobuf::registry::node::v1::ConnectionEndpoint { - ip_addr: "127.0.0.1".to_string(), - port: 8080, - }), - ..Default::default() - }; - - temp_data_provider - .add(&make_node_record_key(node_id), cup_registry_version, Some(node_record.clone())) - .unwrap(); - - temp_data_provider - .add(&make_node_record_key(node_id), latest_registry_version, Some(node_record)) - .unwrap(); - - // Add a minimal node record at version 1 (without HTTP endpoint to avoid connection attempts) - temp_data_provider - .add( - &make_node_record_key(node_id), - RegistryVersion::from(1), - Some(NodeRecord { - ..Default::default() - }), - ) - .unwrap(); - - let subnet_record_at_cup_version = SubnetRecord { - membership: vec![node_id.get().to_vec()], - replica_version_id: recalled_version.to_string(), - recalled_replica_version_ids: vec![], - ..Default::default() - }; - - temp_data_provider - .add( - &make_subnet_record_key(subnet_id), - cup_registry_version, - Some(subnet_record_at_cup_version), - ) - .unwrap(); - - temp_data_provider - .add( - &make_replica_version_key(recalled_version.to_string()), - cup_registry_version, - Some(ReplicaVersionRecord::default()), - ) - .unwrap(); - - let subnet_record_at_latest_version = SubnetRecord { - membership: vec![node_id.get().to_vec()], - replica_version_id: recalled_version.to_string(), - recalled_replica_version_ids: 
vec![recalled_version.to_string()], - ..Default::default() - }; - temp_data_provider - .add( - &make_subnet_record_key(subnet_id), - latest_registry_version, - Some(subnet_record_at_latest_version), - ) - .unwrap(); - - temp_data_provider - .add( - &make_replica_version_key(recalled_version.to_string()), - latest_registry_version, - Some(ReplicaVersionRecord::default()), - ) - .unwrap(); + data_provider + .add(&make_subnet_record_key(subnet_id), version, Some(subnet_record)) + .unwrap(); + } - // Add root subnet ID and NNS public key to the temp data provider - use ic_registry_keys::{make_crypto_threshold_signing_pubkey_key, ROOT_SUBNET_ID_KEY}; - use ic_protobuf::registry::crypto::v1::PublicKey as PbPublicKey; - use ic_certification_test_utils::{CertificateBuilder, CertificateData}; - use ic_crypto_tree_hash::Digest; - use ic_types::CanisterId; + /// Helper function to add a replica version record to a registry data provider. + fn add_replica_version_to_provider( + data_provider: &ic_registry_proto_data_provider::ProtoRegistryDataProvider, + replica_version: &ReplicaVersion, + version: RegistryVersion, + ) { + use ic_protobuf::registry::replica_version::v1::ReplicaVersionRecord; + use ic_registry_keys::make_replica_version_key; - // Add root subnet ID at version 1 - temp_data_provider - .add( - ROOT_SUBNET_ID_KEY, - RegistryVersion::from(1), - Some(ic_types::subnet_id_into_protobuf(subnet_id)), - ) - .unwrap(); + data_provider + .add( + &make_replica_version_key(replica_version.to_string()), + version, + Some(ReplicaVersionRecord::default()), + ) + .unwrap(); + } - // Add NNS public key at version 2 - let (_, nns_public_key, _) = CertificateBuilder::new(CertificateData::CanisterData { - canister_id: CanisterId::from_u64(0), - certified_data: Digest([1; 32]), - }) - .build(); - temp_data_provider - .add( - &make_crypto_threshold_signing_pubkey_key(subnet_id), - RegistryVersion::from(2), - Some(PbPublicKey::from(nns_public_key)), - ) - .unwrap(); + /// Helper struct for setting up upgrade tests with mock registry replication. + /// + /// This struct provides a convenient way to set up all the components needed for testing + /// the Upgrade module with a mock registry replicator. It handles: + /// - Creating a temporary directory for test files + /// - Setting up a remote registry data provider (simulating the NNS registry) + /// - Creating a local store and populating it with data from the remote provider + /// - Creating a mock registry replicator that syncs from remote to local + /// - Setting up all necessary components (CUP provider, metrics, process manager, etc.) 
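+    /// - Creating a dummy replica binary so the replica process manager has something to run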
+ /// + /// # Example + /// ```ignore + /// let setup = UpgradeTestSetup::new_with_remote_provider( + /// node_id, + /// subnet_id, + /// cup_registry_version, + /// remote_data_provider, + /// logger, + /// ).await; + /// + /// setup.add_subnet_record(subnet_id, version, vec![node_id], "version_1.0.0", vec![]); + /// setup.add_replica_version(&version, version); + /// let upgrade = setup.create_upgrade(node_id, current_version, logger).await; + /// ``` + struct UpgradeTestSetup { + #[allow(dead_code)] + tmp_dir: tempfile::TempDir, + remote_data_provider: Arc, + #[allow(dead_code)] + local_store: Arc, + #[allow(dead_code)] + registry_client: Arc, + registry_replicator: Arc, + registry: Arc, + cup_provider: Arc, + metrics: Arc, + replica_process: Arc>>, + ic_binary_dir: std::path::PathBuf, + orchestrator_data_dir: std::path::PathBuf, + } + + impl UpgradeTestSetup { + /// Creates a new test setup with a mock registry replicator and a pre-populated remote provider. + /// + /// # Arguments + /// * `node_id` - The node ID for this test + /// * `_subnet_id` - The subnet ID for this test (unused but kept for API consistency) + /// * `cup_registry_version` - The registry version to use in the CUP + /// * `remote_data_provider` - Pre-populated remote registry data provider + /// * `logger` - The logger to use + /// + /// The setup creates: + /// - A local store populated up to `cup_registry_version` from the remote provider + /// - A mock registry replicator that syncs from remote to local + /// - All necessary components for creating an Upgrade instance + async fn new_with_remote_provider( + node_id: NodeId, + _subnet_id: SubnetId, + cup_registry_version: RegistryVersion, + remote_data_provider: Arc, + logger: ReplicaLogger, + ) -> Self { + use ic_crypto_test_utils_crypto_returning_ok::CryptoReturningOk; + use ic_interfaces_registry::RegistryDataProvider; + use ic_registry_client::client::RegistryClientImpl; + use ic_registry_local_store::{KeyMutation, LocalStoreImpl, LocalStoreWriter}; + use ic_registry_replicator::mock::MockRegistryReplicator; + use tempfile::tempdir; let tmp_dir = tempdir().unwrap(); + + // Set up local store and registry client let local_store_path = tmp_dir.path().join("ic_registry_local_store"); std::fs::create_dir_all(&local_store_path).unwrap(); - - use ic_registry_local_store::{KeyMutation, LocalStoreImpl, LocalStoreWriter}; - let local_store = Arc::new(LocalStoreImpl::new(local_store_path.clone())); - // Populate the local store with all registry data from temp_data_provider - let changelog = temp_data_provider.get_updates_since(RegistryVersion::from(0)).unwrap(); - let mut version_mutations: std::collections::BTreeMap> = std::collections::BTreeMap::new(); + // Populate the local store with data up to CUP version (simulating what's already local) + let changelog = remote_data_provider + .get_updates_since(RegistryVersion::from(0)) + .unwrap(); + let mut version_mutations: std::collections::BTreeMap> = + std::collections::BTreeMap::new(); for record in changelog { - // Populate local store with all data up to latest_registry_version - if record.version <= latest_registry_version { - version_mutations.entry(record.version).or_insert_with(Vec::new).push(KeyMutation { - key: record.key, - value: record.value, - }); + if record.version <= cup_registry_version { + version_mutations + .entry(record.version) + .or_insert_with(Vec::new) + .push(KeyMutation { + key: record.key, + value: record.value, + }); } } - // Populate local store with continuous versions from 1 to 
latest_registry_version - for version in 1..=latest_registry_version.get() { + // Local store requires continuous versions, so fill in any gaps with dummy mutations + for version in 1..=cup_registry_version.get() { let version = RegistryVersion::from(version); let mutations = version_mutations.remove(&version).unwrap_or_else(|| { - use prost::Message; - let node_record = NodeRecord { - xnet: Some(ic_protobuf::registry::node::v1::ConnectionEndpoint { - ip_addr: "127.0.0.1".to_string(), - port: 1234, - }), - http: Some(ic_protobuf::registry::node::v1::ConnectionEndpoint { - ip_addr: "127.0.0.1".to_string(), - port: 8080, - }), - ..Default::default() - }; vec![KeyMutation { - key: make_node_record_key(node_id), - value: Some(node_record.encode_to_vec()), + key: format!("dummy_key_{}", version.get()), + value: Some(vec![0]), }] }); local_store.store(version, mutations).unwrap(); } - // Create FakeRegistryClient from the local store (which now has all the data) - let registry_client = Arc::new(FakeRegistryClient::new(local_store.clone() as Arc<_>)); - registry_client.update_to_latest_version(); + // Create registry client from the local store + let registry_client = Arc::new(RegistryClientImpl::new(local_store.clone(), None)); + registry_client.fetch_and_start_polling().unwrap(); + + // Create mock registry replicator that will sync from remote to local + let registry_replicator = Arc::new(MockRegistryReplicator::new( + remote_data_provider.clone(), + local_store.clone(), + registry_client.clone(), + )); let registry = Arc::new(RegistryHelper::new( node_id, registry_client.clone() as Arc<_>, - log.clone(), + logger.clone(), )); + // Set up CUP provider let cup_dir = tmp_dir.path().join("cups"); std::fs::create_dir_all(&cup_dir).unwrap(); let cup = make_cup_with_registry_version(Height::from(10), cup_registry_version); - let cup_proto = pb::CatchUpPackage::from(&cup); + let cup_proto = ic_protobuf::types::v1::CatchUpPackage::from(&cup); let cup_file = cup_dir.join("cup.types.v1.CatchUpPackage.pb"); std::fs::write(&cup_file, cup_proto.encode_to_vec()).unwrap(); @@ -1844,52 +1813,202 @@ mod tests { cup_dir.clone(), Arc::new(CryptoReturningOk::default()), Arc::new(crate::catch_up_package_provider::tests::mock_tls_config()), - log.clone(), + logger.clone(), node_id, )); - let registry_replicator = Arc::new( - RegistryReplicator::new( - log.clone(), - local_store_path, - Duration::from_secs(10), - vec![], - None, - ) - .await, - ); - let metrics = Arc::new(OrchestratorMetrics::new(&MetricsRegistry::new())); - let replica_process = Arc::new(Mutex::new(ProcessManager::new(log.clone()))); + let replica_process = Arc::new(Mutex::new(ProcessManager::new(logger.clone()))); + let orchestrator_data_dir = tmp_dir.path().join("orchestrator"); std::fs::create_dir_all(&orchestrator_data_dir).unwrap(); - let mut upgrade = Upgrade::new( - registry.clone(), + // Create a dummy replica binary so ensure_replica_is_running doesn't fail + let ic_binary_dir = tmp_dir.path().join("ic_binary"); + std::fs::create_dir_all(&ic_binary_dir).unwrap(); + let replica_binary = ic_binary_dir.join("replica"); + std::fs::write(&replica_binary, "#!/bin/sh\nsleep 1000\n").unwrap(); + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + std::fs::set_permissions(&replica_binary, std::fs::Permissions::from_mode(0o755)) + .unwrap(); + } + + Self { + tmp_dir, + remote_data_provider, + local_store, + registry_client, + registry_replicator, + registry, + cup_provider, metrics, replica_process, - cup_provider, + ic_binary_dir, + 
orchestrator_data_dir, + } + } + + /// Adds a subnet record to the remote registry at the specified version. + fn add_subnet_record( + &self, + subnet_id: SubnetId, + version: RegistryVersion, + membership: Vec, + replica_version_id: &str, + recalled_replica_version_ids: Vec, + ) { + use ic_protobuf::registry::subnet::v1::SubnetRecord; + use ic_registry_keys::make_subnet_record_key; + + let subnet_record = SubnetRecord { + membership: membership.iter().map(|id| id.get().to_vec()).collect(), + replica_version_id: replica_version_id.to_string(), + recalled_replica_version_ids, + ..Default::default() + }; + + self.remote_data_provider + .add(&make_subnet_record_key(subnet_id), version, Some(subnet_record)) + .unwrap(); + } + + /// Adds a replica version record to the remote registry at the specified version. + fn add_replica_version(&self, replica_version: &ReplicaVersion, version: RegistryVersion) { + use ic_protobuf::registry::replica_version::v1::ReplicaVersionRecord; + use ic_registry_keys::make_replica_version_key; + + self.remote_data_provider + .add( + &make_replica_version_key(replica_version.to_string()), + version, + Some(ReplicaVersionRecord::default()), + ) + .unwrap(); + } + + /// Adds dummy data for intermediate versions to ensure continuous versions. + fn fill_intermediate_versions(&self, from: RegistryVersion, to: RegistryVersion) { + for version in (from.get() + 1)..to.get() { + self.remote_data_provider + .add( + &format!("dummy_key_{}", version), + RegistryVersion::from(version), + Some(vec![0]), + ) + .unwrap(); + } + } + + /// Creates an Upgrade instance with the given parameters. + async fn create_upgrade( + &self, + node_id: NodeId, + current_version: ReplicaVersion, + logger: ReplicaLogger, + ) -> Upgrade { + Upgrade::new( + self.registry.clone(), + self.metrics.clone(), + self.replica_process.clone(), + self.cup_provider.clone(), current_version, - tmp_dir.path().join("replica_config.json"), + self.tmp_dir.path().join("replica_config.json"), node_id, - tmp_dir.path().join("ic_binary"), - registry_replicator, - tmp_dir.path().join("release"), - log.clone(), - orchestrator_data_dir, + self.ic_binary_dir.clone(), + self.registry_replicator.clone(), + self.tmp_dir.path().join("release"), + logger, + self.orchestrator_data_dir.clone(), None, ) + .await + } + } + + /// Test that verifies the orchestrator correctly handles recalled replica versions. + /// + /// This test simulates a scenario where: + /// 1. A subnet is running version_1.0.0 (current version) + /// 2. The CUP (at registry version 10) indicates an upgrade to version_2.0.0 + /// 3. The remote registry (at version 20) marks version_2.0.0 as recalled + /// 4. The orchestrator replicates the remote registry and detects the recalled version + /// 5. The orchestrator should NOT upgrade to the recalled version + /// 6. The orchestrator should continue running the current version (version_1.0.0) + /// + /// The test uses a MockRegistryReplicator to simulate registry replication from a + /// "remote" NNS registry to a local store, allowing us to test the full replication + /// and recalled version detection flow. 
+    #[tokio::test]
+    async fn test_recalled_replica_version_prevents_upgrade() {
+        use ic_registry_proto_data_provider::ProtoRegistryDataProvider;
+        use ic_test_utilities_types::ids::{node_test_id, subnet_test_id};
+        use ic_types::{RegistryVersion, ReplicaVersion};
+        use std::sync::Arc;
+
+        with_test_replica_logger(|log| async move {
+            let current_version = ReplicaVersion::try_from("version_1.0.0").unwrap();
+            let recalled_version = ReplicaVersion::try_from("version_2.0.0").unwrap();
+            let node_id = node_test_id(1);
+            let subnet_id = subnet_test_id(0);
+
+            let cup_registry_version = RegistryVersion::from(10);
+            let latest_registry_version = RegistryVersion::from(20);
+
+            // Create the remote data provider and populate it with data BEFORE creating the setup
+            let remote_data_provider = Arc::new(ProtoRegistryDataProvider::new());
+
+            // Add initial subnet record at version 1 (required for the node to be assigned)
+            add_subnet_record_to_provider(
+                &remote_data_provider,
+                subnet_id,
+                RegistryVersion::from(1),
+                vec![node_id],
+                &current_version.to_string(),
+                vec![],
+            );
+            add_replica_version_to_provider(&remote_data_provider, &current_version, RegistryVersion::from(1));
+
+            // At CUP version (10): subnet record points to recalled version with empty recalled list
+            add_subnet_record_to_provider(
+                &remote_data_provider,
+                subnet_id,
+                cup_registry_version,
+                vec![node_id],
+                &recalled_version.to_string(),
+                vec![],
+            );
+            add_replica_version_to_provider(&remote_data_provider, &recalled_version, cup_registry_version);
+
+            // Now create the setup with the pre-populated remote data provider
+            let setup = UpgradeTestSetup::new_with_remote_provider(
+                node_id,
+                subnet_id,
+                cup_registry_version,
+                remote_data_provider.clone(),
+                log.clone(),
+            )
             .await;
 
-        let result = upgrade.check().await;
+            // At latest version (20): subnet record points back to current version and adds recalled version to recalled list
+            // This simulates the scenario where an upgrade was attempted to version_2.0.0, but then it was recalled
+            // and the subnet was rolled back to version_1.0.0
+            setup.add_subnet_record(
+                subnet_id,
+                latest_registry_version,
+                vec![node_id],
+                &current_version.to_string(),
+                vec![recalled_version.to_string()],
+            );
+            setup.add_replica_version(&recalled_version, latest_registry_version);
 
-        match &result {
-            Ok(OrchestratorControlFlow::Assigned(id)) => println!("Result: Ok(Assigned({}))", id),
-            Ok(OrchestratorControlFlow::Unassigned) => println!("Result: Ok(Unassigned)"),
-            Ok(OrchestratorControlFlow::Leaving(id)) => println!("Result: Ok(Leaving({}))", id),
-            Ok(OrchestratorControlFlow::Stop) => println!("Result: Ok(Stop)"),
-            Err(e) => println!("Result: Err({:?})", e),
-        }
+            // Add dummy data for intermediate versions to ensure continuous versions
+            setup.fill_intermediate_versions(cup_registry_version, latest_registry_version);
+
+            let mut upgrade = setup.create_upgrade(node_id, current_version, log.clone()).await;
+
+            let result = upgrade.check().await;
 
             assert!(
                 matches!(result, Ok(OrchestratorControlFlow::Assigned(_))),

From 5e682c9afdc25b57e1d3b09bbbd6f145d2730452 Mon Sep 17 00:00:00 2001
From: David Frank
Date: Thu, 11 Dec 2025 16:38:22 +0100
Subject: [PATCH 3/3] testing

---
 rs/orchestrator/src/upgrade.rs | 639 +++++++++++++--------------------
 1 file changed, 247 insertions(+), 392 deletions(-)

diff --git a/rs/orchestrator/src/upgrade.rs b/rs/orchestrator/src/upgrade.rs
index 0d617ebfda48..cd35dc85307d 100644
--- a/rs/orchestrator/src/upgrade.rs
+++ b/rs/orchestrator/src/upgrade.rs
@@ -38,6 +38,7 @@ use std::{
 
 const KEY_CHANGES_FILENAME: &str = "key_changed_metric.cbor";
 
+#[derive(Debug, PartialEq, Eq)]
 #[must_use = "This may be a `Stop` variant, which should be handled"]
 pub(crate) enum OrchestratorControlFlow {
     /// The node is assigned to the subnet with the given subnet id.
@@ -1038,28 +1039,35 @@ fn report_master_public_key_changed_metric(
 
 #[cfg(test)]
 mod tests {
-    use prost::Message;
-    use std::collections::BTreeMap;
-
     use super::*;
     use ic_crypto_test_utils_canister_threshold_sigs::{
         CanisterThresholdSigTestEnvironment, IDkgParticipants, generate_key_transcript,
     };
+    use ic_crypto_test_utils_crypto_returning_ok::CryptoReturningOk;
     use ic_crypto_test_utils_ni_dkg::{
         NiDkgTestEnvironment, RandomNiDkgConfig, run_ni_dkg_and_create_single_transcript,
     };
    use ic_crypto_test_utils_reproducible_rng::{ReproducibleRng, reproducible_rng};
-    use ic_interfaces_registry::{RegistryClientVersionedResult, RegistryVersionedRecord};
+    use ic_interfaces_registry::{
+        RegistryClientVersionedResult, RegistryDataProvider, RegistryVersionedRecord,
+    };
     use ic_management_canister_types_private::{
         EcdsaCurve, EcdsaKeyId, SchnorrAlgorithm, SchnorrKeyId, VetKdCurve, VetKdKeyId,
     };
     use ic_metrics::MetricsRegistry;
-    use ic_protobuf::registry::subnet::v1::SubnetRecord;
+    use ic_protobuf::registry::{
+        replica_version::v1::ReplicaVersionRecord, subnet::v1::SubnetRecord,
+    };
+    use ic_registry_client::client::RegistryClientImpl;
+    use ic_registry_keys::{make_replica_version_key, make_subnet_record_key};
+    use ic_registry_local_store::{KeyMutation, LocalStoreImpl, LocalStoreWriter};
+    use ic_registry_proto_data_provider::ProtoRegistryDataProvider;
+    use ic_registry_replicator::mock::MockRegistryReplicator;
     use ic_test_utilities_consensus::fake::{Fake, FakeContent};
     use ic_test_utilities_logger::with_test_replica_logger;
-    use ic_test_utilities_types::ids::subnet_test_id;
+    use ic_test_utilities_types::ids::{node_test_id, subnet_test_id};
     use ic_types::{
-        PrincipalId, Time,
+        PrincipalId, RegistryVersion, ReplicaVersion, Time,
         batch::ValidationContext,
         consensus::{
             Block, BlockPayload, CatchUpContent, HashedBlock, HashedRandomBeacon, Payload,
@@ -1077,8 +1085,204 @@ mod tests {
         time::UNIX_EPOCH,
     };
     use mockall::mock;
+    use prost::Message;
+    use std::{collections::BTreeMap, path::Path, sync::Arc};
     use tempfile::{TempDir, tempdir};
 
+    fn make_cup_with_registry_version(
+        h: Height,
+        registry_version: RegistryVersion,
+    ) -> CatchUpPackage {
+        let mut dkg_summary = DkgSummary::fake();
+        dkg_summary.registry_version = registry_version;
+
+        let block = Block::new(
+            CryptoHashOf::from(CryptoHash(Vec::new())),
+            Payload::new(
+                ic_types::crypto::crypto_hash,
+                BlockPayload::Summary(SummaryPayload {
+                    dkg: dkg_summary,
+                    idkg: Some(idkg::IDkgPayload::empty(h, subnet_test_id(1), Vec::new())),
+                }),
+            ),
+            h,
+            Rank(46),
+            ValidationContext {
+                registry_version,
+                certified_height: Height::from(42),
+                time: UNIX_EPOCH,
+            },
+        );
+
+        CatchUpPackage::fake(CatchUpContent::new(
+            HashedBlock::new(ic_types::crypto::crypto_hash, block),
+            HashedRandomBeacon::new(
+                ic_types::crypto::crypto_hash,
+                RandomBeacon::fake(RandomBeaconContent::new(
+                    h,
+                    CryptoHashOf::from(CryptoHash(Vec::new())),
+                )),
+            ),
+            CryptoHashOf::from(CryptoHash(Vec::new())),
+            None,
+        ))
+    }
+
+    fn add_subnet_record_to_provider(
+        data_provider: &ProtoRegistryDataProvider,
+        subnet_id: SubnetId,
+        version: RegistryVersion,
+        membership: Vec<NodeId>,
+        replica_version_id: &str,
+        recalled_replica_version_ids: Vec<String>,
+    ) {
+        let subnet_record = SubnetRecord {
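+            // Only the fields that the upgrade flow inspects in this test (membership,
+            // replica_version_id, recalled_replica_version_ids) are set explicitly;
+            // everything else keeps its protobuf default.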
+            membership: membership.iter().map(|id| id.get().to_vec()).collect(),
+            replica_version_id: replica_version_id.to_string(),
+            recalled_replica_version_ids,
+            ..Default::default()
+        };
+
+        data_provider
+            .add(
+                &make_subnet_record_key(subnet_id),
+                version,
+                Some(subnet_record),
+            )
+            .unwrap();
+    }
+
+    fn add_replica_version_to_provider(
+        data_provider: &ProtoRegistryDataProvider,
+        replica_version: &ReplicaVersion,
+        version: RegistryVersion,
+    ) {
+        data_provider
+            .add(
+                &make_replica_version_key(replica_version.to_string()),
+                version,
+                Some(ReplicaVersionRecord {
+                    release_package_sha256_hex: "sha256".to_string(),
+                    release_package_urls: vec![],
+                    guest_launch_measurements: None,
+                }),
+            )
+            .unwrap();
+    }
+
+    fn create_dummy_replica_binary(ic_binary_dir: &Path) {
+        std::fs::create_dir_all(ic_binary_dir).unwrap();
+        let replica_binary = ic_binary_dir.join("replica");
+        std::fs::write(&replica_binary, "#!/bin/sh\nsleep 1000\n").unwrap();
+        #[cfg(unix)]
+        {
+            use std::os::unix::fs::PermissionsExt;
+            std::fs::set_permissions(&replica_binary, std::fs::Permissions::from_mode(0o755))
+                .unwrap();
+        }
+    }
+
+    async fn create_upgrade_for_test(
+        node_id: NodeId,
+        cup_registry_version: RegistryVersion,
+        remote_data_provider: Arc<ProtoRegistryDataProvider>,
+        current_version: ReplicaVersion,
+        logger: ReplicaLogger,
+    ) -> (Upgrade, Arc<ProtoRegistryDataProvider>, TempDir) {
+        let tmp_dir = tempdir().unwrap();
+
+        let local_store_path = tmp_dir.path().join("ic_registry_local_store");
+        std::fs::create_dir_all(&local_store_path).unwrap();
+        let local_store = Arc::new(LocalStoreImpl::new(local_store_path.clone()));
+
+        let changelog = remote_data_provider
+            .get_updates_since(RegistryVersion::from(0))
+            .unwrap();
+
+        let mut version_mutations: BTreeMap<RegistryVersion, Vec<KeyMutation>> = BTreeMap::new();
+
+        for record in changelog {
+            if record.version <= cup_registry_version {
+                version_mutations
+                    .entry(record.version)
+                    .or_insert_with(Vec::new)
+                    .push(KeyMutation {
+                        key: record.key,
+                        value: record.value,
+                    });
+            }
+        }
+
+        for version in 1..=cup_registry_version.get() {
+            let version = RegistryVersion::from(version);
+            let mutations = version_mutations.remove(&version).unwrap();
+            local_store.store(version, mutations).unwrap();
+        }
+
+        let registry_client = Arc::new(RegistryClientImpl::new(local_store.clone(), None));
+        registry_client.fetch_and_start_polling().unwrap();
+
+        let registry_replicator = Arc::new(MockRegistryReplicator::new(
+            remote_data_provider.clone(),
+            local_store.clone(),
+            registry_client.clone(),
+        ));
+
+        let registry = Arc::new(RegistryHelper::new(
+            node_id,
+            registry_client.clone() as Arc<_>,
+            logger.clone(),
+        ));
+
+        let cup_dir = tmp_dir.path().join("cups");
+        std::fs::create_dir_all(&cup_dir).unwrap();
+
+        let cup = make_cup_with_registry_version(
+            Height::from(cup_registry_version.get()),
+            cup_registry_version,
+        );
+        let cup_proto = ic_protobuf::types::v1::CatchUpPackage::from(&cup);
+        let cup_file = cup_dir.join("cup.types.v1.CatchUpPackage.pb");
+        std::fs::write(&cup_file, cup_proto.encode_to_vec()).unwrap();
+
+        let cup_provider = Arc::new(CatchUpPackageProvider::new(
+            registry.clone(),
+            cup_dir.clone(),
+            Arc::new(CryptoReturningOk::default()),
+            Arc::new(crate::catch_up_package_provider::tests::mock_tls_config()),
+            logger.clone(),
+            node_id,
+        ));
+
+        let metrics = Arc::new(OrchestratorMetrics::new(&MetricsRegistry::new()));
+        let replica_process = Arc::new(Mutex::new(ProcessManager::new(logger.clone())));
+
+        let orchestrator_data_dir = tmp_dir.path().join("orchestrator");
std::fs::create_dir_all(&orchestrator_data_dir).unwrap(); + + let ic_binary_dir = tmp_dir.path().join("ic_binary"); + create_dummy_replica_binary(&ic_binary_dir); + + let upgrade = Upgrade::new( + registry, + metrics, + replica_process, + cup_provider, + current_version, + tmp_dir.path().join("replica_config.json"), + node_id, + ic_binary_dir, + registry_replicator, + tmp_dir.path().join("release"), + logger, + orchestrator_data_dir, + None, + ) + .await; + + (upgrade, remote_data_provider, tmp_dir) + } + fn make_ecdsa_key_id() -> MasterPublicKeyId { MasterPublicKeyId::Ecdsa(EcdsaKeyId { curve: EcdsaCurve::Secp256k1, @@ -1587,379 +1791,19 @@ mod tests { } } - fn make_cup_with_registry_version( - h: Height, - registry_version: RegistryVersion, - ) -> CatchUpPackage { - let mut dkg_summary = DkgSummary::fake(); - dkg_summary.registry_version = registry_version; - - let block = Block::new( - CryptoHashOf::from(CryptoHash(Vec::new())), - Payload::new( - ic_types::crypto::crypto_hash, - BlockPayload::Summary(SummaryPayload { - dkg: dkg_summary, - idkg: Some(idkg::IDkgPayload::empty( - h, - subnet_test_id(1), - Vec::new(), - )), - }), - ), - h, - Rank(46), - ValidationContext { - registry_version, - certified_height: Height::from(42), - time: UNIX_EPOCH, - }, - ); - - CatchUpPackage { - content: CatchUpContent::new( - HashedBlock::new(ic_types::crypto::crypto_hash, block), - HashedRandomBeacon::new( - ic_types::crypto::crypto_hash, - RandomBeacon::fake(RandomBeaconContent::new( - h, - CryptoHashOf::from(CryptoHash(Vec::new())), - )), - ), - CryptoHashOf::from(CryptoHash(Vec::new())), - None, - ), - signature: ThresholdSignature::fake(), - } - } - - /// Helper function to add a subnet record to a registry data provider. - fn add_subnet_record_to_provider( - data_provider: &ic_registry_proto_data_provider::ProtoRegistryDataProvider, - subnet_id: SubnetId, - version: RegistryVersion, - membership: Vec, - replica_version_id: &str, - recalled_replica_version_ids: Vec, - ) { - use ic_protobuf::registry::subnet::v1::SubnetRecord; - use ic_registry_keys::make_subnet_record_key; - - let subnet_record = SubnetRecord { - membership: membership.iter().map(|id| id.get().to_vec()).collect(), - replica_version_id: replica_version_id.to_string(), - recalled_replica_version_ids, - ..Default::default() - }; - - data_provider - .add(&make_subnet_record_key(subnet_id), version, Some(subnet_record)) - .unwrap(); - } - - /// Helper function to add a replica version record to a registry data provider. - fn add_replica_version_to_provider( - data_provider: &ic_registry_proto_data_provider::ProtoRegistryDataProvider, - replica_version: &ReplicaVersion, - version: RegistryVersion, - ) { - use ic_protobuf::registry::replica_version::v1::ReplicaVersionRecord; - use ic_registry_keys::make_replica_version_key; - - data_provider - .add( - &make_replica_version_key(replica_version.to_string()), - version, - Some(ReplicaVersionRecord::default()), - ) - .unwrap(); - } - - /// Helper struct for setting up upgrade tests with mock registry replication. - /// - /// This struct provides a convenient way to set up all the components needed for testing - /// the Upgrade module with a mock registry replicator. 
It handles: - /// - Creating a temporary directory for test files - /// - Setting up a remote registry data provider (simulating the NNS registry) - /// - Creating a local store and populating it with data from the remote provider - /// - Creating a mock registry replicator that syncs from remote to local - /// - Setting up all necessary components (CUP provider, metrics, process manager, etc.) - /// - /// # Example - /// ```ignore - /// let setup = UpgradeTestSetup::new_with_remote_provider( - /// node_id, - /// subnet_id, - /// cup_registry_version, - /// remote_data_provider, - /// logger, - /// ).await; - /// - /// setup.add_subnet_record(subnet_id, version, vec![node_id], "version_1.0.0", vec![]); - /// setup.add_replica_version(&version, version); - /// let upgrade = setup.create_upgrade(node_id, current_version, logger).await; - /// ``` - struct UpgradeTestSetup { - #[allow(dead_code)] - tmp_dir: tempfile::TempDir, - remote_data_provider: Arc, - #[allow(dead_code)] - local_store: Arc, - #[allow(dead_code)] - registry_client: Arc, - registry_replicator: Arc, - registry: Arc, - cup_provider: Arc, - metrics: Arc, - replica_process: Arc>>, - ic_binary_dir: std::path::PathBuf, - orchestrator_data_dir: std::path::PathBuf, - } - - impl UpgradeTestSetup { - /// Creates a new test setup with a mock registry replicator and a pre-populated remote provider. - /// - /// # Arguments - /// * `node_id` - The node ID for this test - /// * `_subnet_id` - The subnet ID for this test (unused but kept for API consistency) - /// * `cup_registry_version` - The registry version to use in the CUP - /// * `remote_data_provider` - Pre-populated remote registry data provider - /// * `logger` - The logger to use - /// - /// The setup creates: - /// - A local store populated up to `cup_registry_version` from the remote provider - /// - A mock registry replicator that syncs from remote to local - /// - All necessary components for creating an Upgrade instance - async fn new_with_remote_provider( - node_id: NodeId, - _subnet_id: SubnetId, - cup_registry_version: RegistryVersion, - remote_data_provider: Arc, - logger: ReplicaLogger, - ) -> Self { - use ic_crypto_test_utils_crypto_returning_ok::CryptoReturningOk; - use ic_interfaces_registry::RegistryDataProvider; - use ic_registry_client::client::RegistryClientImpl; - use ic_registry_local_store::{KeyMutation, LocalStoreImpl, LocalStoreWriter}; - use ic_registry_replicator::mock::MockRegistryReplicator; - use tempfile::tempdir; - - let tmp_dir = tempdir().unwrap(); - - // Set up local store and registry client - let local_store_path = tmp_dir.path().join("ic_registry_local_store"); - std::fs::create_dir_all(&local_store_path).unwrap(); - let local_store = Arc::new(LocalStoreImpl::new(local_store_path.clone())); - - // Populate the local store with data up to CUP version (simulating what's already local) - let changelog = remote_data_provider - .get_updates_since(RegistryVersion::from(0)) - .unwrap(); - let mut version_mutations: std::collections::BTreeMap> = - std::collections::BTreeMap::new(); - - for record in changelog { - if record.version <= cup_registry_version { - version_mutations - .entry(record.version) - .or_insert_with(Vec::new) - .push(KeyMutation { - key: record.key, - value: record.value, - }); - } - } - - // Local store requires continuous versions, so fill in any gaps with dummy mutations - for version in 1..=cup_registry_version.get() { - let version = RegistryVersion::from(version); - let mutations = 
version_mutations.remove(&version).unwrap_or_else(|| { - vec![KeyMutation { - key: format!("dummy_key_{}", version.get()), - value: Some(vec![0]), - }] - }); - local_store.store(version, mutations).unwrap(); - } - - // Create registry client from the local store - let registry_client = Arc::new(RegistryClientImpl::new(local_store.clone(), None)); - registry_client.fetch_and_start_polling().unwrap(); - - // Create mock registry replicator that will sync from remote to local - let registry_replicator = Arc::new(MockRegistryReplicator::new( - remote_data_provider.clone(), - local_store.clone(), - registry_client.clone(), - )); - - let registry = Arc::new(RegistryHelper::new( - node_id, - registry_client.clone() as Arc<_>, - logger.clone(), - )); - - // Set up CUP provider - let cup_dir = tmp_dir.path().join("cups"); - std::fs::create_dir_all(&cup_dir).unwrap(); - - let cup = make_cup_with_registry_version(Height::from(10), cup_registry_version); - let cup_proto = ic_protobuf::types::v1::CatchUpPackage::from(&cup); - let cup_file = cup_dir.join("cup.types.v1.CatchUpPackage.pb"); - std::fs::write(&cup_file, cup_proto.encode_to_vec()).unwrap(); - - let cup_provider = Arc::new(CatchUpPackageProvider::new( - registry.clone(), - cup_dir.clone(), - Arc::new(CryptoReturningOk::default()), - Arc::new(crate::catch_up_package_provider::tests::mock_tls_config()), - logger.clone(), - node_id, - )); - - let metrics = Arc::new(OrchestratorMetrics::new(&MetricsRegistry::new())); - let replica_process = Arc::new(Mutex::new(ProcessManager::new(logger.clone()))); - - let orchestrator_data_dir = tmp_dir.path().join("orchestrator"); - std::fs::create_dir_all(&orchestrator_data_dir).unwrap(); - - // Create a dummy replica binary so ensure_replica_is_running doesn't fail - let ic_binary_dir = tmp_dir.path().join("ic_binary"); - std::fs::create_dir_all(&ic_binary_dir).unwrap(); - let replica_binary = ic_binary_dir.join("replica"); - std::fs::write(&replica_binary, "#!/bin/sh\nsleep 1000\n").unwrap(); - #[cfg(unix)] - { - use std::os::unix::fs::PermissionsExt; - std::fs::set_permissions(&replica_binary, std::fs::Permissions::from_mode(0o755)) - .unwrap(); - } - - Self { - tmp_dir, - remote_data_provider, - local_store, - registry_client, - registry_replicator, - registry, - cup_provider, - metrics, - replica_process, - ic_binary_dir, - orchestrator_data_dir, - } - } - - /// Adds a subnet record to the remote registry at the specified version. - fn add_subnet_record( - &self, - subnet_id: SubnetId, - version: RegistryVersion, - membership: Vec, - replica_version_id: &str, - recalled_replica_version_ids: Vec, - ) { - use ic_protobuf::registry::subnet::v1::SubnetRecord; - use ic_registry_keys::make_subnet_record_key; - - let subnet_record = SubnetRecord { - membership: membership.iter().map(|id| id.get().to_vec()).collect(), - replica_version_id: replica_version_id.to_string(), - recalled_replica_version_ids, - ..Default::default() - }; - - self.remote_data_provider - .add(&make_subnet_record_key(subnet_id), version, Some(subnet_record)) - .unwrap(); - } - - /// Adds a replica version record to the remote registry at the specified version. 
- fn add_replica_version(&self, replica_version: &ReplicaVersion, version: RegistryVersion) { - use ic_protobuf::registry::replica_version::v1::ReplicaVersionRecord; - use ic_registry_keys::make_replica_version_key; - - self.remote_data_provider - .add( - &make_replica_version_key(replica_version.to_string()), - version, - Some(ReplicaVersionRecord::default()), - ) - .unwrap(); - } - - /// Adds dummy data for intermediate versions to ensure continuous versions. - fn fill_intermediate_versions(&self, from: RegistryVersion, to: RegistryVersion) { - for version in (from.get() + 1)..to.get() { - self.remote_data_provider - .add( - &format!("dummy_key_{}", version), - RegistryVersion::from(version), - Some(vec![0]), - ) - .unwrap(); - } - } - - /// Creates an Upgrade instance with the given parameters. - async fn create_upgrade( - &self, - node_id: NodeId, - current_version: ReplicaVersion, - logger: ReplicaLogger, - ) -> Upgrade { - Upgrade::new( - self.registry.clone(), - self.metrics.clone(), - self.replica_process.clone(), - self.cup_provider.clone(), - current_version, - self.tmp_dir.path().join("replica_config.json"), - node_id, - self.ic_binary_dir.clone(), - self.registry_replicator.clone(), - self.tmp_dir.path().join("release"), - logger, - self.orchestrator_data_dir.clone(), - None, - ) - .await - } - } - - /// Test that verifies the orchestrator correctly handles recalled replica versions. - /// - /// This test simulates a scenario where: - /// 1. A subnet is running version_1.0.0 (current version) - /// 2. The CUP (at registry version 10) indicates an upgrade to version_2.0.0 - /// 3. The remote registry (at version 20) marks version_2.0.0 as recalled - /// 4. The orchestrator replicates the remote registry and detects the recalled version - /// 5. The orchestrator should NOT upgrade to the recalled version - /// 6. The orchestrator should continue running the current version (version_1.0.0) - /// - /// The test uses a MockRegistryReplicator to simulate registry replication from a - /// "remote" NNS registry to a local store, allowing us to test the full replication - /// and recalled version detection flow. 
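+    /// Verifies that the orchestrator does not upgrade to a replica version that the
+    /// registry has since marked as recalled: the local CUP (registry version 2) points
+    /// at version_2.0.0, the replicated remote registry (version 3) recalls that version
+    /// and rolls the subnet back to version_1.0.0, so `check()` must keep the node on its
+    /// current version and report the `Assigned` flow.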
     #[tokio::test]
     async fn test_recalled_replica_version_prevents_upgrade() {
-        use ic_registry_proto_data_provider::ProtoRegistryDataProvider;
-        use ic_test_utilities_types::ids::{node_test_id, subnet_test_id};
-        use ic_types::{RegistryVersion, ReplicaVersion};
-        use std::sync::Arc;
-
         with_test_replica_logger(|log| async move {
             let current_version = ReplicaVersion::try_from("version_1.0.0").unwrap();
             let recalled_version = ReplicaVersion::try_from("version_2.0.0").unwrap();
             let node_id = node_test_id(1);
             let subnet_id = subnet_test_id(0);
 
-            let cup_registry_version = RegistryVersion::from(10);
-            let latest_registry_version = RegistryVersion::from(20);
+            let cup_registry_version = RegistryVersion::from(2);
+            let latest_registry_version = RegistryVersion::from(3);
 
-            // Create the remote data provider and populate it with data BEFORE creating the setup
+            // Set up initial registry state at version 1 with subnet running current_version (version_1.0.0)
             let remote_data_provider = Arc::new(ProtoRegistryDataProvider::new());
-
-            // Add initial subnet record at version 1 (required for the node to be assigned)
             add_subnet_record_to_provider(
                 &remote_data_provider,
                 subnet_id,
@@ -1968,9 +1812,14 @@ mod tests {
                 &current_version.to_string(),
                 vec![],
             );
-            add_replica_version_to_provider(&remote_data_provider, &current_version, RegistryVersion::from(1));
+            add_replica_version_to_provider(
+                &remote_data_provider,
+                &current_version,
+                RegistryVersion::from(1),
+            );
 
-            // At CUP version (10): subnet record points to recalled version with empty recalled list
+            // At CUP version (2), the subnet record indicates an upgrade to recalled_version (version_2.0.0)
+            // Note: At this point, the version is not yet marked as recalled
             add_subnet_record_to_provider(
                 &remote_data_provider,
                 subnet_id,
@@ -1979,39 +1828,45 @@ mod tests {
                 &recalled_version.to_string(),
                 vec![],
             );
-            add_replica_version_to_provider(&remote_data_provider, &recalled_version, cup_registry_version);
+            add_replica_version_to_provider(
+                &remote_data_provider,
+                &recalled_version,
+                cup_registry_version,
+            );
 
-            // Now create the setup with the pre-populated remote data provider
-            let setup = UpgradeTestSetup::new_with_remote_provider(
+            // Create the upgrade instance with local registry populated up to version 2
+            let (mut upgrade, remote, _tmp_dir) = create_upgrade_for_test(
                 node_id,
-                subnet_id,
                 cup_registry_version,
-                remote_data_provider.clone(),
+                remote_data_provider,
+                current_version.clone(),
                 log.clone(),
             )
             .await;
 
-            // At latest version (20): subnet record points back to current version and adds recalled version to recalled list
-            // This simulates the scenario where an upgrade was attempted to version_2.0.0, but then it was recalled
-            // and the subnet was rolled back to version_1.0.0
-            setup.add_subnet_record(
+            // At latest version (3), version_2.0.0 is marked as recalled in the remote registry
+            // and the subnet is rolled back to current_version (version_1.0.0)
+            // This simulates the scenario where version_2.0.0 was recalled
+            add_subnet_record_to_provider(
+                &remote,
                 subnet_id,
                 latest_registry_version,
                 vec![node_id],
                 &current_version.to_string(),
                 vec![recalled_version.to_string()],
             );
-            setup.add_replica_version(&recalled_version, latest_registry_version);
-
-            // Add dummy data for intermediate versions to ensure continuous versions
-            setup.fill_intermediate_versions(cup_registry_version, latest_registry_version);
-
-            let mut upgrade = setup.create_upgrade(node_id, current_version, log.clone()).await;
-
-            let result = upgrade.check().await;
-
-            assert!(
-                matches!(result, Ok(OrchestratorControlFlow::Assigned(_))),
+            add_replica_version_to_provider(&remote, &recalled_version, latest_registry_version);
+
+            // The orchestrator replicates the remote registry and detects the recalled version
+            // When upgrade.check() runs, it will replicate from remote (getting version 3 data)
+            // and should detect that recalled_version is in the recalled list
+            let result = upgrade.check().await.unwrap();
+
+            // The orchestrator should NOT upgrade to the recalled version
+            // Instead, it should continue running the current version (version_1.0.0)
+            assert_eq!(
                result,
+                OrchestratorControlFlow::Assigned(subnet_id),
                 "Expected Assigned flow when version is recalled, upgrade should not proceed"
             );
         })