Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
177 changes: 172 additions & 5 deletions rs/registry/canister/src/mutations/do_migrate_canisters.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,17 @@ impl Registry {
})
.collect::<Result<Vec<_>, _>>()?;

// Subnet deletion is (currently) a manual process and we don't expect to actually delete
// a subnet while a canister migration is ongoing (instead we expect to disable the
// canister migration API first and only delete a subnet once there's no ongoing canister
// migration). However, e.g., if a subnet is permanently stuck, it might be impossible to
// successfully complete an ongoing canister migration and in this case, we'd like to still
// be able to delete such a subnet. In this case, the outcome should be as if the migration
// completed before the subnet got deleted, i.e., the canister should be removed from the
// source subnet's canister ranges and not added to the (deleted) target subnet's canister
// ranges.
// Hence, we intentionally do not validate that the target subnet exists here.
let target_subnet_id = SubnetId::new(target_subnet_id);
Comment thread
Copilot marked this conversation as resolved.
self.get_subnet(target_subnet_id, self.latest_version())?;

Ok((canister_ids, target_subnet_id))
}
Expand Down Expand Up @@ -75,7 +84,8 @@ mod test {
#[test]
fn test_basic_migrate_canisters() {
// We create an invariant compliant registry, then we migrate a single canister
// to a new subnet, and we check that the Routing table has the correct ranges at that point.
// to a new subnet, and we check that the Routing table has the correct ranges
// at that point.

let mut registry = invariant_compliant_registry(0);
let system_subnet =
Expand All @@ -88,6 +98,9 @@ mod test {
let target_subnet_id =
registry_create_subnet_with_nodes(&mut registry, &node_ids_and_dkg_pks, &[0, 1, 2, 3]);

// The target subnet must already appear in the routing table for the migration to keep
// it assigned; give it a dedicated range (256–256) in addition to the system subnet's
// range.
let mut initial_routing_table = RoutingTable::new();
initial_routing_table
.insert(
Expand All @@ -98,6 +111,15 @@ mod test {
system_subnet.into(),
)
.unwrap();
initial_routing_table
.insert(
CanisterIdRange {
start: CanisterId::from_u64(256),
end: CanisterId::from_u64(256),
},
target_subnet_id,
)
.unwrap();

registry.apply_mutations_for_test(routing_table_into_registry_mutation(
&registry,
Expand All @@ -114,7 +136,6 @@ mod test {

let response = registry.do_migrate_canisters(request);

// currently just empty response.
assert_eq!(
MigrateCanistersResponse {
registry_version: registry.latest_version()
Expand All @@ -124,8 +145,8 @@ mod test {

let updated_routing_table = registry.get_routing_table_or_panic(registry.latest_version());

// We expect the range to be split into 3 ranges, with the middle one being a single-canister-id range
// and the others pointing at the system subnet as before.
// We expect the system subnet's range to be split around canister 100, with canister 100
// now assigned to the target subnet alongside its pre-existing range 256–256.
let mut expected_routing_table = RoutingTable::new();
expected_routing_table
.insert(
Expand Down Expand Up @@ -154,6 +175,152 @@ mod test {
system_subnet.into(),
)
.unwrap();
expected_routing_table
.insert(
CanisterIdRange {
start: CanisterId::from_u64(256),
end: CanisterId::from_u64(256),
},
target_subnet_id,
)
.unwrap();
assert_eq!(updated_routing_table, expected_routing_table);
}

#[test]
fn test_migrate_canisters_succeeds_if_source_subnet_deleted() {
// Verify that migrate_canisters succeeds when the source subnet has been deleted
// (has no key in subnet_list and no ranges in routing_table).
//
// We simulate this by using a subnet ID that is not in the routing table to begin with:
// the source subnet for canister 100 simply has no routing_table ranges (gap at 100).

let mut registry = invariant_compliant_registry(0);
let system_subnet_id =
PrincipalId::try_from(registry.get_subnet_list_record().subnets.first().unwrap())
.unwrap();

// Set up routing table: system subnet covers 0–99 and 101–255; canister 100 is
// unrouted, as if its source subnet has been deleted.
let mut initial_routing_table = RoutingTable::new();
initial_routing_table
.insert(
CanisterIdRange {
start: CanisterId::from_u64(0),
end: CanisterId::from_u64(99),
},
system_subnet_id.into(),
)
.unwrap();
initial_routing_table
.insert(
CanisterIdRange {
start: CanisterId::from_u64(101),
end: CanisterId::from_u64(255),
},
system_subnet_id.into(),
)
.unwrap();
registry.apply_mutations_for_test(routing_table_into_registry_mutation(
&registry,
initial_routing_table,
));

// Migrate canister 100 (not in routing table) to the system subnet.
// The system subnet is in the routing table, so the endpoint must succeed.
let response = registry.do_migrate_canisters(MigrateCanistersPayload {
canister_ids: vec![PrincipalId::from(CanisterId::from_u64(100))],
target_subnet_id: system_subnet_id,
});
assert_eq!(
MigrateCanistersResponse {
registry_version: registry.latest_version()
},
response
);

// Canister 100 was unrouted (gap); assigning it to the system subnet fills the gap,
// and optimize() merges 0–99, 100–100, 101–255 into a single range.
let updated_routing_table = registry.get_routing_table_or_panic(registry.latest_version());
let mut expected_routing_table = RoutingTable::new();
expected_routing_table
.insert(
CanisterIdRange {
start: CanisterId::from_u64(0),
end: CanisterId::from_u64(255),
},
system_subnet_id.into(),
)
.unwrap();
assert_eq!(updated_routing_table, expected_routing_table);
}

// Verify that migrate_canisters succeeds when the target subnet does not exist
// (has no key in subnet_list and no ranges in routing_table), e.g.,
// after the target subnet has been deleted.
#[test]
fn test_migrate_canisters_succeeds_if_target_subnet_does_not_exist() {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(We discussed in this in Slack, but just so that this lives in the record of GitHub...)

It seems like such requests should be blocked, not result in canister deletion, because that would be a surprising data loss, the worst kind of surprise.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Subnet deletion is (currently) a manual process and we don't expect to actually delete a subnet while a canister migration is ongoing (instead we expect to disable the canister migration API first and only delete a subnet once there's no ongoing canister migration). However, e.g., if a subnet is permanently stuck, it might be impossible to successfully complete an ongoing canister migration and in this case, we'd like to still be able to delete such a subnet. This leaves us with two options:

  • adjust canister migration so that it can gracefully abort canister migration to a subnet that is stuck/deleted (that would very likely result in substantial engineering overhead and risk);
  • adjust canister migration so that performing canister migration from/to a stuck/deleted subnet (transparently) behaves as if canister migration completed before the subnet got stuck/deleted.

The second option requires less engineering overhead which seems like a good trade-off given that it is rather unlikely that we'll need this in practice.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah. I didn't know all of that. It would be good for that information to be in in the comments on lines do_migrate_canister.rs:42-46.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let mut registry = invariant_compliant_registry(0);
let system_subnet_id =
PrincipalId::try_from(registry.get_subnet_list_record().subnets.first().unwrap())
.unwrap();

// Set up routing table: system subnet covers canister 0–255.
let mut initial_routing_table = RoutingTable::new();
initial_routing_table
.insert(
CanisterIdRange {
start: CanisterId::from_u64(0),
end: CanisterId::from_u64(255),
},
system_subnet_id.into(),
)
.unwrap();
registry.apply_mutations_for_test(routing_table_into_registry_mutation(
&registry,
initial_routing_table,
));

// We simulate a deleted subnet by using a subnet ID
// that is not in the routing table to begin with.
let non_existent_subnet_id = PrincipalId::new_user_test_id(42);

// Migrate canister 100 to a non-existent target subnet.
// The target subnet is not in the routing table, so the endpoint must succeed
// without adding any range for the target subnet.
let response = registry.do_migrate_canisters(MigrateCanistersPayload {
canister_ids: vec![PrincipalId::from(CanisterId::from_u64(100))],
target_subnet_id: non_existent_subnet_id,
});
assert_eq!(
MigrateCanistersResponse {
registry_version: registry.latest_version()
},
response
);

// Canister 100 was removed from the system subnet's range and not assigned to
// any other subnet (as target subnet is absent from routing table), leaving a gap at 100.
let updated_routing_table = registry.get_routing_table_or_panic(registry.latest_version());
let mut expected_routing_table = RoutingTable::new();
expected_routing_table
.insert(
CanisterIdRange {
start: CanisterId::from_u64(0),
end: CanisterId::from_u64(99),
},
system_subnet_id.into(),
)
.unwrap();
expected_routing_table
.insert(
CanisterIdRange {
start: CanisterId::from_u64(101),
end: CanisterId::from_u64(255),
},
system_subnet_id.into(),
)
.unwrap();
assert_eq!(updated_routing_table, expected_routing_table);
}
}
23 changes: 21 additions & 2 deletions rs/registry/canister/src/mutations/routing_table.rs
Original file line number Diff line number Diff line change
Expand Up @@ -296,15 +296,34 @@ impl Registry {
routing_table_into_registry_mutation(self, routing_table)
}

/// Subnet deletion is (currently) a manual process and we don't expect to actually delete
/// a subnet while a canister migration is ongoing (instead we expect to disable the
/// canister migration API first and only delete a subnet once there's no ongoing canister
/// migration). However, e.g., if a subnet is permanently stuck, it might be impossible to
/// successfully complete an ongoing canister migration and in this case, we'd like to still
/// be able to delete such a subnet. In this case, the outcome should be as if the migration
/// completed before the subnet got deleted.
pub fn migrate_canisters_to_subnet(
&self,
version: u64,
canister_ids: Vec<CanisterId>,
subnet_id: SubnetId,
target_subnet_id: SubnetId,
) -> Vec<RegistryMutation> {
self.modify_routing_table(version, |routing_table| {
let target_subnet_in_routing_table = routing_table
.iter()
.any(|(_range, sid)| *sid == target_subnet_id);
// If the source subnet has been deleted, assign_canister still
// successfully removes the canister from the source subnet's canister ranges.
// If the target subnet has been deleted, assign_canister introduces a new
// canister range for it, which is cleaned up below.
for canister_id in canister_ids {
routing_table.assign_canister(canister_id, subnet_id);
routing_table.assign_canister(canister_id, target_subnet_id);
}
if !target_subnet_in_routing_table {
// Target subnet has no canister ranges in the routing table (e.g., it has
// been deleted): remove any canister ranges just assigned to it.
routing_table.remove_subnet(target_subnet_id);
}
routing_table.optimize();
})
Expand Down
Loading