From 460bfa6fc92f1ff3cc0b161c81b91ab3c67a919e Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Tue, 16 Jun 2026 11:07:08 +0000 Subject: [PATCH 01/42] feat: cloud engine firewall --- rs/config/src/config.rs | 9 + rs/config/src/firewall.rs | 51 ++++++ .../config/tool/templates/ic.json5.template | 162 ++++++++++++++++++ rs/orchestrator/src/firewall.rs | 128 +++++++++++++- rs/orchestrator/src/registry_helper.rs | 6 +- ...nftables_assigned_cloud_engine.conf.golden | 17 +- rs/registry/helpers/src/node.rs | 26 ++- 7 files changed, 376 insertions(+), 23 deletions(-) diff --git a/rs/config/src/config.rs b/rs/config/src/config.rs index 60c3f4d8b0a1..ddb18c6eb17e 100644 --- a/rs/config/src/config.rs +++ b/rs/config/src/config.rs @@ -11,6 +11,7 @@ use crate::{ crypto::CryptoConfig, execution_environment::Config as HypervisorConfig, firewall::BoundaryNodeConfig as BoundaryNodeFirewallConfig, + firewall::CloudEngineConfig as CloudEngineFirewallConfig, firewall::ReplicaConfig as ReplicaFirewallConfig, http_handler::Config as HttpHandlerConfig, initial_ipv4_config::IPv4Config, @@ -51,6 +52,7 @@ pub struct Config { pub message_routing: MessageRoutingConfig, pub malicious_behavior: MaliciousBehavior, pub firewall: ReplicaFirewallConfig, + pub cloud_engine_firewall: CloudEngineFirewallConfig, pub boundary_node_firewall: BoundaryNodeFirewallConfig, pub registration: RegistrationConfig, pub nns_registry_replicator: NnsRegistryReplicatorConfig, @@ -79,6 +81,7 @@ pub struct ConfigOptional { pub message_routing: Option<MessageRoutingConfig>, pub malicious_behavior: Option<MaliciousBehavior>, pub firewall: Option<ReplicaFirewallConfig>, + pub cloud_engine_firewall: Option<CloudEngineFirewallConfig>, pub boundary_node_firewall: Option<BoundaryNodeFirewallConfig>, pub registration: Option<RegistrationConfig>, pub nns_registry_replicator: Option<NnsRegistryReplicatorConfig>, @@ -112,6 +115,9 @@ impl Config { message_routing: MessageRoutingConfig::default(), malicious_behavior: MaliciousBehavior::default(), firewall: ReplicaFirewallConfig::new(parent_dir.join("replica_firewall")), + cloud_engine_firewall: CloudEngineFirewallConfig::new( 
parent_dir.join("cloud_engine_firewall"), + ), boundary_node_firewall: BoundaryNodeFirewallConfig::new( parent_dir.join("boundary_node_firewall"), ), @@ -167,6 +173,9 @@ impl Config { message_routing: cfg.message_routing.unwrap_or(default.message_routing), malicious_behavior: cfg.malicious_behavior.unwrap_or(default.malicious_behavior), firewall: cfg.firewall.unwrap_or(default.firewall), + cloud_engine_firewall: cfg + .cloud_engine_firewall + .unwrap_or(default.cloud_engine_firewall), boundary_node_firewall: cfg .boundary_node_firewall .unwrap_or(default.boundary_node_firewall), diff --git a/rs/config/src/firewall.rs b/rs/config/src/firewall.rs index c8a0d2c76a3b..aa341447cec0 100644 --- a/rs/config/src/firewall.rs +++ b/rs/config/src/firewall.rs @@ -58,6 +58,57 @@ impl ReplicaConfig { } } +#[derive(Clone, PartialEq, Debug, Deserialize, Serialize)] +#[serde(rename_all = "snake_case")] +#[cfg_attr(test, derive(Arbitrary))] +pub struct CloudEngineConfig { + /// Path to use for storing state on the file system + #[cfg_attr(test, proptest(strategy = "any::().prop_map(PathBuf::from)"))] + pub config_file: PathBuf, + pub file_template: String, + pub ipv4_tcp_rule_template: String, + pub ipv6_tcp_rule_template: String, + pub ipv4_udp_rule_template: String, + pub ipv6_udp_rule_template: String, + pub ipv4_user_output_rule_template: String, + pub ipv6_user_output_rule_template: String, + #[cfg_attr(test, proptest(strategy = "any::().prop_map(|_x| vec![])"))] + pub default_rules: Vec, + /// Ports opened to whitelisted nodes in the network. + pub whitelisted_nodes_tcp_ports_whitelist: Vec, + pub whitelisted_nodes_udp_ports_whitelist: Vec, + /// Ports opened to all nodes in the network (including non-whitelisted). + pub all_nodes_tcp_ports_whitelist: Vec, + pub all_nodes_udp_ports_whitelist: Vec, + pub ports_for_http_adapter_blacklist: Vec, + /// We allow a maximum of `max_simultaneous_connections_per_ip_address` persistent connections to any ip address. 
+ /// Any ip address with `max_simultaneous_connections_per_ip_address` connections will be dropped if a new connection is attempted. + pub max_simultaneous_connections_per_ip_address: u32, +} + +impl CloudEngineConfig { + /// Create a CloudEngineConfig from a given path to the config file. + pub fn new(config_file: PathBuf) -> Self { + Self { + config_file, + file_template: "".to_string(), + ipv4_tcp_rule_template: "".to_string(), + ipv6_tcp_rule_template: "".to_string(), + ipv4_udp_rule_template: "".to_string(), + ipv6_udp_rule_template: "".to_string(), + ipv4_user_output_rule_template: "".to_string(), + ipv6_user_output_rule_template: "".to_string(), + default_rules: vec![], + whitelisted_nodes_tcp_ports_whitelist: vec![], + whitelisted_nodes_udp_ports_whitelist: vec![], + all_nodes_tcp_ports_whitelist: vec![], + all_nodes_udp_ports_whitelist: vec![], + ports_for_http_adapter_blacklist: vec![], + max_simultaneous_connections_per_ip_address: 0, + } + } +} + #[derive(Clone, PartialEq, Debug, Deserialize, Serialize)] #[serde(rename_all = "snake_case")] #[cfg_attr(test, derive(Arbitrary))] diff --git a/rs/ic_os/config/tool/templates/ic.json5.template b/rs/ic_os/config/tool/templates/ic.json5.template index 8a863eaac3e8..4a77df13e795 100644 --- a/rs/ic_os/config/tool/templates/ic.json5.template +++ b/rs/ic_os/config/tool/templates/ic.json5.template @@ -469,6 +469,168 @@ table ip6 filter {\n\ max_simultaneous_connections_per_ip_address: 400, }, + cloud_engine_firewall: { + config_file: "/run/ic-node/nftables-ruleset/nftables.conf", + file_template: "flush ruleset\n\ +\n\ +table filter {\n\ + define icmp_v4_types_accept = {\n\ + destination-unreachable,\n\ + time-exceeded,\n\ + echo-request,\n\ + echo-reply,\n\ + }\n\ +\n\ + set rate_limit {\n\ + type ipv4_addr\n\ + size 65535\n\ + flags dynamic\n\ + }\n\ +\n\ + set connection_limit {\n\ + type ipv4_addr\n\ + size 65535\n\ + flags dynamic\n\ + }\n\ +\n\ + set blackhole {\n\ + type ipv4_addr\n\ + size 65535\n\ + }\n\ 
+\n\ + counter rate_limit_v4_counter {}\n\ + counter connection_limit_v4_counter {}\n\ +\n\ + chain INPUT {\n\ + type filter hook input priority 0; policy drop;\n\ + iif lo accept\n\ + ip saddr @blackhole drop\n\ + ct state new add @rate_limit { ip saddr limit rate over 1000/minute burst 500 packets } counter name rate_limit_v4_counter drop\n\ + # Notes about the rule below:\n\ + # - The rule allows a maximum of <> persistent connections to any ip address.\n\ + # - The rule drops all new connections that goes over the configured limit.\n\ + ct state new add @connection_limit { ip saddr ct count over <> } counter name connection_limit_v4_counter drop\n\ + icmp type $icmp_v4_types_accept accept\n\ + <>\n\ + <>\n\ + ct state { invalid } drop\n\ + ct state { established, related } accept\n\ + ip saddr { 0.0.0.0-255.255.255.255 } ct state new tcp dport 80 accept\n\ + log prefix \"Drop - default policy: \"\n\ + }\n\ +\n\ + chain FORWARD {\n\ + type filter hook forward priority 0; policy drop;\n\ + }\n\ +\n\ + chain OUTPUT {\n\ + type filter hook output priority 0; policy accept;\n\ + meta skuid ic-http-adapter ip daddr { 127.0.0.0/8 } ct state { new } tcp dport { 1-19999 } reject # Block restricted localhost ic-http-adapter HTTPS access\n\ + <>\n\ + }\n\ +}\n\ +\n\ +table ip6 filter {\n\ + define icmp_v6_types_accept = {\n\ + destination-unreachable,\n\ + packet-too-big,\n\ + time-exceeded,\n\ + echo-request,\n\ + echo-reply,\n\ + nd-router-advert,\n\ + nd-neighbor-solicit,\n\ + nd-neighbor-advert,\n\ + }\n\ +\n\ + set rate_limit {\n\ + type ipv6_addr\n\ + size 65535\n\ + flags dynamic\n\ + }\n\ +\n\ + set connection_limit {\n\ + type ipv6_addr\n\ + size 65535\n\ + flags dynamic\n\ + }\n\ +\n\ + set blackhole6 {\n\ + type ipv6_addr\n\ + size 65535\n\ + }\n\ +\n\ + counter rate_limit_v6_counter {}\n\ + counter connection_limit_v6_counter {}\n\ +\n\ + chain INPUT {\n\ + type filter hook input priority 0; policy drop;\n\ + iif lo accept\n\ + ip6 saddr @blackhole6 drop\n\ + 
ct state { invalid } drop\n\ + ct state { established, related } accept\n\ + ct state new add @rate_limit { ip6 saddr limit rate over 1000/minute burst 500 packets } counter name rate_limit_v6_counter drop\n\ + # Notes about the rule below:\n\ + # - The rule allows a maximum of <> persistent connections to any ip6 address.\n\ + # - The rule drops all new connections that goes over the configured limit.\n\ + ct state new add @connection_limit { ip6 saddr ct count over <> } counter name connection_limit_v6_counter drop\n\ + icmpv6 type $icmp_v6_types_accept accept\n\ + # DHCPv6\n\ + udp dport { 546 } accept\n\ + # TCP ports required for GuestOS functionality\n\ + ip6 saddr { {{ ipv6_prefix }} } ct state { new } tcp dport { 7070, 9090, 9091, 9100, 9324, 19531, 19100, 19522 } accept\n\ + # Allow access from HostOS metrics-proxy so GuestOS metrics-proxy can proxy certain metrics to HostOS\n\ + ip6 saddr { hostos } ct state { new } tcp dport { 42372 } accept\n\ + ip6 saddr { ::-ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff } ct state new tcp dport 80 accept\n\ + # Custom templated rules\n\ + <>\n\ + <>\n\ + log prefix \"Drop - default policy: \"\n\ + }\n\ +\n\ + chain FORWARD {\n\ + type filter hook forward priority 0; policy drop;\n\ + }\n\ +\n\ + chain OUTPUT {\n\ + type filter hook output priority 0; policy accept;\n\ + meta skuid ic-http-adapter fib daddr type local ct state { new } tcp dport { 1-19999 } reject # Block restricted local addresses ic-http-adapter HTTPS access\n\ + meta skuid ic-http-adapter ip6 daddr { 2a00:fb01:400:42::/64, 2602:fb2b:110::/48, 2602:fb2b:100::/48, 2602:fb2b:120::/48 } ct state { new } tcp dport { 1-19999 } reject # Block restricted outbound ic-http-adapter HTTPS access\n\ + <>\n\ + }\n\ +}\n", + ipv4_tcp_rule_template: "ip saddr {<>} ct state { new } tcp dport {<>} <> # <>", + ipv6_tcp_rule_template: "ip6 saddr {<>} ct state { new } tcp dport {<>} <> # <>", + ipv4_udp_rule_template: "ip saddr {<>} udp dport {<>} <> # <>", + 
ipv6_udp_rule_template: "ip6 saddr {<>} udp dport {<>} <> # <>", + ipv4_user_output_rule_template: "meta skuid <> ip daddr {<>} ct state { new } tcp dport {<>} <> # <>", + ipv6_user_output_rule_template: "meta skuid <> ip6 daddr {<>} ct state { new } tcp dport {<>} <> # <>", + default_rules: [{ + ipv4_prefixes: [], + ipv6_prefixes: [ + "2602:fb2b:120::/48", + "2602:fb2b:100::/48", + "2602:fb2b:110::/48", + "2600:c00:2:100::/64", + "2001:4c08:2003:b09::/64", + "2600:3007:4401::/48", + "2a00:fb01:400::/56", + "2a00:fb01:400:200::/64", + "2a05:d01c:e2c:a700::/56", + "2a05:d01c:d9:2b00::/56", + ], + ports: [22, 2497, 4100, 7070, 8080, 9090, 9091, 9100, 9324, 19100, 19523, 19531], + action: 1, + comment: "Default rule from template", + direction: 1, + }], + whitelisted_nodes_tcp_ports_whitelist: [22, 8080], + whitelisted_nodes_udp_ports_whitelist: [4100], + all_nodes_tcp_ports_whitelist: [2497], + all_nodes_udp_ports_whitelist: [], + ports_for_http_adapter_blacklist: [22, 2497, 4100, 7070, 8080, 9090, 9091, 9100, 19100, 19523, 19531], + max_simultaneous_connections_per_ip_address: 1000, + }, + registration: { node_reward_type: "{{ node_reward_type }}", nns_url: "{{ nns_urls }}", diff --git a/rs/orchestrator/src/firewall.rs b/rs/orchestrator/src/firewall.rs index 0c32bca8e52f..25c8e073a9f3 100644 --- a/rs/orchestrator/src/firewall.rs +++ b/rs/orchestrator/src/firewall.rs @@ -5,7 +5,8 @@ use crate::{ registry_helper::RegistryHelper, }; use ic_config::firewall::{ - BoundaryNodeConfig as BoundaryNodeFirewallConfig, ReplicaConfig as ReplicaFirewallConfig, + BoundaryNodeConfig as BoundaryNodeFirewallConfig, + CloudEngineConfig as CloudEngineFirewallConfig, ReplicaConfig as ReplicaFirewallConfig, }; use ic_logger::{ReplicaLogger, debug, info, warn}; use ic_protobuf::registry::{ @@ -33,6 +34,7 @@ enum DataSource { /// The role of the node in the IC, i.e., whether it is acting as a replica or a boundary node. 
enum Role { AssignedReplica(SubnetId), + AssignedCloudEngine(SubnetId), UnassignedReplica, BoundaryNode, } @@ -48,6 +50,7 @@ pub(crate) struct Firewall { local_cup_reader: LocalCUPReader, logger: ReplicaLogger, replica_config: ReplicaFirewallConfig, + cloud_engine_config: CloudEngineFirewallConfig, boundary_node_config: BoundaryNodeFirewallConfig, compiled_config: String, last_applied_version: Arc>, @@ -62,6 +65,7 @@ impl Firewall { registry: Arc, metrics: Arc, replica_config: ReplicaFirewallConfig, + cloud_engine_config: CloudEngineFirewallConfig, boundary_node_config: BoundaryNodeFirewallConfig, local_cup_reader: LocalCUPReader, logger: ReplicaLogger, @@ -71,6 +75,7 @@ impl Firewall { metrics, local_cup_reader, replica_config, + cloud_engine_config, boundary_node_config, logger, compiled_config: Default::default(), @@ -107,11 +112,17 @@ impl Firewall { let maybe_boundary_node_record = self .registry .get_api_boundary_node_record(self.node_id, registry_version); - let maybe_subnet_id = self + let maybe_subnet_id_and_type = self .registry - .get_subnet_id_from_node_id(self.node_id, registry_version); - match (maybe_boundary_node_record, maybe_subnet_id) { - (_, Ok(Some(subnet_id))) => Ok(Role::AssignedReplica(subnet_id)), + .get_subnet_id_and_type_from_node_id(self.node_id, registry_version); + match (maybe_boundary_node_record, maybe_subnet_id_and_type) { + (_, Ok(Some((subnet_id, subnet_type)))) => match subnet_type { + SubnetType::Unspecified + | SubnetType::Application + | SubnetType::System + | SubnetType::VerifiedApplication => Ok(Role::AssignedReplica(subnet_id)), + SubnetType::CloudEngine => Ok(Role::AssignedCloudEngine(subnet_id)), + }, (Err(OrchestratorError::ApiBoundaryNodeMissingError(_, _)), Ok(None)) => { Ok(Role::UnassignedReplica) } @@ -219,6 +230,7 @@ impl Firewall { | NodeRewardType::Type3dot1 | NodeRewardType::Type1dot1, ) => true, + // TODO(CON-1720): consider accepting only from `Type4*` ( NodeRewardType::Type4 | NodeRewardType::Type4dot1 @@ 
-496,7 +508,7 @@ impl Firewall { let mut udp_rules = Vec::::new(); let firewall_scopes_to_fetch = match role { - Role::AssignedReplica(subnet_id) => vec![ + Role::AssignedReplica(subnet_id) | Role::AssignedCloudEngine(subnet_id) => vec![ FirewallRulesScope::Node(self.node_id), FirewallRulesScope::Subnet(subnet_id), FirewallRulesScope::ReplicaNodes, @@ -537,6 +549,9 @@ impl Firewall { Role::AssignedReplica(_) | Role::UnassignedReplica => { tcp_rules.append(&mut self.replica_config.default_rules.clone()); } + Role::AssignedCloudEngine(_) => { + tcp_rules.append(&mut self.cloud_engine_config.default_rules.clone()); + } Role::BoundaryNode => { tcp_rules.append(&mut self.boundary_node_config.default_rules.clone()); } @@ -549,14 +564,19 @@ impl Firewall { // Whitelisting for node IPs // In addition to any explicit firewall rules we might apply, we also ALWAYS whitelist // all nodes in the registry on the ports used by the protocol - Role::AssignedReplica(_) | Role::UnassignedReplica => { + Role::AssignedReplica(_) | Role::AssignedCloudEngine(_) | Role::UnassignedReplica => { let (more_tcp_rules, more_udp_rules) = self.get_node_whitelisting_rules(registry_version); // Insert the whitelisting rules at the top of the list (highest priority) tcp_rules = more_tcp_rules.into_iter().chain(tcp_rules).collect(); udp_rules = more_udp_rules.into_iter().chain(udp_rules).collect(); - self.replica_config.insert_rules(tcp_rules, udp_rules) + if matches!(role, Role::AssignedCloudEngine(_)) { + self.cloud_engine_config.insert_rules(tcp_rules, udp_rules) + } else { + // matches!(role, Role::AssignedReplica(_) | Role::UnassignedReplica) + self.replica_config.insert_rules(tcp_rules, udp_rules) + } } Role::BoundaryNode => { let socks_proxy_whitelisting_rules = @@ -607,6 +627,7 @@ impl Firewall { fn write_firewall_file(&self, content: &str, role: Role) -> OrchestratorResult<()> { let f = match role { Role::AssignedReplica(_) | Role::UnassignedReplica => &self.replica_config.config_file, + 
Role::AssignedCloudEngine(_) => &self.cloud_engine_config.config_file, Role::BoundaryNode => &self.boundary_node_config.config_file, }; write_string_using_tmp_file(f, content) @@ -710,6 +731,76 @@ impl FirewallConfigTemplate for ReplicaFirewallConfig { } } +impl FirewallConfigTemplate for CloudEngineFirewallConfig { + fn insert_rules(&self, tcp_rules: Vec, udp_rules: Vec) -> String { + self.file_template + .replace( + "<>", + &compile_rules( + &self.ipv4_tcp_rule_template, + &tcp_rules, + vec![ + FirewallRuleDirection::Inbound, + FirewallRuleDirection::Unspecified, + ], + ), + ) + .replace( + "<>", + &compile_rules( + &self.ipv4_udp_rule_template, + &udp_rules, + vec![ + FirewallRuleDirection::Inbound, + FirewallRuleDirection::Unspecified, + ], + ), + ) + .replace( + "<>", + &compile_rules( + &self.ipv6_tcp_rule_template, + &tcp_rules, + vec![ + FirewallRuleDirection::Inbound, + FirewallRuleDirection::Unspecified, + ], + ), + ) + .replace( + "<>", + &compile_rules( + &self.ipv6_udp_rule_template, + &udp_rules, + vec![ + FirewallRuleDirection::Inbound, + FirewallRuleDirection::Unspecified, + ], + ), + ) + .replace( + "<>", + &compile_rules( + &self.ipv4_user_output_rule_template, + &tcp_rules, + vec![FirewallRuleDirection::Outbound], + ), + ) + .replace( + "<>", + &compile_rules( + &self.ipv6_user_output_rule_template, + &tcp_rules, + vec![FirewallRuleDirection::Outbound], + ), + ) + .replace( + "<>", + &self.max_simultaneous_connections_per_ip_address.to_string(), + ) + } +} + impl FirewallConfigTemplate for BoundaryNodeFirewallConfig { fn insert_rules(&self, tcp_rules: Vec, udp_rules: Vec) -> String { self.file_template @@ -1045,7 +1136,7 @@ mod tests { #[test] fn nftables_golden_assigned_cloud_engine_test() { golden_test( - Role::AssignedReplica(SUBNET_ID), + Role::AssignedCloudEngine(SUBNET_ID), node_test_id(0), Some(NodeRewardType::Type4), NFTABLES_ASSIGNED_CLOUD_ENGINE_GOLDEN_BYTES, @@ -1160,12 +1251,17 @@ mod tests { replica_firewall_config .config_file 
.clone_from(&nftables_config_path); + let mut cloud_engine_firewall_config = config.cloud_engine_firewall.unwrap(); + cloud_engine_firewall_config + .config_file + .clone_from(&nftables_config_path); let mut boundary_node_firewall_config = config.boundary_node_firewall.unwrap(); boundary_node_firewall_config .config_file .clone_from(&nftables_config_path); let mut firewall = set_up_firewall_dependencies( replica_firewall_config, + cloud_engine_firewall_config, boundary_node_firewall_config, tmp_dir.path(), role, @@ -1236,6 +1332,7 @@ mod tests { /// Sets up all the necessary dependencies of the [`Firewall`] fn set_up_firewall_dependencies( config: ReplicaFirewallConfig, + cloud_engine_config: CloudEngineFirewallConfig, boundary_node_config: BoundaryNodeFirewallConfig, tmp_dir: &Path, role: Role, @@ -1257,6 +1354,7 @@ mod tests { registry_helper, Arc::new(OrchestratorMetrics::new(&ic_metrics::MetricsRegistry::new())), config, + cloud_engine_config, boundary_node_config, cup_reader, no_op_logger(), @@ -1477,6 +1575,18 @@ mod tests { ); subnet_ids.push(subnet_id); } + Role::AssignedCloudEngine(subnet_id) => { + let subnet_record = SubnetRecordBuilder::from(&[node]) + .with_subnet_type(SubnetType::CloudEngine) + .build(); + add_single_subnet_record( + &registry_data_provider, + registry_version.get(), + subnet_id, + subnet_record, + ); + subnet_ids.push(subnet_id); + } Role::BoundaryNode => { add_api_boundary_node_record(&registry_data_provider, registry_version, node); } diff --git a/rs/orchestrator/src/registry_helper.rs b/rs/orchestrator/src/registry_helper.rs index 848c4fac4362..bbcdd34c9a27 100644 --- a/rs/orchestrator/src/registry_helper.rs +++ b/rs/orchestrator/src/registry_helper.rs @@ -204,13 +204,13 @@ impl RegistryHelper { Ok(ids.unwrap_or_default()) } - pub(crate) fn get_subnet_id_from_node_id( + pub(crate) fn get_subnet_id_and_type_from_node_id( &self, node_id: NodeId, version: RegistryVersion, - ) -> OrchestratorResult<Option<SubnetId>> { + ) -> OrchestratorResult<Option<(SubnetId, SubnetType)>> { 
self.registry_client - .get_subnet_id_from_node_id(node_id, version) + .get_subnet_id_and_type_from_node_id(node_id, version) .map_err(OrchestratorError::RegistryClientError) } diff --git a/rs/orchestrator/testdata/nftables_assigned_cloud_engine.conf.golden b/rs/orchestrator/testdata/nftables_assigned_cloud_engine.conf.golden index c15ea8397ac8..88d8d33b72b5 100644 --- a/rs/orchestrator/testdata/nftables_assigned_cloud_engine.conf.golden +++ b/rs/orchestrator/testdata/nftables_assigned_cloud_engine.conf.golden @@ -20,12 +20,18 @@ table filter { flags dynamic } + set blackhole { + type ipv4_addr + size 65535 + } + counter rate_limit_v4_counter {} counter connection_limit_v4_counter {} chain INPUT { type filter hook input priority 0; policy drop; iif lo accept + ip saddr @blackhole drop ct state new add @rate_limit { ip saddr limit rate over 1000/minute burst 500 packets } counter name rate_limit_v4_counter drop # Notes about the rule below: # - The rule allows a maximum of 1000 persistent connections to any ip address. @@ -40,8 +46,8 @@ ip saddr {4.4.4.4} ct state { new } tcp dport {1004} accept # replica_nodes ip saddr {6.6.6.6} ct state { new } tcp dport {1006} accept # global ip saddr {1.1.1.1,3.0.0.3,3.0.0.4,3.0.0.5,3.0.0.6,3.0.0.7,4.0.0.4,4.0.0.5,4.0.0.6,4.0.0.7} udp dport {4100} accept # Automatic whitelisted nodes whitelisting ct state { invalid } drop - # - The rule accepts all established and related connections. It's required for the IPv4 connectivity check. 
ct state { established, related } accept + ip saddr { 0.0.0.0-255.255.255.255 } ct state new tcp dport 80 accept log prefix "Drop - default policy: " } @@ -80,12 +86,18 @@ table ip6 filter { flags dynamic } + set blackhole6 { + type ipv6_addr + size 65535 + } + counter rate_limit_v6_counter {} counter connection_limit_v6_counter {} chain INPUT { type filter hook input priority 0; policy drop; iif lo accept + ip6 saddr @blackhole6 drop ct state { invalid } drop ct state { established, related } accept ct state new add @rate_limit { ip6 saddr limit rate over 1000/minute burst 500 packets } counter name rate_limit_v6_counter drop @@ -97,9 +109,10 @@ table ip6 filter { # DHCPv6 udp dport { 546 } accept # TCP ports required for GuestOS functionality - ip6 saddr { ::/64 } ct state { new } tcp dport { 7070, 9090, 9091, 9100, 19531, 19100, 19522 } accept + ip6 saddr { ::/64 } ct state { new } tcp dport { 7070, 9090, 9091, 9100, 9324, 19531, 19100, 19522 } accept # Allow access from HostOS metrics-proxy so GuestOS metrics-proxy can proxy certain metrics to HostOS ip6 saddr { hostos } ct state { new } tcp dport { 42372 } accept + ip6 saddr { ::-ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff } ct state new tcp dport 80 accept # Custom templated rules ip6 saddr {2001:db8:85a3::8a2e:1370:7334,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e5,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e6,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e7,a4c2:7f91:3db6:1e8c:5a4f:cc92:b37:6e41} ct state { new } tcp dport {22,8080} accept # Automatic whitelisted nodes whitelisting ip6 saddr {2001:db8:85a3::8a2e:1370:7334,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e5,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e6,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e7,a4c2:7f91:3db6:1e8c:5a4f:cc92:b37:6e41} ct state { new } tcp dport {2497} accept # Automatic all nodes whitelisting diff --git a/rs/registry/helpers/src/node.rs b/rs/registry/helpers/src/node.rs index 69a9d9ee6570..17a656b8fe60 100644 --- a/rs/registry/helpers/src/node.rs +++ 
b/rs/registry/helpers/src/node.rs @@ -1,7 +1,8 @@ use crate::deserialize_registry_value; -use crate::subnet::{SubnetListRegistry, SubnetRegistry}; +use crate::subnet::{SubnetListRegistry, SubnetRegistry, get_node_ids_from_subnet_record}; use ic_interfaces_registry::{RegistryClient, RegistryClientResult}; pub use ic_protobuf::registry::node::v1::{ConnectionEndpoint, NodeRecord}; +use ic_protobuf::registry::subnet::v1::SubnetType; use ic_registry_keys::{NODE_RECORD_KEY_PREFIX, get_node_record_node_id, make_node_record_key}; use ic_types::registry::RegistryClientError; pub use ic_types::{NodeId, RegistryVersion, SubnetId}; @@ -13,11 +14,11 @@ pub trait NodeRegistry { version: RegistryVersion, ) -> RegistryClientResult; - fn get_subnet_id_from_node_id( + fn get_subnet_id_and_type_from_node_id( &self, node_id: NodeId, version: RegistryVersion, - ) -> RegistryClientResult; + ) -> RegistryClientResult<(SubnetId, SubnetType)>; /// Returns a list of node ids that contains the id of each node that exists /// at version `version`. @@ -34,17 +35,24 @@ impl NodeRegistry for T { deserialize_registry_value::(bytes) } - fn get_subnet_id_from_node_id( + fn get_subnet_id_and_type_from_node_id( &self, node_id: NodeId, version: RegistryVersion, - ) -> RegistryClientResult { + ) -> RegistryClientResult<(SubnetId, SubnetType)> { if let Some(subnet_ids) = self.get_subnet_ids(version)? { for subnet_id in subnet_ids { - if let Some(node_ids) = self.get_node_ids_on_subnet(subnet_id, version)? - && node_ids.contains(&node_id) - { - return Ok(Some(subnet_id)); + let Some(subnet_record) = self.get_subnet_record(subnet_id, version)? 
else { + continue; + }; + let node_ids = get_node_ids_from_subnet_record(&subnet_record).map_err(|err| { + RegistryClientError::DecodeError { + error: format!("get_node_ids_from_subnet_record() failed with {err}"), + } + })?; + let subnet_type = subnet_record.subnet_type(); + if node_ids.contains(&node_id) { + return Ok(Some((subnet_id, subnet_type))); } } } From 7a8ba86104ec101dc9dc39a6be7d8ee50dc8eb0a Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Tue, 16 Jun 2026 11:25:11 +0000 Subject: [PATCH 02/42] test: system test --- Cargo.lock | 2 + rs/tests/consensus/orchestrator/BUILD.bazel | 14 ++ rs/tests/consensus/orchestrator/Cargo.toml | 6 + .../cloud_engine_ic_gateway_test.rs | 179 ++++++++++++++++++ 4 files changed, 201 insertions(+) create mode 100644 rs/tests/consensus/orchestrator/cloud_engine_ic_gateway_test.rs diff --git a/Cargo.lock b/Cargo.lock index 324d36b00ebe..d9763c7e79dc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2739,6 +2739,8 @@ dependencies = [ "ic_consensus_system_test_utils", "ic_consensus_threshold_sig_system_test_utils", "registry-canister", + "reqwest", + "serde_cbor", "slog", "tempfile", "tokio", diff --git a/rs/tests/consensus/orchestrator/BUILD.bazel b/rs/tests/consensus/orchestrator/BUILD.bazel index fe5959af6959..920936097c74 100644 --- a/rs/tests/consensus/orchestrator/BUILD.bazel +++ b/rs/tests/consensus/orchestrator/BUILD.bazel @@ -119,6 +119,20 @@ system_test_nns( ], ) +system_test( + name = "cloud_engine_ic_gateway_test", + deps = [ + # Keep sorted. 
+ "//rs/registry/subnet_type", + "//rs/tests/driver:ic-system-test-driver", + "//rs/types/types", + "@crate_index//:anyhow", + "@crate_index//:reqwest", + "@crate_index//:serde_cbor", + "@crate_index//:slog", + ], +) + system_test_nns( name = "rotate_ecdsa_idkg_key_test", tags = [ diff --git a/rs/tests/consensus/orchestrator/Cargo.toml b/rs/tests/consensus/orchestrator/Cargo.toml index bc7b0870659c..fb10f58c9309 100644 --- a/rs/tests/consensus/orchestrator/Cargo.toml +++ b/rs/tests/consensus/orchestrator/Cargo.toml @@ -27,6 +27,8 @@ ic_consensus_system_test_node_registration_test_common = { path = "../node_regis ic_consensus_system_test_utils = { path = "../utils" } ic_consensus_threshold_sig_system_test_utils = { path = "../tecdsa/utils" } registry-canister = { path = "../../../registry/canister" } +reqwest = { workspace = true } +serde_cbor = { workspace = true } slog = { workspace = true } tempfile = { workspace = true } tokio = { workspace = true } @@ -36,6 +38,10 @@ url = { workspace = true } name = "ic-systest-cup-compatibility" path = "cup_compatibility_test.rs" +[[bin]] +name = "ic-systest-cloud-engine-ic-gateway-test" +path = "cloud_engine_ic_gateway_test.rs" + [[bin]] name = "ic-systest-node-reassignment-test" path = "node_reassignment_test.rs" diff --git a/rs/tests/consensus/orchestrator/cloud_engine_ic_gateway_test.rs b/rs/tests/consensus/orchestrator/cloud_engine_ic_gateway_test.rs new file mode 100644 index 000000000000..4bfadb591912 --- /dev/null +++ b/rs/tests/consensus/orchestrator/cloud_engine_ic_gateway_test.rs @@ -0,0 +1,179 @@ +/* tag::catalog[] +Title:: Cloud engine nodes are healthy on port 80 (served by ic-gateway). + +Goal:: +Verify that, for a cloud engine subnet, every node can be asserted healthy by +querying its public API status endpoint (`/api/v2/status`) on port 80 instead of +the replica's own port 8080. 
+ +Background:: +Unlike regular subnets, cloud engine nodes are self-contained: in addition to the +replica, the orchestrator spawns an `ic-gateway` process next to it on the very +same node, which forwards requests from port 80 to the replica's port 8080. + +Runbook:: +0. Set up an IC with one System (NNS) subnet and one cloud engine. +1. For each node in the cloud engine subnet, query `/api/v2/status` on port 80 + and assert that the replica reports `Healthy`. + +Success:: +Every cloud engine node reports a healthy status on port 80. + +end::catalog[] */ + +use anyhow::{Context, Result, bail}; +use ic_registry_subnet_type::SubnetType; +use ic_system_test_driver::driver::group::SystemTestGroup; +use ic_system_test_driver::driver::ic::{InternetComputer, Subnet}; +use ic_system_test_driver::driver::test_env::TestEnv; +use ic_system_test_driver::driver::test_env_api::{ + HasPublicApiUrl, HasTopologySnapshot, IcNodeContainer, IcNodeSnapshot, READY_WAIT_TIMEOUT, + RETRY_BACKOFF, +}; +use ic_system_test_driver::util::block_on; +use ic_system_test_driver::{retry_with_msg_async, systest}; +use ic_types::messages::{HttpStatusResponse, ReplicaHealthStatus}; +use slog::{Logger, info}; +use std::time::Duration; + +/// Port on which `ic-gateway` exposes the public API of a cloud engine node. +/// +/// Regular replicas serve their public API on port 8080. On cloud engine nodes +/// the orchestrator additionally spawns `ic-gateway` next to the replica, and +/// that process terminates the public API on port 80 (the port opened to the +/// network by the cloud-engine firewall rules). +const IC_GATEWAY_PORT: u16 = 80; + +/// Number of nodes in the cloud engine subnet under test. +const CLOUD_ENGINE_NODES: usize = 4; + +/// Per-request timeout when polling the status endpoint. 
+const STATUS_REQUEST_TIMEOUT: Duration = Duration::from_secs(10); + +fn setup(env: TestEnv) { + InternetComputer::new() + .with_api_boundary_nodes_playnet(1) + .add_fast_single_node_subnet(SubnetType::System) + .add_subnet(Subnet::fast(SubnetType::CloudEngine, CLOUD_ENGINE_NODES)) + .setup_and_start(&env) + .expect("failed to setup IC under test"); + + env.topology_snapshot().subnets().for_each(|subnet| { + subnet + .nodes() + .for_each(|node| node.await_status_is_healthy().unwrap()) + }); +} + +fn test(env: TestEnv) { + let logger = env.logger(); + let topology = env.topology_snapshot(); + + let cloud_engine_subnet = topology + .subnets() + .find(|subnet| subnet.subnet_type() == SubnetType::CloudEngine) + .expect("the topology must contain a cloud engine subnet"); + + let nodes: Vec = cloud_engine_subnet.nodes().collect(); + assert_eq!( + nodes.len(), + CLOUD_ENGINE_NODES, + "unexpected number of cloud engine nodes" + ); + + block_on(async { + for node in &nodes { + info!( + logger, + "Asserting that cloud engine node {} is healthy on port {} (ic-gateway)", + node.node_id, + IC_GATEWAY_PORT, + ); + // The standard `await_status_is_healthy` targets the replica on port + // 8080. For cloud engine nodes we instead assert health on port 80, + // which is served by the `ic-gateway` instance the orchestrator runs + // next to the replica. + await_healthy_on_ic_gateway(node, &logger) + .await + .unwrap_or_else(|err| { + panic!( + "cloud engine node {} is not healthy on port {}: {err}", + node.node_id, IC_GATEWAY_PORT, + ) + }); + } + }); + + info!( + logger, + "All {} cloud engine nodes are healthy on port {} (ic-gateway)", + nodes.len(), + IC_GATEWAY_PORT, + ); +} + +/// Polls `/api/v2/status` of `node` on [`IC_GATEWAY_PORT`] (the port served by +/// the co-located `ic-gateway`) until the replica reports itself `Healthy`. +/// +/// This mirrors the driver's standard health check (`status_is_healthy`), but +/// retargets it from port 8080 to port 80. 
+async fn await_healthy_on_ic_gateway(node: &IcNodeSnapshot, logger: &Logger) -> Result<()> { + // `get_public_url` yields the replica's URL on port 8080; rewrite the port to + // reach the co-located `ic-gateway` instead. + let mut url = node.get_public_url(); + url.set_port(Some(IC_GATEWAY_PORT)) + .map_err(|_| anyhow::anyhow!("failed to set port {IC_GATEWAY_PORT} on {url}"))?; + let status_url = url + .join("api/v2/status") + .expect("failed to join status path"); + + retry_with_msg_async!( + format!( + "awaiting healthy status of node {} on port {IC_GATEWAY_PORT}", + node.node_id + ), + logger, + READY_WAIT_TIMEOUT, + RETRY_BACKOFF, + || async { + let response = reqwest::Client::builder() + .timeout(STATUS_REQUEST_TIMEOUT) + .build() + .expect("cannot build a reqwest client") + .get(status_url.clone()) + .send() + .await?; + + let status = response.status(); + let body = response + .bytes() + .await + .expect("failed to convert a response to bytes") + .to_vec(); + if status.is_client_error() || status.is_server_error() { + bail!( + "status check failed with {status}: `{}`", + String::from_utf8_lossy(&body) + ); + } + + let cbor = serde_cbor::from_slice(&body).expect("response is not encoded as cbor"); + let status_response = serde_cbor::value::from_value::(cbor) + .expect("failed to deserialize a response to HttpStatusResponse"); + + match status_response.replica_health_status { + Some(ReplicaHealthStatus::Healthy) => Ok(()), + other => bail!("replica not healthy yet, status: {other:?}"), + } + } + ) + .await +} + +fn main() -> Result<()> { + SystemTestGroup::new() + .with_setup(setup) + .add_test(systest!(test)) + .execute_from_args()?; + Ok(()) +} From a70d67159d3d8cad11c98ed388505c390eb55b64 Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Tue, 16 Jun 2026 11:25:43 +0000 Subject: [PATCH 03/42] feat: run ic-boundary next to the replica --- ic-os/components/guestos/ic-replica.service | 2 +- .../components/guestos/share/ic-boundary.env | 2 +- 
rs/orchestrator/src/args.rs | 6 +- rs/orchestrator/src/boundary_node.rs | 144 ++--- rs/orchestrator/src/dashboard.rs | 18 +- rs/orchestrator/src/lib.rs | 1 + rs/orchestrator/src/metrics.rs | 7 +- rs/orchestrator/src/orchestrator.rs | 33 +- rs/orchestrator/src/process_manager.rs | 23 +- rs/orchestrator/src/processes.rs | 137 +++++ rs/orchestrator/src/registry_helper.rs | 10 + rs/orchestrator/src/upgrade.rs | 523 ++++++++++++------ rs/registry/admin/bin/main.rs | 2 +- rs/tests/driver/src/driver/group.rs | 1 + 14 files changed, 602 insertions(+), 307 deletions(-) create mode 100644 rs/orchestrator/src/processes.rs diff --git a/ic-os/components/guestos/ic-replica.service b/ic-os/components/guestos/ic-replica.service index 71cde05416f5..5d360ee96673 100644 --- a/ic-os/components/guestos/ic-replica.service +++ b/ic-os/components/guestos/ic-replica.service @@ -20,7 +20,7 @@ User=ic-replica Environment=RUST_BACKTRACE=1 Environment=RUST_MIN_STACK=8192000 -ExecStart=/opt/ic/bin/orchestrator --replica-binary-dir /var/lib/ic/data/images --cup-dir /var/lib/ic/data/cups --replica-config-file /run/ic-node/config/ic.json5 --enable-provisional-registration --ic-binary-directory /opt/ic/bin --orchestrator-data-directory /var/lib/ic/data/orchestrator --version-file /opt/ic/share/version.txt +ExecStart=/opt/ic/bin/orchestrator --replica-binary-dir /var/lib/ic/data/images --cup-dir /var/lib/ic/data/cups --replica-config-file /run/ic-node/config/ic.json5 --ic-boundary-env-file /opt/ic/share/ic-boundary.env --enable-provisional-registration --ic-binary-directory /opt/ic/bin --orchestrator-data-directory /var/lib/ic/data/orchestrator --version-file /opt/ic/share/version.txt LimitNOFILE=16777216 Restart=always RestartSec=10 diff --git a/ic-os/components/guestos/share/ic-boundary.env b/ic-os/components/guestos/share/ic-boundary.env index c23bfa9de7f2..1791e4a9905a 100644 --- a/ic-os/components/guestos/share/ic-boundary.env +++ b/ic-os/components/guestos/share/ic-boundary.env @@ -1,7 +1,7 @@ 
LISTEN_HTTPS_PORT="443" TLS_CERT_PATH="/var/lib/ic/data/ic-boundary-tls.crt" TLS_PKEY_PATH="/var/lib/ic/data/ic-boundary-tls.key" -TLS_ACME_CREDENTIALS_PATH="/var/lib/ic/data" +TLS_ACME_CREDENTIALS_PATH="/var/lib/ic/data" # TODO: confirm this ACME credentials path is still correct (expected to be) HTTP_CLIENT_TIMEOUT_CONNECT="3s" REGISTRY_LOCAL_STORE_PATH="/var/lib/ic/data/ic_registry_local_store" RATE_LIMIT_GENERIC_CANISTER_ID="u637p-5aaaa-aaaaq-qaaca-cai" diff --git a/rs/orchestrator/src/args.rs b/rs/orchestrator/src/args.rs index 4ea245933fa9..5cbb135b8875 100644 --- a/rs/orchestrator/src/args.rs +++ b/rs/orchestrator/src/args.rs @@ -27,11 +27,15 @@ pub struct OrchestratorArgs { #[clap(long)] pub(crate) replica_config_file: PathBuf, + /// The path to the IC boundary environment file + #[clap(long)] + pub(crate) ic_boundary_env_file: PathBuf, + /// The path to the Replica binary location containing the following in case /// of guest OS deployment: version.txt, manageboot.sh, replica, /// install-upgrade.sh #[clap(long)] - pub(crate) ic_binary_directory: Option, + pub(crate) ic_binary_directory: PathBuf, /// If not set, the default listen addr (0.0.0.0:[`PROMETHEUS_HTTP_PORT`]) /// will be used to export metrics. 
diff --git a/rs/orchestrator/src/boundary_node.rs b/rs/orchestrator/src/boundary_node.rs index 895592ec5337..a9bc7f1520a1 100644 --- a/rs/orchestrator/src/boundary_node.rs +++ b/rs/orchestrator/src/boundary_node.rs @@ -1,53 +1,25 @@ use crate::{ error::{OrchestratorError, OrchestratorResult}, metrics::OrchestratorMetrics, - process_manager::{Process, ProcessManager, ProcessManagerImpl}, + process_manager::{ProcessRunner, SingleProcessRunner}, + processes::IcBoundaryProcess, registry_helper::RegistryHelper, + upgrade::{start_ic_boundary, stop_ic_boundary}, }; use ic_config::crypto::CryptoConfig; -use ic_logger::{ReplicaLogger, info, warn}; +use ic_logger::{ReplicaLogger, warn}; use ic_types::{NodeId, ReplicaVersion}; use std::{ - collections::HashMap, - ffi::OsString, - path::{Path, PathBuf}, + path::PathBuf, sync::{Arc, Mutex}, }; -struct BoundaryNodeProcess { - version: ReplicaVersion, - binary: PathBuf, - args: Vec, - env: HashMap, -} - -impl Process for BoundaryNodeProcess { - const NAME: &'static str = "Boundary Node"; - - type Version = ReplicaVersion; - - fn get_version(&self) -> &Self::Version { - &self.version - } - - fn get_binary(&self) -> &Path { - &self.binary - } - - fn get_args(&self) -> &[OsString] { - &self.args - } - - fn get_env(&self) -> HashMap { - self.env.clone() - } -} - pub(crate) struct BoundaryNodeManager { registry: Arc, - _metrics: Arc, - process: Arc>>, + metrics: Arc, + process: Arc>>, ic_binary_dir: PathBuf, + ic_boundary_env_file: PathBuf, crypto_config: CryptoConfig, version: ReplicaVersion, logger: ReplicaLogger, @@ -62,14 +34,16 @@ impl BoundaryNodeManager { version: ReplicaVersion, node_id: NodeId, ic_binary_dir: PathBuf, + ic_boundary_env_file: PathBuf, crypto_config: CryptoConfig, logger: ReplicaLogger, ) -> Self { Self { registry, - _metrics: metrics, - process: Arc::new(Mutex::new(ProcessManagerImpl::new(logger.clone()))), + metrics, + process: Arc::new(Mutex::new(SingleProcessRunner::new(logger.clone()))), ic_binary_dir, + 
ic_boundary_env_file, crypto_config, version, logger, @@ -97,18 +71,20 @@ impl BoundaryNodeManager { } else { match self.registry.get_node_domain_name(registry_version) { Ok(Some(domain_name)) => { - let domain_name = Some(domain_name); + // let domain_name = Some(domain_name); // stop ic-boundary when the domain name changes and start it again. - if domain_name != self.domain_name { - if let Err(err) = self.ensure_boundary_node_stopped() { + if Some(&domain_name) != self.domain_name.as_ref() { + if let Err(err) = self.ensure_ic_boundary_stopped() { warn!(self.logger, "Failed to stop Boundary Node: {}", err); } - self.domain_name = domain_name; + self.domain_name = Some(domain_name.clone()); } // make sure the boundary node is running - if let Err(err) = self.ensure_boundary_node_running(&self.version) { + if let Err(err) = + self.ensure_ic_boundary_running(&self.version, domain_name) + { warn!(self.logger, "Failed to start Boundary Node: {}", err); } } @@ -118,7 +94,7 @@ impl BoundaryNodeManager { self.logger, "There is no domain associated with the node, while this is a requirement for the API boundary node. Shutting ic-boundary down." 
); - if let Err(err) = self.ensure_boundary_node_stopped() { + if let Err(err) = self.ensure_ic_boundary_stopped() { warn!(self.logger, "Failed to stop Boundary Node: {}", err); } self.domain_name = None; @@ -133,7 +109,7 @@ impl BoundaryNodeManager { } // BN should not be active Err(OrchestratorError::ApiBoundaryNodeMissingError(_, _)) => { - if let Err(err) = self.ensure_boundary_node_stopped() { + if let Err(err) = self.ensure_ic_boundary_stopped() { warn!(self.logger, "Failed to stop Boundary Node: {}", err); } } @@ -146,68 +122,24 @@ impl BoundaryNodeManager { } /// Start the current boundary node process - fn ensure_boundary_node_running(&self, version: &ReplicaVersion) -> OrchestratorResult<()> { - let mut process = self.process.lock().unwrap(); - - if process.is_running() { - return Ok(()); - } - info!(self.logger, "Starting new boundary node process"); - - let binary = self.ic_binary_dir.join("ic-boundary"); - - let domain_name = self - .domain_name - .as_ref() - .ok_or_else(|| OrchestratorError::DomainNameMissingError(self.node_id))?; - - let env = match env_file_reader::read_file("/opt/ic/share/ic-boundary.env") { - Ok(env) => env - .into_iter() - .map(|(k, v)| (OsString::from(k), OsString::from(v))) - .collect(), - Err(e) => { - return Err(OrchestratorError::IoError( - "unable to read ic-boundary environment variables".to_string(), - e, - )); - } - }; - - let args = vec![ - format!("--tls-hostname={}", domain_name).into(), - format!( - "--crypto-config={}", - serde_json::to_string(&self.crypto_config) - .map_err(OrchestratorError::SerializeCryptoConfigError)? 
- ) - .into(), - ]; - - process - .start(BoundaryNodeProcess { - version: version.clone(), - binary, - args, - env, - }) - .map_err(|e| { - OrchestratorError::IoError( - "Error when attempting to start new boundary node".into(), - e, - ) - }) + fn ensure_ic_boundary_running( + &self, + replica_version: &ReplicaVersion, + domain_name: String, + ) -> OrchestratorResult<()> { + start_ic_boundary( + &mut *self.process.lock().unwrap(), + &self.ic_binary_dir, + &self.ic_boundary_env_file, + replica_version, + domain_name, + &self.crypto_config, + &self.logger, + &self.metrics, + ) } - /// Stop the current boundary node process. - fn ensure_boundary_node_stopped(&self) -> OrchestratorResult<()> { - let mut process = self.process.lock().unwrap(); - if process.is_running() { - return process.stop().map_err(|e| { - OrchestratorError::IoError("Error when attempting to stop boundary node".into(), e) - }); - } - - Ok(()) + fn ensure_ic_boundary_stopped(&self) -> OrchestratorResult<()> { + stop_ic_boundary(&mut *self.process.lock().unwrap()) } } diff --git a/rs/orchestrator/src/dashboard.rs b/rs/orchestrator/src/dashboard.rs index 71c3b038be73..be121cabe2d3 100644 --- a/rs/orchestrator/src/dashboard.rs +++ b/rs/orchestrator/src/dashboard.rs @@ -1,7 +1,10 @@ use crate::{ - catch_up_package_provider::LocalCUPReader, orchestrator::SubnetAssignment, - process_manager::ProcessManager, registry_helper::RegistryHelper, - ssh_access_manager::SshAccessParameters, upgrade::ReplicaProcess, + catch_up_package_provider::LocalCUPReader, + orchestrator::SubnetAssignment, + process_manager::ProcessRunner, + processes::{ProcessManager, ReplicaProcess}, + registry_helper::RegistryHelper, + ssh_access_manager::SshAccessParameters, }; pub use ic_dashboard::Dashboard; use ic_logger::{ReplicaLogger, info, warn}; @@ -24,7 +27,7 @@ pub(crate) struct OrchestratorDashboard { last_applied_firewall_version: Arc>, last_applied_ipv4_config_version: Arc>, last_poll_certified_time: Arc>, - replica_process: 
Arc>>, + process_manager: Arc>, subnet_assignment: Arc>, replica_version: ReplicaVersion, hostos_version: Option, @@ -90,7 +93,7 @@ impl OrchestratorDashboard { last_applied_firewall_version: Arc>, last_applied_ipv4_config_version: Arc>, last_poll_certified_time: Arc>, - replica_process: Arc>>, + process_manager: Arc>, subnet_assignment: Arc>, replica_version: ReplicaVersion, hostos_version: Option, @@ -104,7 +107,7 @@ impl OrchestratorDashboard { last_applied_firewall_version, last_applied_ipv4_config_version, last_poll_certified_time, - replica_process, + process_manager, subnet_assignment, replica_version, hostos_version, @@ -135,7 +138,8 @@ impl OrchestratorDashboard { } fn get_pid(&self) -> String { - match self.replica_process.lock().unwrap().get_pid() { + let process_manager = self.process_manager.lock().unwrap(); + match >::get_pid(&process_manager) { Some(pid) => pid.to_string(), None => "None".to_string(), } diff --git a/rs/orchestrator/src/lib.rs b/rs/orchestrator/src/lib.rs index 035fc0772b1d..b796ff1ebd74 100644 --- a/rs/orchestrator/src/lib.rs +++ b/rs/orchestrator/src/lib.rs @@ -43,6 +43,7 @@ mod ipv4_network; mod metrics; pub mod orchestrator; mod process_manager; +mod processes; mod registration; mod registry_helper; mod signer; diff --git a/rs/orchestrator/src/metrics.rs b/rs/orchestrator/src/metrics.rs index 3da83c17dcd5..b343437fdaac 100644 --- a/rs/orchestrator/src/metrics.rs +++ b/rs/orchestrator/src/metrics.rs @@ -19,7 +19,7 @@ pub(crate) struct OrchestratorMetrics { pub(crate) critical_error_state_removal_failed: IntCounter, pub(crate) fstrim_duration: IntGauge, pub(crate) critical_error_task_failed: IntCounterVec, - pub(crate) replica_process_start_attempts: IntCounter, + pub(crate) processes_start_attempts: IntCounterVec, } #[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Debug, EnumIter, AsRefStr)] @@ -107,9 +107,10 @@ impl OrchestratorMetrics { grouped by the task name and the reason of the failure", &["task_name", "reason"], ), - 
replica_process_start_attempts: metrics_registry.int_counter( - "orchestrator_replica_process_start_attempts_total", + processes_start_attempts: metrics_registry.int_counter_vec( + "orchestrator_processes_start_attempts_total", "Number of times the replica process was attempted to be started", + &["process_name"], ), } } diff --git a/rs/orchestrator/src/orchestrator.rs b/rs/orchestrator/src/orchestrator.rs index 5d7824716203..818e078ec9ae 100644 --- a/rs/orchestrator/src/orchestrator.rs +++ b/rs/orchestrator/src/orchestrator.rs @@ -7,7 +7,8 @@ use crate::{ hostos_upgrade::HostosUpgrader, ipv4_network::Ipv4Configurator, metrics::OrchestratorMetrics, - process_manager::ProcessManagerImpl, + process_manager::SingleProcessRunner, + processes::ProcessManager, registration::NodeRegistration, registry_helper::RegistryHelper, ssh_access_manager::SshAccessManager, @@ -35,7 +36,7 @@ use std::{ convert::TryFrom, future::Future, net::{Ipv4Addr, Ipv6Addr, SocketAddr}, - path::{Path, PathBuf}, + path::Path, sync::{Arc, Mutex, RwLock}, thread, time::Duration, @@ -240,16 +241,17 @@ impl Orchestrator { Arc::clone(&crypto) as _, ); - let replica_process = Arc::new(Mutex::new(ProcessManagerImpl::new(logger.clone()))); - let ic_binary_directory = args - .ic_binary_directory - .as_ref() - .unwrap_or(&PathBuf::from("/tmp")) - .clone(); + let process_manager = Arc::new(Mutex::new(ProcessManager::new( + Box::new(SingleProcessRunner::new(logger.clone())), + Box::new(SingleProcessRunner::new(logger.clone())), + ))); + let ic_binary_directory = args.ic_binary_directory; let manageboot_runner = Box::new(ManagebootRunnerImpl::new( ic_binary_directory.join("manageboot.sh"), )); + let ic_boundary_env = args.ic_boundary_env_file; + // Create a read-only CUP reader that can be shared among Dashboard and Firewall // They read from the same file, so they'll see the same persisted CUP let local_cup_reader = LocalCUPReader::new(args.cup_dir.clone(), logger.clone()); @@ -281,7 +283,7 @@ impl 
Orchestrator { Upgrade::new( Arc::clone(®istry) as _, Arc::clone(&metrics), - Arc::clone(&replica_process) as _, + Arc::clone(&process_manager), manageboot_runner, cup_provider, Arc::clone(&subnet_assignment), @@ -289,11 +291,13 @@ impl Orchestrator { args.replica_config_file.clone(), node_id, ic_binary_directory.clone(), + ic_boundary_env.clone(), Arc::clone(®istry_replicator) as _, args.replica_binary_dir.clone(), logger.clone(), args.orchestrator_data_directory.clone(), disk_encryption_key_exchange_agent, + config.crypto.clone(), ) .await, ); @@ -330,6 +334,7 @@ impl Orchestrator { replica_version.clone(), node_id, ic_binary_directory.clone(), + ic_boundary_env, config.crypto.clone(), logger.clone(), ); @@ -339,6 +344,7 @@ impl Orchestrator { Arc::clone(®istry), Arc::clone(&metrics), config.firewall.clone(), + config.cloud_engine_firewall.clone(), config.boundary_node_firewall.clone(), local_cup_reader.clone(), logger.clone(), @@ -365,7 +371,7 @@ impl Orchestrator { firewall.get_last_applied_version(), ipv4_configurator.get_last_applied_version(), registry_replicator.get_latest_certified_time(), - replica_process, + process_manager, Arc::clone(&subnet_assignment), replica_version, hostos_version.ok(), @@ -483,10 +489,11 @@ impl Orchestrator { } info!(log, "Shut down the upgrade loop"); - if let Err(e) = upgrade.stop_replica() { - warn!(log, "Failed to stop the replica process: {e}"); + if let Err(e) = upgrade.stop_children() { + warn!(log, "Failed to stop child processes: {e}"); + } else { + info!(log, "Shut down the child processes"); } - info!(log, "Shut down the replica process"); } async fn hostos_upgrade_checks( diff --git a/rs/orchestrator/src/process_manager.rs b/rs/orchestrator/src/process_manager.rs index e0ed38088494..4a576382dd8c 100644 --- a/rs/orchestrator/src/process_manager.rs +++ b/rs/orchestrator/src/process_manager.rs @@ -8,17 +8,17 @@ use std::{ ffi::OsString, fmt::Debug, io::Result, - path::Path, + path::PathBuf, sync::{Arc, Mutex}, }; type 
PIDCell = Arc>>; -/// Captures a process that should be run by the [`ProcessManager`] +/// Captures a process that should be run by a [`ProcessRunner`] pub(crate) trait Process { /// Name of the type of process /// - /// Used only for logging purposes + /// Used for logging and metrics const NAME: &'static str; /// Version type of the process @@ -32,18 +32,17 @@ pub(crate) trait Process { fn get_version(&self) -> &Self::Version; /// Return the path to the binary of the [`Process`] - fn get_binary(&self) -> &Path; + fn get_binary(&self) -> PathBuf; /// Return the arguments passed to the [`Process`] - fn get_args(&self) -> &[OsString]; + fn get_args(&self) -> Vec; /// Return the env vars passed to the [`Process`] fn get_env(&self) -> HashMap; } -/// Trait for managing a single versioned [`Process`] -pub(crate) trait ProcessManager: Send { - /// Start the given process. +/// Trait for running a single versioned [`Process`] +pub(crate) trait ProcessRunner: Send { fn start(&mut self, process: P) -> Result<()>; /// Stop the currently running process. @@ -57,15 +56,15 @@ pub(crate) trait ProcessManager: Send { fn get_pid(&self) -> Option; } -/// A [`ProcessManagerImpl`] manages running a single versioned [`Process`] -pub(crate) struct ProcessManagerImpl { +/// Runs a single versioned [`Process`], implementing [`ProcessRunner
<P>
`]. +pub(crate) struct SingleProcessRunner { process: Option

, pid_cell: PIDCell, log: ReplicaLogger, join_handle: Option>, } -impl ProcessManagerImpl

{ +impl SingleProcessRunner

{ pub(crate) fn new(logger: ReplicaLogger) -> Self { Self { process: None, @@ -125,7 +124,7 @@ impl ProcessManagerImpl

{ } } -impl ProcessManager

for ProcessManagerImpl

{ +impl ProcessRunner

for SingleProcessRunner

{ fn start(&mut self, process: P) -> Result<()> { // Do nothing if we're already running a process with the requested version if let Some(current_version) = self.process.as_ref().map(|p| p.get_version()) diff --git a/rs/orchestrator/src/processes.rs b/rs/orchestrator/src/processes.rs new file mode 100644 index 000000000000..4ca8be18f3c0 --- /dev/null +++ b/rs/orchestrator/src/processes.rs @@ -0,0 +1,137 @@ +use crate::process_manager::{Process, ProcessRunner}; +use ic_types::{ReplicaVersion, SubnetId}; +use nix::unistd::Pid; +use std::{collections::HashMap, ffi::OsString, path::PathBuf}; + +// --------------------------------------------------------------------------- +// ReplicaProcess +// --------------------------------------------------------------------------- + +pub(crate) struct ReplicaProcess { + pub ic_binary_dir: PathBuf, + pub replica_version: ReplicaVersion, + pub cup_path: PathBuf, + pub config_file: PathBuf, + pub subnet_id: SubnetId, +} + +impl Process for ReplicaProcess { + const NAME: &'static str = "replica"; + type Version = ReplicaVersion; + + fn get_version(&self) -> &Self::Version { + &self.replica_version + } + fn get_binary(&self) -> PathBuf { + self.ic_binary_dir.join(Self::NAME) + } + fn get_args(&self) -> Vec { + vec![ + OsString::from("--replica-version"), + self.replica_version.to_string().into(), + OsString::from("--config-file"), + self.config_file.clone().into(), + OsString::from("--catch-up-package"), + self.cup_path.clone().into(), + OsString::from("--force-subnet"), + self.subnet_id.to_string().into(), + ] + } + fn get_env(&self) -> HashMap { + HashMap::new() + } +} + +// --------------------------------------------------------------------------- +// IcBoundaryProcess +// --------------------------------------------------------------------------- + +pub(crate) struct IcBoundaryProcess { + pub ic_binary_dir: PathBuf, + pub replica_version: ReplicaVersion, + pub domain_name: String, + pub crypto_config: String, + pub env: HashMap, 
+} + +impl Process for IcBoundaryProcess { + const NAME: &'static str = "ic-boundary"; + type Version = ReplicaVersion; + + fn get_version(&self) -> &Self::Version { + &self.replica_version + } + fn get_binary(&self) -> PathBuf { + self.ic_binary_dir.join(Self::NAME) + } + fn get_args(&self) -> Vec { + vec![ + OsString::from("--tls-hostname"), + self.domain_name.clone().into(), + OsString::from("--crypto-config"), + self.crypto_config.clone().into(), + ] + } + fn get_env(&self) -> HashMap { + self.env + .iter() + .map(|(k, v)| (OsString::from(k), OsString::from(v))) + .collect() + } +} + +// --------------------------------------------------------------------------- +// ProcessManager +// --------------------------------------------------------------------------- + +/// Manages all processes for this orchestrator. +/// +/// Owns one runner per process type. Each field is a `Box<dyn ProcessRunner<P>>` so the runner can +/// be swapped out in tests without spawning real processes. +/// Implements `ProcessRunner
<P>
` for each process type, delegating to the corresponding runner. +pub(crate) struct ProcessManager { + replica: Box>, + ic_boundary: Box>, +} + +impl ProcessManager { + pub(crate) fn new( + replica: Box>, + ic_boundary: Box>, + ) -> Self { + Self { + replica, + ic_boundary, + } + } +} + +impl ProcessRunner for ProcessManager { + fn start(&mut self, process: ReplicaProcess) -> std::io::Result<()> { + self.replica.start(process) + } + fn stop(&mut self) -> std::io::Result<()> { + self.replica.stop() + } + fn is_running(&self) -> bool { + self.replica.is_running() + } + fn get_pid(&self) -> Option { + self.replica.get_pid() + } +} + +impl ProcessRunner for ProcessManager { + fn start(&mut self, process: IcBoundaryProcess) -> std::io::Result<()> { + self.ic_boundary.start(process) + } + fn stop(&mut self) -> std::io::Result<()> { + self.ic_boundary.stop() + } + fn is_running(&self) -> bool { + self.ic_boundary.is_running() + } + fn get_pid(&self) -> Option { + self.ic_boundary.get_pid() + } +} diff --git a/rs/orchestrator/src/registry_helper.rs b/rs/orchestrator/src/registry_helper.rs index bbcdd34c9a27..de8ae7df7c5a 100644 --- a/rs/orchestrator/src/registry_helper.rs +++ b/rs/orchestrator/src/registry_helper.rs @@ -214,6 +214,16 @@ impl RegistryHelper { .map_err(OrchestratorError::RegistryClientError) } + pub(crate) fn get_subnet_type( + &self, + subnet_id: SubnetId, + version: RegistryVersion, + ) -> OrchestratorResult> { + self.registry_client + .get_subnet_type(subnet_id, version) + .map_err(OrchestratorError::RegistryClientError) + } + /// Get the replica version of the given subnet in the given registry /// version pub(crate) fn get_replica_version( diff --git a/rs/orchestrator/src/upgrade.rs b/rs/orchestrator/src/upgrade.rs index bf44008d01b7..0c9b20a18ce1 100644 --- a/rs/orchestrator/src/upgrade.rs +++ b/rs/orchestrator/src/upgrade.rs @@ -3,11 +3,13 @@ use crate::{ error::{OrchestratorError, OrchestratorResult}, metrics::OrchestratorMetrics, 
orchestrator::SubnetAssignment, - process_manager::{Process, ProcessManager}, + process_manager::{Process, ProcessRunner}, + processes::{IcBoundaryProcess, ProcessManager, ReplicaProcess}, registry_helper::RegistryHelper, }; use async_trait::async_trait; use guest_upgrade_server::DiskEncryptionKeyExchangeServerAgent; +use ic_config::crypto::CryptoConfig; use ic_consensus_dkg::get_vetkey_public_keys; use ic_crypto::get_master_public_key_from_transcript; use ic_http_utils::file_downloader::FileDownloader; @@ -18,7 +20,7 @@ use ic_image_upgrader::{ use ic_interfaces_registry::RegistryClient; use ic_logger::{ReplicaLogger, error, info, warn}; use ic_management_canister_types_private::MasterPublicKeyId; -use ic_protobuf::proxy::try_from_option_field; +use ic_protobuf::{proxy::try_from_option_field, registry::subnet::v1::SubnetType}; use ic_registry_client_helpers::{node::NodeRegistry, subnet::SubnetRegistry}; use ic_registry_local_store::{LocalStore, LocalStoreImpl}; use ic_registry_replicator::RegistryReplicator; @@ -31,8 +33,7 @@ use ic_types::{ }, }; use std::{ - collections::{BTreeMap, HashMap}, - ffi::OsString, + collections::BTreeMap, path::{Path, PathBuf}, sync::{Arc, Mutex, RwLock}, time::{Duration, Instant}, @@ -59,34 +60,6 @@ pub(crate) enum OrchestratorControlFlow { Stop, } -pub struct ReplicaProcess { - version: ReplicaVersion, - binary: PathBuf, - args: Vec, -} - -impl Process for ReplicaProcess { - const NAME: &'static str = "Replica"; - - type Version = ReplicaVersion; - - fn get_version(&self) -> &Self::Version { - &self.version - } - - fn get_binary(&self) -> &Path { - &self.binary - } - - fn get_args(&self) -> &[OsString] { - &self.args - } - - fn get_env(&self) -> HashMap { - HashMap::new() - } -} - /// Trait for the registry replicator used by the upgrade module. 
#[async_trait] #[cfg_attr(test, mockall::automock)] @@ -118,13 +91,14 @@ impl RegistryReplicatorForUpgrade for RegistryReplicator { pub(crate) struct Upgrade { pub registry: Arc, pub metrics: Arc, - replica_process: Arc>>, + process_manager: Arc>, manageboot_runner: Box, cup_provider: CatchUpPackageProvider, subnet_assignment: Arc>, replica_version: ReplicaVersion, replica_config_file: PathBuf, pub ic_binary_dir: PathBuf, + ic_boundary_env_file: PathBuf, pub image_path: PathBuf, registry_replicator: Arc, init_time: Instant, @@ -134,6 +108,7 @@ pub(crate) struct Upgrade { /// The replica version that is prepared by 'prepare_upgrade' to upgrade to. pub prepared_upgrade_version: Option, pub orchestrator_data_directory: PathBuf, + crypto_config: CryptoConfig, } impl Upgrade { @@ -141,7 +116,7 @@ impl Upgrade { pub(crate) async fn new( registry: Arc, metrics: Arc, - replica_process: Arc>>, + process_manager: Arc>, manageboot_runner: Box, cup_provider: CatchUpPackageProvider, subnet_assignment: Arc>, @@ -149,18 +124,20 @@ impl Upgrade { replica_config_file: PathBuf, node_id: NodeId, ic_binary_dir: PathBuf, + ic_boundary_env_file: PathBuf, registry_replicator: Arc, release_content_dir: PathBuf, logger: ReplicaLogger, orchestrator_data_directory: PathBuf, disk_encryption_key_exchange_agent: Option, + crypto_config: CryptoConfig, ) -> Self { let init_time = Instant::now(); let value = Self { registry, metrics, - replica_process, + process_manager, manageboot_runner, cup_provider, subnet_assignment, @@ -168,6 +145,7 @@ impl Upgrade { replica_version, replica_config_file, ic_binary_dir, + ic_boundary_env_file, image_path: release_content_dir.join("image.bin"), registry_replicator, init_time, @@ -175,6 +153,7 @@ impl Upgrade { prepared_upgrade_version: None, orchestrator_data_directory, disk_encryption_key_exchange_agent, + crypto_config, }; if let Err(e) = value.report_reboot_time() { warn!(logger, "Cannot report the reboot time: {}", e); @@ -207,8 +186,8 @@ impl Upgrade { 
/// 2. Detecting if a recovery is taking place (i.e. there is a CUP in the registry with higher /// height than any available). /// 3. Downloading and upgrading to a new replica version if necessary. - /// 4. Launching the replica process if assigned to a subnet. - /// 5. Stopping the replica process and removing the node state if leaving the subnet. + /// 4. Launching the child processes if assigned to a subnet. + /// 5. Stopping the child processes and removing the node state if leaving the subnet. pub(crate) async fn check(&mut self) -> OrchestratorResult { let latest_registry_version = self.registry.get_latest_version(); @@ -357,7 +336,7 @@ impl Upgrade { // We are no longer part of the subnet. *self.subnet_assignment.write().unwrap() = SubnetAssignment::Unassigned; - self.stop_replica()?; + self.stop_children()?; self.remove_state().await.inspect_err(|_| { self.metrics.critical_error_state_removal_failed.inc(); @@ -402,8 +381,12 @@ impl Upgrade { // If it is, we restart to pass the unsigned CUP to consensus. self.stop_replica_if_new_recovery_cup(&latest_cup, old_cup_height); - // This will start a new replica process if none is running. - self.ensure_replica_is_running(&self.replica_version, subnet_id)?; + // This will start new child processes if any of them is not running + self.ensure_children_are_running( + &self.replica_version, + subnet_id, + latest_registry_version, + )?; // This will trigger an image download if one is already scheduled but we did // not arrive at the corresponding CUP yet. @@ -456,7 +439,7 @@ impl Upgrade { ) .await .map_err(OrchestratorError::FileDownloadError)?; - if let Err(e) = self.stop_replica() { + if let Err(e) = self.stop_children() { // Even though we fail to stop the replica, we should still // replace the registry local store, so we simply issue a warning. 
warn!(self.logger, "Failed to stop replica with error {:?}", e); @@ -551,16 +534,6 @@ impl Upgrade { .map(|Rebooting| OrchestratorControlFlow::Stop) } - /// Stop the current replica process. - pub fn stop_replica(&self) -> OrchestratorResult<()> { - self.replica_process.lock().unwrap().stop().map_err(|e| { - OrchestratorError::IoError( - "Error when attempting to stop replica during upgrade".into(), - e, - ) - }) - } - /// Ensure that an upgrade to the given `new_replica_version` should be executed. /// Returns an error if the upgrade should be delayed or blocked, for example due to the new /// replica version being recalled. @@ -616,6 +589,17 @@ impl Upgrade { Ok(()) } + /// Stop the current replica process. + fn stop_replica(&self) -> OrchestratorResult<()> { + let mut process_manager = self.process_manager.lock().unwrap(); + >::stop(&mut process_manager).map_err(|e| { + OrchestratorError::IoError( + "Error when attempting to stop replica during upgrade".into(), + e, + ) + }) + } + /// Stop the replica if the given CUP is unsigned and higher than the given height. /// Without restart, consensus would reject the unsigned artifact. /// If stopping the replica fails, restart the current process instead. @@ -639,42 +623,142 @@ impl Upgrade { } } - /// Start the replica process if not running already + /// Stop all child processes, including the replica. 
+ pub fn stop_children(&self) -> OrchestratorResult<()> { + self.stop_replica()?; + stop_ic_boundary(&mut *self.process_manager.lock().unwrap())?; + + Ok(()) + } + fn ensure_replica_is_running( &self, replica_version: &ReplicaVersion, subnet_id: SubnetId, ) -> OrchestratorResult<()> { - if self.replica_process.lock().unwrap().is_running() { + let mut process_manager = self.process_manager.lock().unwrap(); + if >::is_running(&process_manager) { return Ok(()); } - info!(self.logger, "Starting new replica process"); - self.metrics.replica_process_start_attempts.inc(); - let cup_path = self.cup_provider.get_cup_path(); - let replica_binary = self.ic_binary_dir.join("replica"); - let cmd = vec![ - format!("--replica-version={}", replica_version.as_ref()).into(), - format!( - "--config-file={}", - self.replica_config_file.as_path().display() - ) - .into(), - format!("--catch-up-package={}", cup_path.as_path().display()).into(), - format!("--force-subnet={}", subnet_id).into(), - ]; - self.replica_process - .lock() - .unwrap() + info!(self.logger, "Starting new {} process", ReplicaProcess::NAME); + self.metrics + .processes_start_attempts + .with_label_values(&[ReplicaProcess::NAME]) + .inc(); + process_manager .start(ReplicaProcess { - version: replica_version.clone(), - binary: replica_binary, - args: cmd, + ic_binary_dir: self.ic_binary_dir.clone(), + replica_version: replica_version.clone(), + cup_path: self.cup_provider.get_cup_path(), + config_file: self.replica_config_file.clone(), + subnet_id, }) .map_err(|e| { OrchestratorError::IoError("Error when attempting to start new replica".into(), e) }) } + + fn ensure_ic_boundary_is_running( + &self, + replica_version: &ReplicaVersion, + registry_version: RegistryVersion, + ) -> OrchestratorResult<()> { + let domain_name = self + .registry + .get_node_domain_name(registry_version)? 
+ .ok_or_else(|| OrchestratorError::DomainNameMissingError(self.node_id))?; + + start_ic_boundary( + &mut *self.process_manager.lock().unwrap(), + &self.ic_binary_dir, + &self.ic_boundary_env_file, + replica_version, + domain_name, + &self.crypto_config, + &self.logger, + &self.metrics, + ) + } + + /// Start the child processes if not running already. + fn ensure_children_are_running( + &self, + replica_version: &ReplicaVersion, + subnet_id: SubnetId, + registry_version: RegistryVersion, + ) -> OrchestratorResult<()> { + // First ensure the replica is running before launching any other processes + self.ensure_replica_is_running(replica_version, subnet_id)?; + + match self.registry.get_subnet_type(subnet_id, registry_version)? { + None + | Some(SubnetType::Unspecified) + | Some(SubnetType::Application) + | Some(SubnetType::System) + | Some(SubnetType::VerifiedApplication) => {} + Some(SubnetType::CloudEngine) => { + // Cloud engines are self-contained and also run boundary nodes' stack + self.ensure_ic_boundary_is_running(replica_version, registry_version)?; + } + } + + Ok(()) + } +} + +pub(crate) fn start_ic_boundary( + process: &mut dyn ProcessRunner, + ic_binary_dir: &Path, + ic_boundary_env_file: &Path, + replica_version: &ReplicaVersion, + domain_name: String, + crypto_config: &CryptoConfig, + logger: &ReplicaLogger, + metrics: &OrchestratorMetrics, +) -> OrchestratorResult<()> { + if process.is_running() { + return Ok(()); + } + + let crypto_config = serde_json::to_string(&crypto_config) + .map_err(OrchestratorError::SerializeCryptoConfigError)?; + + let env = match env_file_reader::read_file(ic_boundary_env_file) { + Ok(env) => env, + Err(e) => { + return Err(OrchestratorError::IoError( + "unable to read ic-boundary environment variables".to_string(), + e, + )); + } + }; + + info!(logger, "Starting new {} process", IcBoundaryProcess::NAME); + metrics + .processes_start_attempts + .with_label_values(&[IcBoundaryProcess::NAME]) + .inc(); + process + 
.start(IcBoundaryProcess { + ic_binary_dir: ic_binary_dir.to_path_buf(), + replica_version: replica_version.clone(), + domain_name, + crypto_config, + env, + }) + .map_err(|e| { + OrchestratorError::IoError("Error when attempting to start ic-boundary".into(), e) + }) +} + +/// Stop the current boundary node process. +pub(crate) fn stop_ic_boundary( + process: &mut dyn ProcessRunner, +) -> OrchestratorResult<()> { + process.stop().map_err(|e| { + OrchestratorError::IoError("Error when attempting to stop ic-boundary".into(), e) + }) } #[async_trait] @@ -778,8 +862,10 @@ fn get_subnet_id(registry: &dyn RegistryClient, cup: &CatchUpPackage) -> Result< .iter() .next() .ok_or("No nodes in current transcript committee found")?; - match registry.get_subnet_id_from_node_id(*node_id, dkg_summary.registry_version) { - Ok(Some(subnet_id)) => Ok(subnet_id), + match registry + .get_subnet_id_and_type_from_node_id(*node_id, dkg_summary.registry_version) + { + Ok(Some((subnet_id, _subnet_type))) => Ok(subnet_id), other => Err(format!( "Couldn't get the subnet id from the registry for node {:?} at registry version {}: {:?}", node_id, dkg_summary.registry_version, other @@ -1174,13 +1260,13 @@ mod tests { use ic_protobuf::registry::subnet::v1::{CatchUpPackageContents, InitialNiDkgTranscriptRecord}; use ic_protobuf::registry::unassigned_nodes_config::v1::UnassignedNodesConfigRecord; use ic_protobuf::registry::{ - replica_version::v1::ReplicaVersionRecord, subnet::v1::SubnetRecord, + node::v1::NodeRecord, replica_version::v1::ReplicaVersionRecord, subnet::v1::SubnetRecord, }; use ic_protobuf::types::v1 as pb; use ic_registry_client_fake::FakeRegistryClient; use ic_registry_keys::{ - ROOT_SUBNET_ID_KEY, make_catch_up_package_contents_key, make_replica_version_key, - make_subnet_record_key, make_unassigned_nodes_config_record_key, + ROOT_SUBNET_ID_KEY, make_catch_up_package_contents_key, make_node_record_key, + make_replica_version_key, make_subnet_record_key, 
make_unassigned_nodes_config_record_key, }; use ic_registry_proto_data_provider::ProtoRegistryDataProvider; use ic_test_utilities_consensus::fake::{Fake, FakeContent}; @@ -1214,7 +1300,7 @@ mod tests { use rstest::rstest; use slog::Level; use std::{ - collections::{BTreeMap, BTreeSet}, + collections::{BTreeMap, BTreeSet, HashMap}, ffi::OsStr, path::Path, process::Output, @@ -1225,17 +1311,29 @@ mod tests { pub fn subnet_assignment(&self) -> SubnetAssignment { *self.subnet_assignment.read().unwrap() } + + pub fn is_replica_running(&self) -> bool { + let process_manager = self.process_manager.lock().unwrap(); + >::is_running(&process_manager) + } + + pub fn is_ic_boundary_running(&self) -> bool { + let process_manager = self.process_manager.lock().unwrap(); + >::is_running(&process_manager) + } } - pub(crate) struct FakeProcessManager { + /// Fake runner that tracks running state without spawning a real process. + /// Used as a drop-in for `SingleProcessRunner

` inside `ProcessManager`. + pub(crate) struct FakeRunner { running: bool, } - impl FakeProcessManager { + impl FakeRunner { pub(crate) fn new() -> Self { Self { running: false } } } - impl ProcessManager

for FakeProcessManager { + impl ProcessRunner

for FakeRunner { fn start(&mut self, _process: P) -> std::io::Result<()> { self.running = true; Ok(()) @@ -1370,6 +1468,21 @@ mod tests { make_cup_with_summary(height, summary_payload) } + fn add_node_record_to_provider( + data_provider: &ProtoRegistryDataProvider, + registry_version: RegistryVersion, + node_id: NodeId, + node_record: NodeRecord, + ) { + data_provider + .add( + &make_node_record_key(node_id), + registry_version, + Some(node_record), + ) + .unwrap(); + } + fn add_root_subnet_id_to_provider( data_provider: &ProtoRegistryDataProvider, registry_version: RegistryVersion, @@ -1464,12 +1577,14 @@ mod tests { data_provider: &ProtoRegistryDataProvider, registry_version: RegistryVersion, subnet_id: SubnetId, + subnet_type: SubnetType, membership: impl AsRef<[NodeId]>, replica_version: &ReplicaVersion, recalled_replica_versions: impl AsRef<[String]>, ) { let subnet_record = SubnetRecordBuilder::new() .with_membership(membership.as_ref()) + .with_subnet_type(subnet_type.try_into().unwrap()) .with_replica_version(replica_version.as_ref()) .with_recalled_replica_version_ids(recalled_replica_versions.as_ref()) .build(); @@ -1510,6 +1625,7 @@ mod tests { ) -> Upgrade { let UpgradeTestScenario { node_id, + subnet_type, current_replica_version, has_local_cup, initial_subnet_assignment, @@ -1528,19 +1644,41 @@ mod tests { let ic_binary_dir = dir.join("ic_binary"); std::fs::create_dir_all(&ic_binary_dir).unwrap(); + let ic_boundary_env_file = dir.join("ic-boundary.env"); + std::fs::write(&ic_boundary_env_file, b"TEST_KEY=TEST_VALUE").unwrap(); + + let crypto_config = CryptoConfig::default(); - let replica_process = Arc::new(Mutex::new(FakeProcessManager::new())); + let process_manager = Arc::new(Mutex::new(ProcessManager::new( + Box::new(FakeRunner::new()), + Box::new(FakeRunner::new()), + ))); // Start the replica process if the test scenario indicates so - if test_scenario.was_replica_process_started_previously() { - replica_process + if 
test_scenario.were_child_processes_started_previously() { + process_manager .lock() .unwrap() .start(ReplicaProcess { - version: current_replica_version.clone(), - binary: ic_binary_dir.join("replica"), - args: vec![], + ic_binary_dir: ic_binary_dir.clone(), + replica_version: current_replica_version.clone(), + cup_path: PathBuf::new(), + config_file: PathBuf::new(), + subnet_id: SUBNET_1, }) .unwrap(); + if matches!(subnet_type, SubnetType::CloudEngine) { + process_manager + .lock() + .unwrap() + .start(IcBoundaryProcess { + ic_binary_dir: ic_binary_dir.clone(), + replica_version: current_replica_version.clone(), + domain_name: "domain.name".to_string(), + crypto_config: serde_json::to_string(&crypto_config).unwrap(), + env: HashMap::new(), + }) + .unwrap(); + } } let manageboot_runner = Box::new(FakeManagebootRunner); @@ -1597,7 +1735,7 @@ mod tests { let mut upgrade_loop = Upgrade::new( registry, metrics, - replica_process, + process_manager, manageboot_runner, cup_provider, subnet_assignment, @@ -1605,11 +1743,13 @@ mod tests { replica_config_file, node_id, ic_binary_dir, + ic_boundary_env_file, Arc::new(registry_replicator), release_content_dir, logger, orchestrator_data_dir, None, + crypto_config, ) .await; @@ -1686,6 +1826,8 @@ mod tests { struct UpgradeTestScenario { // Node id of the node under test node_id: NodeId, + // Subnet type of the node under test + subnet_type: SubnetType, // Current replica version of the running orchestrator current_replica_version: ReplicaVersion, // Whether the node is assigned to a subnet (<=> presence of local CUP) @@ -1729,20 +1871,13 @@ mod tests { } // Starting with an `Assigned` subnet assignment *and* successfully persisting a local CUP - // should mean that the replica process was started by a previous iteration of the upgrade + // should mean that the child processes were started by a previous iteration of the upgrade // loop. 
- fn was_replica_process_started_previously(&self) -> bool { + fn were_child_processes_started_previously(&self) -> bool { matches!( self.initial_subnet_assignment, SubnetAssignment::Assigned(_) - ) - && self.has_local_cup.is_some() - // TODO(CON-1630): After mocking the process management, we can remove the condition below. - // For now, we should not start the replica if a recovery CUP exists (with higher height) - // since that would try to stop the replica process, which fails in the test - // environment. - && self.has_registry_cup.as_ref().map(|(cup, _)| cup.height) - <= self.has_local_cup.as_ref().map(|cup| cup.height) + ) && self.has_local_cup.is_some() } // Returns whether the upgrade loop should call @@ -1776,6 +1911,18 @@ mod tests { fn setup_registry(&self) -> Arc { let data_provider = Arc::new(ProtoRegistryDataProvider::new()); + let mut node_record = NodeRecord::default(); + if matches!(self.subnet_type, SubnetType::CloudEngine) { + // Nodes in Cloud engines must have a domain name to start ic-boundary + node_record.domain = Some("domain.name".to_string()); + } + add_node_record_to_provider( + &data_provider, + RegistryVersion::from(1), + self.node_id, + node_record, + ); + // NNS subnet let nns_subnet_id = SUBNET_42; add_root_subnet_id_to_provider(&data_provider, RegistryVersion::from(1), nns_subnet_id); @@ -1840,6 +1987,7 @@ mod tests { &data_provider, RegistryVersion::from(1), local_cup.subnet_id, + self.subnet_type, vec![self.node_id, other_node_id], &self.current_replica_version, &recalled_replica_versions, @@ -1855,6 +2003,7 @@ mod tests { &data_provider, upgrade.registry_version, local_cup.subnet_id, + self.subnet_type, vec![self.node_id, other_node_id], &upgrade.replica_version, &recalled_replica_versions, @@ -1867,6 +2016,7 @@ mod tests { &data_provider, *leaving_registry_version, local_cup.subnet_id, + self.subnet_type, vec![other_node_id], &self.current_replica_version, &recalled_replica_versions, @@ -1880,6 +2030,7 @@ mod tests { 
&data_provider, *leaving_registry_version, local_cup.subnet_id, + self.subnet_type, vec![other_node_id], &self.current_replica_version, &recalled_replica_versions, @@ -1889,6 +2040,7 @@ mod tests { &data_provider, upgrade.registry_version, local_cup.subnet_id, + self.subnet_type, vec![other_node_id], &upgrade.replica_version, &recalled_replica_versions, @@ -1903,6 +2055,7 @@ mod tests { &data_provider, *leaving_registry_version, local_cup.subnet_id, + self.subnet_type, vec![other_node_id], &upgrade.replica_version, &recalled_replica_versions, @@ -1914,6 +2067,7 @@ mod tests { &data_provider, upgrade.registry_version, local_cup.subnet_id, + self.subnet_type, vec![self.node_id, other_node_id], &upgrade.replica_version, &recalled_replica_versions, @@ -1923,6 +2077,7 @@ mod tests { &data_provider, *leaving_registry_version, local_cup.subnet_id, + self.subnet_type, vec![other_node_id], &upgrade.replica_version, &recalled_replica_versions, @@ -1956,6 +2111,7 @@ mod tests { &data_provider, *registry_cup_registry_version, registry_cup.subnet_id, + self.subnet_type, vec![self.node_id, other_node_id], &self.current_replica_version, &recalled_replica_versions, @@ -1969,6 +2125,7 @@ mod tests { &data_provider, *registry_cup_registry_version, registry_cup.subnet_id, + self.subnet_type, vec![self.node_id, other_node_id], &self.current_replica_version, &recalled_replica_versions, @@ -1978,6 +2135,7 @@ mod tests { &data_provider, upgrade.registry_version, registry_cup.subnet_id, + self.subnet_type, vec![self.node_id, other_node_id], &upgrade.replica_version, &recalled_replica_versions, @@ -1990,6 +2148,7 @@ mod tests { &data_provider, *registry_cup_registry_version, registry_cup.subnet_id, + self.subnet_type, vec![self.node_id, other_node_id], &upgrade.replica_version, &recalled_replica_versions, @@ -2007,6 +2166,7 @@ mod tests { &data_provider, *registry_cup_registry_version, registry_cup.subnet_id, + self.subnet_type, vec![self.node_id, other_node_id], &upgrade.replica_version, 
&recalled_replica_versions, @@ -2383,43 +2543,66 @@ mod tests { assert_has_removed_state(); } - // Returns whether the replica process should be running after the upgrade loop. - // Additionally asserts whether the orchestrator has started a *new* replica process - fn should_replica_process_be_running(&self, logs: Vec) -> bool { - let needle_has_started_new_process = "Starting new replica process"; + // Returns whether the child processes should be running after the upgrade loop. + // Additionally asserts whether the orchestrator has started *new* child processes + fn should_child_processes_be_running(&self, logs: Vec) -> bool { let logs_assert = LogEntriesAssert::assert_that(logs); - let assert_has_started_new_process = || { - logs_assert - .has_only_one_message_containing(&Level::Info, needle_has_started_new_process); + let assert_has_started = |process_name: &str| { + logs_assert.has_only_one_message_containing( + &Level::Info, + &format!("Starting new {} process", process_name), + ); }; - let assert_has_not_started_new_process = || { + let assert_has_not_started = |process_name: &str| { logs_assert.has_exactly_n_messages_containing( 0, &Level::Info, - needle_has_started_new_process, + &format!("Starting new {} process", process_name), ); }; + let assert_has_started_new_processes = || { + if matches!(self.subnet_type, SubnetType::CloudEngine) { + assert_has_started(ReplicaProcess::NAME); + assert_has_started(IcBoundaryProcess::NAME); + } else { + assert_has_started(ReplicaProcess::NAME); + }; + }; + let assert_has_not_started_new_processes = || { + assert_has_not_started(ReplicaProcess::NAME); + assert_has_not_started(IcBoundaryProcess::NAME); + }; match &self.has_local_cup { Some(local_cup) => { - // If the initial subnet assignment was already `Assigned`, then the replica - // process should have been started by the previous iteration of the upgrade - // loop and should not be started again. 
+ // If the child processes were started by the previous iteration of the upgrade + // loop, they should not be started again. // Though, if there is a recovery CUP of a higher height than the local CUP, - // then the replica process should be started again to pick up the new CUP. - let assert_has_started_new_process_if_necessary = - || match (&self.has_registry_cup, &self.initial_subnet_assignment) { - (Some((registry_cup, _)), _) - if registry_cup.height >= local_cup.height => - { - assert_has_started_new_process(); - } - (_, SubnetAssignment::Assigned(_)) => { - assert_has_not_started_new_process(); - } - (_, SubnetAssignment::Unassigned | SubnetAssignment::Unknown) => { - assert_has_started_new_process(); - } - }; + // then the *replica* process (not all children) should be started again to pick + // up the new CUP. + let assert_has_started_new_processes_if_necessary = || match ( + &self.has_registry_cup, + self.were_child_processes_started_previously(), + &self.subnet_type, + ) { + (Some((registry_cup, _)), false, SubnetType::CloudEngine) + if registry_cup.height >= local_cup.height => + { + assert_has_started(ReplicaProcess::NAME); + assert_has_started(IcBoundaryProcess::NAME); + } + (Some((registry_cup, _)), _, _) + if registry_cup.height >= local_cup.height => + { + assert_has_started(ReplicaProcess::NAME); + assert_has_not_started(IcBoundaryProcess::NAME); + } + (_, true, _) => { + assert_has_not_started_new_processes(); + } + (_, false, _) => { + assert_has_started_new_processes(); + } + }; let highest_height_cup = local_cup.max_height(self.has_registry_cup.as_ref().map(|(cup, _)| cup)); @@ -2428,43 +2611,43 @@ mod tests { (None, None) => { // Not leaving, so the replica process should be started only if // necessary - assert_has_started_new_process_if_necessary(); + assert_has_started_new_processes_if_necessary(); true } (None, Some(upgrade)) if highest_height_cup.registry_version < upgrade.registry_version => { // An upgrade is scheduled but the CUP's 
registry version has not - // reached the upgrade registry version yet, so the replica process + // reached the upgrade registry version yet, so the child processes // should be started only if not already running - assert_has_started_new_process_if_necessary(); + assert_has_started_new_processes_if_necessary(); true } (None, Some(_upgrade)) => { // An upgrade is scheduled and the CUP's registry version has reached // the upgrade registry version. // Regardless of whether the upgrade version was recalled or not, note - // that the implementation does not stop the replica process, it either + // that the implementation does not stop the child processes, it either // returns an error (if recalled) or just issues a reboot. Thus, in this - // unit test, we will assert that the replica process is in the same + // unit test, we will assert that the child processes are in the same // state as before. - assert_has_not_started_new_process(); - self.was_replica_process_started_previously() + assert_has_not_started_new_processes(); + self.were_child_processes_started_previously() } (Some(leaving_registry_version), None) if &highest_height_cup.registry_version < leaving_registry_version => { // The node is leaving the subnet, but the CUP's registry version has - // not reached the leaving registry version yet, so the replica process + // not reached the leaving registry version yet, so the child processes // should be started only if not already running - assert_has_started_new_process_if_necessary(); + assert_has_started_new_processes_if_necessary(); true } (Some(_leaving_registry_version), None) => { // The node is leaving the subnet and the CUP's registry version has // reached the leaving registry version, so we are expected to stop the - // replica process - assert_has_not_started_new_process(); + // child processes + assert_has_not_started_new_processes(); false } (Some(leaving_registry_version), Some(upgrade)) @@ -2473,9 +2656,9 @@ mod tests { < upgrade.registry_version 
=> { // Both leaving and upgrade are scheduled, but the CUP's registry version - // has not reached either of them yet, so the replica process should be + // has not reached either of them yet, so the child processes should be // started only if not already running - assert_has_started_new_process_if_necessary(); + assert_has_started_new_processes_if_necessary(); true } (Some(leaving_registry_version), Some(_upgrade)) @@ -2484,20 +2667,20 @@ mod tests { // An upgrade is scheduled and the CUP's registry version has reached // the upgrade registry version. // Regardless of whether the upgrade version was recalled or not, note - // that the implementation does not stop the replica process, it either + // that the implementation does not stop the child processes, it either // returns an error (if recalled) or just issues a reboot. Thus, in this - // unit test, we will assert that the replica process is in the same + // unit test, we will assert that the child processes are in the same // state as before. - assert_has_not_started_new_process(); - self.was_replica_process_started_previously() + assert_has_not_started_new_processes(); + self.were_child_processes_started_previously() } (Some(_leaving_registry_version), Some(_upgrade)) => { // Both leaving and upgrade are scheduled, and the CUP's registry // version has reached the leaving registry version.
Regardless of // whether the upgrade registry version has been reached, leaving the - // subnet takes precedence, and we are expected to stop the replica - // process - assert_has_not_started_new_process(); + // subnet takes precedence, and we are expected to stop the child + // processes + assert_has_not_started_new_processes(); false } } @@ -2505,8 +2688,8 @@ mod tests { None => { match &self.has_registry_cup { None => { - // Being unassigned, the replica process should not be running - assert_has_not_started_new_process(); + // Being unassigned, the child processes should not be running + assert_has_not_started_new_processes(); false } Some((registry_cup, _)) => { @@ -2515,26 +2698,26 @@ mod tests { // But there could be an upgrade scheduled in the meantime match &self.upgrade_to { None => { - // No upgrade is scheduled, so the replica process should be + // No upgrade is scheduled, so the child processes should be // *started* - assert_has_started_new_process(); + assert_has_started_new_processes(); true } Some(upgrade) if registry_cup.registry_version < upgrade.registry_version => { // An upgrade is scheduled but the CUP's registry version has - // not reached the upgrade registry version yet, so the replica - // process should be *started* - assert_has_started_new_process(); + // not reached the upgrade registry version yet, so the child + // processes should be *started* + assert_has_started_new_processes(); true } Some(_upgrade) => { // This scenario can be interpreted as the unassigned node // having a different replica version than the subnet's - // We should upgrade before actually starting the replica - // process (or return early if the version was recalled). - assert_has_not_started_new_process(); + // We should upgrade before actually starting the child + // processes (or return early if the version was recalled). 
+ assert_has_not_started_new_processes(); false } } @@ -2577,8 +2760,8 @@ mod tests { // TODO(CON-1631): introduce distinct enum variants to better compare errors assert!(actual_error.contains(expected_error)); } - _ => { - panic!("Upgrade loop flow result does not match expected flow"); + (actual, expected) => { + panic!("Expected flow result {expected:?}, but got {actual:?}. Logs: {logs:#?}"); } } @@ -2610,8 +2793,8 @@ mod tests { // Check whether the replica process is running or not assert_eq!( - upgrade_loop.replica_process.lock().unwrap().is_running(), - test_scenario.should_replica_process_be_running(logs), + upgrade_loop.is_replica_running(), + test_scenario.should_child_processes_be_running(logs), ); // Asserting further invariants: @@ -2648,13 +2831,25 @@ mod tests { // `Assigned` AND (EITHER we are not upgrading OR the replica was // already started beforehand) assert_eq!( - upgrade_loop.replica_process.lock().unwrap().is_running(), + upgrade_loop.is_replica_running(), matches!(new_subnet_assignment, SubnetAssignment::Assigned(_)) && (matches!( flow_result, Ok(OrchestratorControlFlow::Assigned(_)) | Ok(OrchestratorControlFlow::Leaving(_)) - ) || test_scenario.was_replica_process_started_previously()) + ) || test_scenario.were_child_processes_started_previously()) + ); + // - The ic-boundary process is running <=> same as the replica process, but only for Cloud + // Engine subnets + assert_eq!( + upgrade_loop.is_ic_boundary_running(), + matches!(test_scenario.subnet_type, SubnetType::CloudEngine) + && matches!(new_subnet_assignment, SubnetAssignment::Assigned(_)) + && (matches!( + flow_result, + Ok(OrchestratorControlFlow::Assigned(_)) + | Ok(OrchestratorControlFlow::Leaving(_)) + ) || test_scenario.were_child_processes_started_previously()) ); // - As an assigned node: if new_subnet_assignment != SubnetAssignment::Unassigned { @@ -2684,6 +2879,7 @@ mod tests { #[tokio::test] async fn test_upgrade_scenarios( #[values(NODE_1)] node_id: NodeId, + 
#[values(SubnetType::Application, SubnetType::CloudEngine)] subnet_type: SubnetType, #[values(ReplicaVersion::try_from("replica_version_0.1").unwrap())] current_replica_version: ReplicaVersion, #[values( None, @@ -2764,6 +2960,7 @@ mod tests { let test_scenario = UpgradeTestScenario { node_id, + subnet_type, current_replica_version, has_local_cup, has_registry_cup, @@ -2810,6 +3007,7 @@ mod tests { async fn test_ignore_recalled_versions_if_nns() { let test_scenario = UpgradeTestScenario { node_id: NODE_1, + subnet_type: SubnetType::System, current_replica_version: ReplicaVersion::try_from("replica_version_0.1").unwrap(), has_local_cup: Some(CUPScenario { height: Height::from(100), @@ -2852,6 +3050,7 @@ mod tests { async fn test_ignore_up_to_date_replicator_after_timeout() { let test_scenario = UpgradeTestScenario { node_id: NODE_1, + subnet_type: SubnetType::Application, current_replica_version: ReplicaVersion::try_from("replica_version_0.1").unwrap(), has_local_cup: Some(CUPScenario { height: Height::from(100), diff --git a/rs/registry/admin/bin/main.rs b/rs/registry/admin/bin/main.rs index ca93e30ef416..6f3e393d85b8 100644 --- a/rs/registry/admin/bin/main.rs +++ b/rs/registry/admin/bin/main.rs @@ -3351,7 +3351,7 @@ impl ProposalPayload for ProposeToUpdateFirewallRule /// Sub-command to get all firewall rules for a given scope. 
#[derive(Parser)] struct GetFirewallRulesCmd { - /// The scope to apply new rules at (can be "global", "replica_nodes", "subnet(id)", or "node(id)") + /// The scope to apply new rules at (can be "global", "replica_nodes", "api_boundary_nodes", "subnet(id)", or "node(id)") pub scope: FirewallRulesScope, } diff --git a/rs/tests/driver/src/driver/group.rs b/rs/tests/driver/src/driver/group.rs index eb5d2855c75d..35746188c385 100644 --- a/rs/tests/driver/src/driver/group.rs +++ b/rs/tests/driver/src/driver/group.rs @@ -663,6 +663,7 @@ fn default_orchestrator_metrics() -> BTreeMap<&'static str, u64> { ("orchestrator_cup_deserialization_failed_total", 0), ("orchestrator_state_removal_failed_total", 0), ("orchestrator_tasks_failed_total", 0), + // TODO: adapt me (and all system tests that use me) ("orchestrator_replica_process_start_attempts_total", 1), ]) } From 9a2a31d255d11462212b12c5b279fa0cadef1811 Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Tue, 16 Jun 2026 14:45:37 +0000 Subject: [PATCH 04/42] refactor: make Upgrade agnostic to processes running --- rs/orchestrator/src/boundary_node.rs | 101 +------ rs/orchestrator/src/dashboard.rs | 31 +- rs/orchestrator/src/error.rs | 6 - rs/orchestrator/src/orchestrator.rs | 42 +-- rs/orchestrator/src/process_manager.rs | 32 +++ rs/orchestrator/src/processes.rs | 374 +++++++++++++++++++++---- rs/orchestrator/src/upgrade.rs | 301 +++++++------------- 7 files changed, 505 insertions(+), 382 deletions(-) diff --git a/rs/orchestrator/src/boundary_node.rs b/rs/orchestrator/src/boundary_node.rs index a9bc7f1520a1..121ea787c9ed 100644 --- a/rs/orchestrator/src/boundary_node.rs +++ b/rs/orchestrator/src/boundary_node.rs @@ -1,54 +1,32 @@ use crate::{ - error::{OrchestratorError, OrchestratorResult}, - metrics::OrchestratorMetrics, - process_manager::{ProcessRunner, SingleProcessRunner}, - processes::IcBoundaryProcess, - registry_helper::RegistryHelper, - upgrade::{start_ic_boundary, stop_ic_boundary}, + error::OrchestratorError, 
processes::IcBoundaryManager, registry_helper::RegistryHelper, }; -use ic_config::crypto::CryptoConfig; use ic_logger::{ReplicaLogger, warn}; use ic_types::{NodeId, ReplicaVersion}; -use std::{ - path::PathBuf, - sync::{Arc, Mutex}, -}; +use std::sync::{Arc, RwLock}; pub(crate) struct BoundaryNodeManager { registry: Arc, - metrics: Arc, - process: Arc>>, - ic_binary_dir: PathBuf, - ic_boundary_env_file: PathBuf, - crypto_config: CryptoConfig, + process_manager: Arc>, version: ReplicaVersion, - logger: ReplicaLogger, node_id: NodeId, - domain_name: Option, + logger: ReplicaLogger, } impl BoundaryNodeManager { pub(crate) fn new( registry: Arc, - metrics: Arc, + process_manager: Arc>, version: ReplicaVersion, node_id: NodeId, - ic_binary_dir: PathBuf, - ic_boundary_env_file: PathBuf, - crypto_config: CryptoConfig, logger: ReplicaLogger, ) -> Self { Self { registry, - metrics, - process: Arc::new(Mutex::new(SingleProcessRunner::new(logger.clone()))), - ic_binary_dir, - ic_boundary_env_file, - crypto_config, + process_manager, version, logger, node_id, - domain_name: None, } } @@ -69,47 +47,18 @@ impl BoundaryNodeManager { // NOTE: We could also shutdown the boundary node here. However, it makes sense to continue // serving requests while the orchestrator is downloading the new image in most cases. } else { - match self.registry.get_node_domain_name(registry_version) { - Ok(Some(domain_name)) => { - // let domain_name = Some(domain_name); - - // stop ic-boundary when the domain name changes and start it again. 
- if Some(&domain_name) != self.domain_name.as_ref() { - if let Err(err) = self.ensure_ic_boundary_stopped() { - warn!(self.logger, "Failed to stop Boundary Node: {}", err); - } - self.domain_name = Some(domain_name.clone()); - } - - // make sure the boundary node is running - if let Err(err) = - self.ensure_ic_boundary_running(&self.version, domain_name) - { - warn!(self.logger, "Failed to start Boundary Node: {}", err); - } - } - // BN should not be active when the node doesn't have a domain name - Ok(None) => { - warn!( - self.logger, - "There is no domain associated with the node, while this is a requirement for the API boundary node. Shutting ic-boundary down." - ); - if let Err(err) = self.ensure_ic_boundary_stopped() { - warn!(self.logger, "Failed to stop Boundary Node: {}", err); - } - self.domain_name = None; - } - // Failing to read the registry - Err(err) => warn!( - self.logger, - "Failed to fetch Boundary Node domain name: {}", err - ), - } + self.process_manager + .write() + .unwrap() + .ensure_ic_boundary_running_and_restarted_on_domain_change( + &self.version, + registry_version, + ); } } // BN should not be active Err(OrchestratorError::ApiBoundaryNodeMissingError(_, _)) => { - if let Err(err) = self.ensure_ic_boundary_stopped() { + if let Err(err) = self.process_manager.write().unwrap().stop_ic_boundary() { warn!(self.logger, "Failed to stop Boundary Node: {}", err); } } @@ -120,26 +69,4 @@ impl BoundaryNodeManager { ), } } - - /// Start the current boundary node process - fn ensure_ic_boundary_running( - &self, - replica_version: &ReplicaVersion, - domain_name: String, - ) -> OrchestratorResult<()> { - start_ic_boundary( - &mut *self.process.lock().unwrap(), - &self.ic_binary_dir, - &self.ic_boundary_env_file, - replica_version, - domain_name, - &self.crypto_config, - &self.logger, - &self.metrics, - ) - } - - fn ensure_ic_boundary_stopped(&self) -> OrchestratorResult<()> { - stop_ic_boundary(&mut *self.process.lock().unwrap()) - } } diff --git 
a/rs/orchestrator/src/dashboard.rs b/rs/orchestrator/src/dashboard.rs index be121cabe2d3..c340e86c9ed1 100644 --- a/rs/orchestrator/src/dashboard.rs +++ b/rs/orchestrator/src/dashboard.rs @@ -1,9 +1,6 @@ use crate::{ - catch_up_package_provider::LocalCUPReader, - orchestrator::SubnetAssignment, - process_manager::ProcessRunner, - processes::{ProcessManager, ReplicaProcess}, - registry_helper::RegistryHelper, + catch_up_package_provider::LocalCUPReader, orchestrator::SubnetAssignment, + processes::MultipleProcessesManager, registry_helper::RegistryHelper, ssh_access_manager::SshAccessParameters, }; pub use ic_dashboard::Dashboard; @@ -14,7 +11,7 @@ use ic_types::{ }; use std::{ process::Command, - sync::{Arc, Mutex, RwLock}, + sync::{Arc, RwLock}, }; const ORCHESTRATOR_DASHBOARD_PORT: u16 = 7070; @@ -27,7 +24,7 @@ pub(crate) struct OrchestratorDashboard { last_applied_firewall_version: Arc>, last_applied_ipv4_config_version: Arc>, last_poll_certified_time: Arc>, - process_manager: Arc>, + processes_manager: Arc>, subnet_assignment: Arc>, replica_version: ReplicaVersion, hostos_version: Option, @@ -48,6 +45,7 @@ impl Dashboard for OrchestratorDashboard { last poll's certified time: {}\n\ subnet id: {}\n\ replica process id: {}\n\ + ic-boundary process id: {}\n\ replica version: {}\n\ host os version: {}\n\ scheduled upgrade: {}\n\ @@ -63,7 +61,8 @@ impl Dashboard for OrchestratorDashboard { self.registry.get_latest_version().get(), self.get_last_poll_certified_time(), self.get_subnet_id(), - self.get_pid(), + self.get_replica_pid(), + self.get_ic_boundary_pid(), self.replica_version, self.hostos_version .as_ref() @@ -93,7 +92,7 @@ impl OrchestratorDashboard { last_applied_firewall_version: Arc>, last_applied_ipv4_config_version: Arc>, last_poll_certified_time: Arc>, - process_manager: Arc>, + processes_manager: Arc>, subnet_assignment: Arc>, replica_version: ReplicaVersion, hostos_version: Option, @@ -107,7 +106,7 @@ impl OrchestratorDashboard { 
last_applied_firewall_version, last_applied_ipv4_config_version, last_poll_certified_time, - process_manager, + processes_manager, subnet_assignment, replica_version, hostos_version, @@ -137,9 +136,15 @@ impl OrchestratorDashboard { ) } - fn get_pid(&self) -> String { - let process_manager = self.process_manager.lock().unwrap(); - match >::get_pid(&process_manager) { + fn get_replica_pid(&self) -> String { + match self.processes_manager.read().unwrap().get_replica_pid() { + Some(pid) => pid.to_string(), + None => "None".to_string(), + } + } + + fn get_ic_boundary_pid(&self) -> String { + match self.processes_manager.read().unwrap().get_ic_boundary_pid() { Some(pid) => pid.to_string(), None => "None".to_string(), } diff --git a/rs/orchestrator/src/error.rs b/rs/orchestrator/src/error.rs index 8359d2d18c57..a85282545790 100644 --- a/rs/orchestrator/src/error.rs +++ b/rs/orchestrator/src/error.rs @@ -78,9 +78,6 @@ pub(crate) enum OrchestratorError { /// An error occurred when trying to get the role (Api boundary node, replica, ...) of the node /// at the given registry version. 
RoleError(String, RegistryVersion), - - /// The given node is missing a domain name - DomainNameMissingError(NodeId), } impl OrchestratorError { @@ -171,9 +168,6 @@ impl fmt::Display for OrchestratorError { "Failed to get the role of the node at the registry version {registry_version}: {msg}" ) } - OrchestratorError::DomainNameMissingError(node_id) => { - write!(f, "Node {node_id} does not have an associated domain name") - } } } } diff --git a/rs/orchestrator/src/orchestrator.rs b/rs/orchestrator/src/orchestrator.rs index 818e078ec9ae..77dbaae21b3b 100644 --- a/rs/orchestrator/src/orchestrator.rs +++ b/rs/orchestrator/src/orchestrator.rs @@ -7,8 +7,7 @@ use crate::{ hostos_upgrade::HostosUpgrader, ipv4_network::Ipv4Configurator, metrics::OrchestratorMetrics, - process_manager::SingleProcessRunner, - processes::ProcessManager, + processes::{IcBoundaryProcessConfig, MultipleProcessesManager, ReplicaProcessConfig}, registration::NodeRegistration, registry_helper::RegistryHelper, ssh_access_manager::SshAccessManager, @@ -37,7 +36,7 @@ use std::{ future::Future, net::{Ipv4Addr, Ipv6Addr, SocketAddr}, path::Path, - sync::{Arc, Mutex, RwLock}, + sync::{Arc, RwLock}, thread, time::Duration, }; @@ -241,17 +240,11 @@ impl Orchestrator { Arc::clone(&crypto) as _, ); - let process_manager = Arc::new(Mutex::new(ProcessManager::new( - Box::new(SingleProcessRunner::new(logger.clone())), - Box::new(SingleProcessRunner::new(logger.clone())), - ))); let ic_binary_directory = args.ic_binary_directory; let manageboot_runner = Box::new(ManagebootRunnerImpl::new( ic_binary_directory.join("manageboot.sh"), )); - let ic_boundary_env = args.ic_boundary_env_file; - // Create a read-only CUP reader that can be shared among Dashboard and Firewall // They read from the same file, so they'll see the same persisted CUP let local_cup_reader = LocalCUPReader::new(args.cup_dir.clone(), logger.clone()); @@ -266,6 +259,25 @@ impl Orchestrator { node_id, ); + let replica_process_config = 
ReplicaProcessConfig { + ic_binary_dir: ic_binary_directory.clone(), + cup_path: local_cup_reader.get_cup_path(), + replica_config_file: args.replica_config_file.clone(), + }; + let ic_boundary_process_config = IcBoundaryProcessConfig { + ic_binary_dir: ic_binary_directory.clone(), + ic_boundary_env_file: args.ic_boundary_env_file.clone(), + crypto_config: config.crypto.clone(), + }; + + let processes_manager = Arc::new(RwLock::new(MultipleProcessesManager::new( + replica_process_config, + ic_boundary_process_config.clone(), + Arc::clone(®istry), + Arc::clone(&metrics), + logger.clone(), + ))); + if args.enable_provisional_registration { // will not return until the node is registered registration.register_node().await; @@ -283,21 +295,18 @@ impl Orchestrator { Upgrade::new( Arc::clone(®istry) as _, Arc::clone(&metrics), - Arc::clone(&process_manager), + Arc::clone(&processes_manager), manageboot_runner, cup_provider, Arc::clone(&subnet_assignment), replica_version.clone(), args.replica_config_file.clone(), node_id, - ic_binary_directory.clone(), - ic_boundary_env.clone(), Arc::clone(®istry_replicator) as _, args.replica_binary_dir.clone(), logger.clone(), args.orchestrator_data_directory.clone(), disk_encryption_key_exchange_agent, - config.crypto.clone(), ) .await, ); @@ -330,12 +339,9 @@ impl Orchestrator { let boundary_node = BoundaryNodeManager::new( Arc::clone(®istry), - Arc::clone(&metrics), + processes_manager.read().unwrap().ic_boundary_manager(), replica_version.clone(), node_id, - ic_binary_directory.clone(), - ic_boundary_env, - config.crypto.clone(), logger.clone(), ); @@ -371,7 +377,7 @@ impl Orchestrator { firewall.get_last_applied_version(), ipv4_configurator.get_last_applied_version(), registry_replicator.get_latest_certified_time(), - process_manager, + processes_manager, Arc::clone(&subnet_assignment), replica_version, hostos_version.ok(), diff --git a/rs/orchestrator/src/process_manager.rs b/rs/orchestrator/src/process_manager.rs index 
4a576382dd8c..d02137152e4e 100644 --- a/rs/orchestrator/src/process_manager.rs +++ b/rs/orchestrator/src/process_manager.rs @@ -12,6 +12,11 @@ use std::{ sync::{Arc, Mutex}, }; +use crate::{ + error::{OrchestratorError, OrchestratorResult}, + metrics::OrchestratorMetrics, +}; + type PIDCell = Arc>>; /// Captures a process that should be run by a [`ProcessRunner`] @@ -211,3 +216,30 @@ fn wait_on_exit( let _pid = pid_cell.lock().unwrap().take(); } } + +// start_orchestrator_process — common "check-running / log / metric / start" +// logic +// +// Any new process type automatically benefits from this without duplicating +// the boilerplate. +pub(crate) fn start_orchestrator_process( + runner: &mut dyn ProcessRunner

, + process: P, + metrics: &OrchestratorMetrics, + logger: &ReplicaLogger, +) -> OrchestratorResult<()> { + if runner.is_running() { + return Ok(()); + } + info!(logger, "Starting new {} process", P::NAME); + metrics + .processes_start_attempts + .with_label_values(&[P::NAME]) + .inc(); + runner.start(process).map_err(|e| { + OrchestratorError::IoError( + format!("Error when attempting to start {} process", P::NAME), + e, + ) + }) +} diff --git a/rs/orchestrator/src/processes.rs b/rs/orchestrator/src/processes.rs index 4ca8be18f3c0..8c8c37524e45 100644 --- a/rs/orchestrator/src/processes.rs +++ b/rs/orchestrator/src/processes.rs @@ -1,18 +1,54 @@ -use crate::process_manager::{Process, ProcessRunner}; -use ic_types::{ReplicaVersion, SubnetId}; +use crate::{ + error::{OrchestratorError, OrchestratorResult}, + metrics::OrchestratorMetrics, + process_manager::{Process, ProcessRunner, SingleProcessRunner, start_orchestrator_process}, + registry_helper::RegistryHelper, +}; +use ic_config::crypto::CryptoConfig; +use ic_logger::{ReplicaLogger, warn}; +use ic_protobuf::registry::subnet::v1::SubnetType; +use ic_types::{RegistryVersion, ReplicaVersion, SubnetId}; use nix::unistd::Pid; -use std::{collections::HashMap, ffi::OsString, path::PathBuf}; +use std::{ + collections::HashMap, + ffi::OsString, + path::PathBuf, + sync::{Arc, RwLock}, +}; // --------------------------------------------------------------------------- // ReplicaProcess // --------------------------------------------------------------------------- -pub(crate) struct ReplicaProcess { +#[derive(Clone)] +pub(crate) struct ReplicaProcessConfig { pub ic_binary_dir: PathBuf, - pub replica_version: ReplicaVersion, pub cup_path: PathBuf, - pub config_file: PathBuf, - pub subnet_id: SubnetId, + pub replica_config_file: PathBuf, +} + +pub(crate) struct ReplicaProcess { + ic_binary_dir: PathBuf, + replica_version: ReplicaVersion, + cup_path: PathBuf, + replica_config_file: PathBuf, + subnet_id: SubnetId, +} + +impl 
ReplicaProcess { + fn new( + config: ReplicaProcessConfig, + replica_version: ReplicaVersion, + subnet_id: SubnetId, + ) -> Self { + Self { + ic_binary_dir: config.ic_binary_dir, + replica_version, + cup_path: config.cup_path, + replica_config_file: config.replica_config_file, + subnet_id, + } + } } impl Process for ReplicaProcess { @@ -30,7 +66,7 @@ impl Process for ReplicaProcess { OsString::from("--replica-version"), self.replica_version.to_string().into(), OsString::from("--config-file"), - self.config_file.clone().into(), + self.replica_config_file.clone().into(), OsString::from("--catch-up-package"), self.cup_path.clone().into(), OsString::from("--force-subnet"), @@ -42,16 +78,100 @@ impl Process for ReplicaProcess { } } +pub(crate) struct ReplicaManager { + pub process_runner: Box + Sync>, + process_config: ReplicaProcessConfig, + metrics: Arc, + logger: ReplicaLogger, +} + +impl ReplicaManager { + pub(crate) fn new( + process_config: ReplicaProcessConfig, + metrics: Arc, + logger: ReplicaLogger, + ) -> Self { + let process_runner = Box::new(SingleProcessRunner::new(logger.clone())); + Self { + process_runner, + process_config, + metrics, + logger, + } + } + + fn ensure_replica_running( + &mut self, + replica_version: &ReplicaVersion, + subnet_id: SubnetId, + ) -> OrchestratorResult<()> { + start_orchestrator_process( + &mut *self.process_runner, + ReplicaProcess::new( + self.process_config.clone(), + replica_version.clone(), + subnet_id, + ), + &self.metrics, + &self.logger, + ) + } + + fn stop_replica(&mut self) -> OrchestratorResult<()> { + self.process_runner.stop().map_err(|e| { + OrchestratorError::IoError("Error when attempting to stop replica".to_string(), e) + }) + } +} + // --------------------------------------------------------------------------- // IcBoundaryProcess // --------------------------------------------------------------------------- -pub(crate) struct IcBoundaryProcess { +#[derive(Clone)] +pub(crate) struct IcBoundaryProcessConfig { 
pub ic_binary_dir: PathBuf, - pub replica_version: ReplicaVersion, - pub domain_name: String, - pub crypto_config: String, - pub env: HashMap, + pub ic_boundary_env_file: PathBuf, + pub crypto_config: CryptoConfig, +} + +pub(crate) struct IcBoundaryProcess { + ic_binary_dir: PathBuf, + replica_version: ReplicaVersion, + domain_name: String, + crypto_config: String, + env: HashMap, +} + +impl IcBoundaryProcess { + fn new( + process_config: IcBoundaryProcessConfig, + replica_version: ReplicaVersion, + domain_name: String, + ) -> OrchestratorResult { + let env = match env_file_reader::read_file(&process_config.ic_boundary_env_file) { + Ok(env) => env + .into_iter() + .map(|(k, v)| (OsString::from(k), OsString::from(v))) + .collect(), + Err(e) => { + return Err(OrchestratorError::IoError( + "unable to read ic-boundary environment variables".to_string(), + e, + )); + } + }; + let crypto_config = serde_json::to_string(&process_config.crypto_config) + .map_err(OrchestratorError::SerializeCryptoConfigError)?; + + Ok(Self { + ic_binary_dir: process_config.ic_binary_dir, + replica_version, + domain_name, + crypto_config, + env, + }) + } } impl Process for IcBoundaryProcess { @@ -73,65 +193,213 @@ impl Process for IcBoundaryProcess { ] } fn get_env(&self) -> HashMap { - self.env - .iter() - .map(|(k, v)| (OsString::from(k), OsString::from(v))) - .collect() + self.env.clone() + } +} + +pub(crate) struct IcBoundaryManager { + pub process_runner: Box + Sync>, + process_config: IcBoundaryProcessConfig, + registry: Arc, + current_domain_name: Option, + metrics: Arc, + logger: ReplicaLogger, +} + +impl IcBoundaryManager { + pub(crate) fn new( + process_config: IcBoundaryProcessConfig, + registry: Arc, + metrics: Arc, + logger: ReplicaLogger, + ) -> Self { + let process_runner = Box::new(SingleProcessRunner::new(logger.clone())); + Self { + process_runner, + process_config, + registry, + current_domain_name: None, + metrics, + logger, + } + } + + pub(crate) fn 
ensure_ic_boundary_running_and_restarted_on_domain_change( + &mut self, + replica_version: &ReplicaVersion, + registry_version: RegistryVersion, + ) { + match self.registry.get_node_domain_name(registry_version) { + Ok(Some(domain_name)) => { + // stop ic-boundary when the domain name changes and start it again. + if Some(&domain_name) != self.current_domain_name.as_ref() + && let Err(err) = self.stop_ic_boundary() + { + warn!(self.logger, "Failed to stop ic-boundary: {}", err); + } + + // make sure the ic-boundary is running + if let Err(err) = self.ensure_ic_boundary_running(replica_version, &domain_name) { + warn!(self.logger, "Failed to start ic-boundary: {}", err); + } + + self.current_domain_name = Some(domain_name); + } + // ic-boundary should not start when the node doesn't have a domain name + Ok(None) => { + warn!( + self.logger, + "There is no domain associated with the node, while this is a requirement for the API boundary node. Shutting ic-boundary down." + ); + if let Err(err) = self.stop_ic_boundary() { + warn!(self.logger, "Failed to stop Boundary Node: {}", err); + } + self.current_domain_name = None; + } + // Failing to read the registry + Err(err) => warn!(self.logger, "Failed to fetch domain name: {}", err), + } + } + + fn ensure_ic_boundary_running( + &mut self, + replica_version: &ReplicaVersion, + domain_name: &str, + ) -> OrchestratorResult<()> { + start_orchestrator_process( + &mut *self.process_runner, + IcBoundaryProcess::new( + self.process_config.clone(), + replica_version.clone(), + domain_name.to_string(), + )?, + &self.metrics, + &self.logger, + ) + } + + pub(crate) fn stop_ic_boundary(&mut self) -> OrchestratorResult<()> { + self.process_runner.stop().map_err(|e| { + OrchestratorError::IoError("Error when attempting to stop ic-boundary".to_string(), e) + }) } } // --------------------------------------------------------------------------- -// ProcessManager +// MultipleProcessManager +// +// This struct manages all processes that 
the orchestrator is responsible for, +// providing a single entry point for starting and stopping them according to +// the node's configuration in the registry. // --------------------------------------------------------------------------- -/// Manages all processes for this orchestrator. -/// -/// Owns one runner per process type. Each field is a `Box>` so the runner can -/// be swapped out in tests without spawning real processes. -/// Implements `ProcessRunner

` for each process type, delegating to the corresponding runner. -pub(crate) struct ProcessManager { - replica: Box>, - ic_boundary: Box>, +pub(crate) struct MultipleProcessesManager { + replica_manager: Arc>, + ic_boundary_manager: Arc>, + registry: Arc, } -impl ProcessManager { +impl MultipleProcessesManager { pub(crate) fn new( - replica: Box>, - ic_boundary: Box>, + replica_process_config: ReplicaProcessConfig, + ic_boundary_process_config: IcBoundaryProcessConfig, + registry: Arc, + metrics: Arc, + logger: ReplicaLogger, ) -> Self { + let replica_manager = Arc::new(RwLock::new(ReplicaManager::new( + replica_process_config, + metrics.clone(), + logger.clone(), + ))); + let ic_boundary_manager = Arc::new(RwLock::new(IcBoundaryManager::new( + ic_boundary_process_config, + registry.clone(), + metrics, + logger, + ))); + Self { - replica, - ic_boundary, + replica_manager, + ic_boundary_manager, + registry, } } -} -impl ProcessRunner for ProcessManager { - fn start(&mut self, process: ReplicaProcess) -> std::io::Result<()> { - self.replica.start(process) + pub(crate) fn ic_boundary_manager(&self) -> Arc> { + self.ic_boundary_manager.clone() } - fn stop(&mut self) -> std::io::Result<()> { - self.replica.stop() - } - fn is_running(&self) -> bool { - self.replica.is_running() - } - fn get_pid(&self) -> Option { - self.replica.get_pid() + + pub(crate) fn get_replica_pid(&self) -> Option { + self.replica_manager + .read() + .unwrap() + .process_runner + .get_pid() } -} -impl ProcessRunner for ProcessManager { - fn start(&mut self, process: IcBoundaryProcess) -> std::io::Result<()> { - self.ic_boundary.start(process) + pub(crate) fn get_ic_boundary_pid(&self) -> Option { + self.ic_boundary_manager + .read() + .unwrap() + .process_runner + .get_pid() } - fn stop(&mut self) -> std::io::Result<()> { - self.ic_boundary.stop() + + /// Start all processes appropriate for this node. + /// + /// Always starts the replica. 
For cloud-engine subnet nodes it also + /// starts ic-boundary, restarting it if the domain name has changed. + pub(crate) fn start_all( + &mut self, + replica_version: &ReplicaVersion, + subnet_id: SubnetId, + registry_version: RegistryVersion, + ) -> OrchestratorResult<()> { + self.replica_manager + .write() + .unwrap() + .ensure_replica_running(replica_version, subnet_id)?; + + // Cloud-engine nodes run ic-boundary as a sidecar. + match self.registry.get_subnet_type(subnet_id, registry_version)? { + None + | Some(SubnetType::Unspecified) + | Some(SubnetType::Application) + | Some(SubnetType::System) + | Some(SubnetType::VerifiedApplication) => { + self.ic_boundary_manager + .write() + .unwrap() + .stop_ic_boundary()?; + } + Some(SubnetType::CloudEngine) => { + self.ic_boundary_manager + .write() + .unwrap() + .ensure_ic_boundary_running_and_restarted_on_domain_change( + replica_version, + registry_version, + ); + } + } + + Ok(()) } - fn is_running(&self) -> bool { - self.ic_boundary.is_running() + + /// Stop the replica process. + pub(crate) fn stop_replica(&mut self) -> OrchestratorResult<()> { + self.replica_manager.write().unwrap().stop_replica() } - fn get_pid(&self) -> Option { - self.ic_boundary.get_pid() + + /// Stop every managed process. 
+ pub(crate) fn stop_all(&mut self) -> OrchestratorResult<()> { + self.replica_manager.write().unwrap().stop_replica()?; + self.ic_boundary_manager + .write() + .unwrap() + .stop_ic_boundary()?; + + Ok(()) } } diff --git a/rs/orchestrator/src/upgrade.rs b/rs/orchestrator/src/upgrade.rs index 0c9b20a18ce1..7a0eea7faedd 100644 --- a/rs/orchestrator/src/upgrade.rs +++ b/rs/orchestrator/src/upgrade.rs @@ -3,13 +3,11 @@ use crate::{ error::{OrchestratorError, OrchestratorResult}, metrics::OrchestratorMetrics, orchestrator::SubnetAssignment, - process_manager::{Process, ProcessRunner}, - processes::{IcBoundaryProcess, ProcessManager, ReplicaProcess}, + processes::MultipleProcessesManager, registry_helper::RegistryHelper, }; use async_trait::async_trait; use guest_upgrade_server::DiskEncryptionKeyExchangeServerAgent; -use ic_config::crypto::CryptoConfig; use ic_consensus_dkg::get_vetkey_public_keys; use ic_crypto::get_master_public_key_from_transcript; use ic_http_utils::file_downloader::FileDownloader; @@ -20,7 +18,7 @@ use ic_image_upgrader::{ use ic_interfaces_registry::RegistryClient; use ic_logger::{ReplicaLogger, error, info, warn}; use ic_management_canister_types_private::MasterPublicKeyId; -use ic_protobuf::{proxy::try_from_option_field, registry::subnet::v1::SubnetType}; +use ic_protobuf::proxy::try_from_option_field; use ic_registry_client_helpers::{node::NodeRegistry, subnet::SubnetRegistry}; use ic_registry_local_store::{LocalStore, LocalStoreImpl}; use ic_registry_replicator::RegistryReplicator; @@ -34,8 +32,8 @@ use ic_types::{ }; use std::{ collections::BTreeMap, - path::{Path, PathBuf}, - sync::{Arc, Mutex, RwLock}, + path::PathBuf, + sync::{Arc, RwLock}, time::{Duration, Instant}, }; @@ -91,14 +89,12 @@ impl RegistryReplicatorForUpgrade for RegistryReplicator { pub(crate) struct Upgrade { pub registry: Arc, pub metrics: Arc, - process_manager: Arc>, + processes_manager: Arc>, manageboot_runner: Box, cup_provider: CatchUpPackageProvider, 
subnet_assignment: Arc>, replica_version: ReplicaVersion, replica_config_file: PathBuf, - pub ic_binary_dir: PathBuf, - ic_boundary_env_file: PathBuf, pub image_path: PathBuf, registry_replicator: Arc, init_time: Instant, @@ -108,7 +104,6 @@ pub(crate) struct Upgrade { /// The replica version that is prepared by 'prepare_upgrade' to upgrade to. pub prepared_upgrade_version: Option, pub orchestrator_data_directory: PathBuf, - crypto_config: CryptoConfig, } impl Upgrade { @@ -116,36 +111,31 @@ impl Upgrade { pub(crate) async fn new( registry: Arc, metrics: Arc, - process_manager: Arc>, + processes_manager: Arc>, manageboot_runner: Box, cup_provider: CatchUpPackageProvider, subnet_assignment: Arc>, replica_version: ReplicaVersion, replica_config_file: PathBuf, node_id: NodeId, - ic_binary_dir: PathBuf, - ic_boundary_env_file: PathBuf, registry_replicator: Arc, release_content_dir: PathBuf, logger: ReplicaLogger, orchestrator_data_directory: PathBuf, disk_encryption_key_exchange_agent: Option, - crypto_config: CryptoConfig, ) -> Self { let init_time = Instant::now(); let value = Self { registry, metrics, - process_manager, + processes_manager, manageboot_runner, cup_provider, subnet_assignment, node_id, replica_version, replica_config_file, - ic_binary_dir, - ic_boundary_env_file, image_path: release_content_dir.join("image.bin"), registry_replicator, init_time, @@ -153,7 +143,6 @@ impl Upgrade { prepared_upgrade_version: None, orchestrator_data_directory, disk_encryption_key_exchange_agent, - crypto_config, }; if let Err(e) = value.report_reboot_time() { warn!(logger, "Cannot report the reboot time: {}", e); @@ -589,17 +578,6 @@ impl Upgrade { Ok(()) } - /// Stop the current replica process. 
- fn stop_replica(&self) -> OrchestratorResult<()> { - let mut process_manager = self.process_manager.lock().unwrap(); - >::stop(&mut process_manager).map_err(|e| { - OrchestratorError::IoError( - "Error when attempting to stop replica during upgrade".into(), - e, - ) - }) - } - /// Stop the replica if the given CUP is unsigned and higher than the given height. /// Without restart, consensus would reject the unsigned artifact. /// If stopping the replica fails, restart the current process instead. @@ -616,7 +594,7 @@ impl Upgrade { ); // Restarting the replica is enough to pass the unsigned CUP forward. // If we fail, restart the current process instead. - if let Err(e) = self.stop_replica() { + if let Err(e) = self.processes_manager.write().unwrap().stop_replica() { warn!(self.logger, "Failed to stop replica with error {:?}", e); reexec_current_process(&self.logger); } @@ -625,140 +603,22 @@ impl Upgrade { /// Stop all child processes, including the replica. pub fn stop_children(&self) -> OrchestratorResult<()> { - self.stop_replica()?; - stop_ic_boundary(&mut *self.process_manager.lock().unwrap())?; - - Ok(()) + self.processes_manager.write().unwrap().stop_all() } - fn ensure_replica_is_running( + /// Start all child processes appropriate for this node. 
+ fn ensure_children_are_running( &self, replica_version: &ReplicaVersion, subnet_id: SubnetId, - ) -> OrchestratorResult<()> { - let mut process_manager = self.process_manager.lock().unwrap(); - if >::is_running(&process_manager) { - return Ok(()); - } - - info!(self.logger, "Starting new {} process", ReplicaProcess::NAME); - self.metrics - .processes_start_attempts - .with_label_values(&[ReplicaProcess::NAME]) - .inc(); - process_manager - .start(ReplicaProcess { - ic_binary_dir: self.ic_binary_dir.clone(), - replica_version: replica_version.clone(), - cup_path: self.cup_provider.get_cup_path(), - config_file: self.replica_config_file.clone(), - subnet_id, - }) - .map_err(|e| { - OrchestratorError::IoError("Error when attempting to start new replica".into(), e) - }) - } - - fn ensure_ic_boundary_is_running( - &self, - replica_version: &ReplicaVersion, registry_version: RegistryVersion, ) -> OrchestratorResult<()> { - let domain_name = self - .registry - .get_node_domain_name(registry_version)? - .ok_or_else(|| OrchestratorError::DomainNameMissingError(self.node_id))?; - - start_ic_boundary( - &mut *self.process_manager.lock().unwrap(), - &self.ic_binary_dir, - &self.ic_boundary_env_file, + self.processes_manager.write().unwrap().start_all( replica_version, - domain_name, - &self.crypto_config, - &self.logger, - &self.metrics, + subnet_id, + registry_version, ) } - - /// Start the child processes if not running already. - fn ensure_children_are_running( - &self, - replica_version: &ReplicaVersion, - subnet_id: SubnetId, - registry_version: RegistryVersion, - ) -> OrchestratorResult<()> { - // First ensure the replica is running before launching any other processes - self.ensure_replica_is_running(replica_version, subnet_id)?; - - match self.registry.get_subnet_type(subnet_id, registry_version)? 
{ - None - | Some(SubnetType::Unspecified) - | Some(SubnetType::Application) - | Some(SubnetType::System) - | Some(SubnetType::VerifiedApplication) => {} - Some(SubnetType::CloudEngine) => { - // Cloud engines are self-contained and also run boundary nodes' stack - self.ensure_ic_boundary_is_running(replica_version, registry_version)?; - } - } - - Ok(()) - } -} - -pub(crate) fn start_ic_boundary( - process: &mut dyn ProcessRunner, - ic_binary_dir: &Path, - ic_boundary_env_file: &Path, - replica_version: &ReplicaVersion, - domain_name: String, - crypto_config: &CryptoConfig, - logger: &ReplicaLogger, - metrics: &OrchestratorMetrics, -) -> OrchestratorResult<()> { - if process.is_running() { - return Ok(()); - } - - let crypto_config = serde_json::to_string(&crypto_config) - .map_err(OrchestratorError::SerializeCryptoConfigError)?; - - let env = match env_file_reader::read_file(ic_boundary_env_file) { - Ok(env) => env, - Err(e) => { - return Err(OrchestratorError::IoError( - "unable to read ic-boundary environment variables".to_string(), - e, - )); - } - }; - - info!(logger, "Starting new {} process", IcBoundaryProcess::NAME); - metrics - .processes_start_attempts - .with_label_values(&[IcBoundaryProcess::NAME]) - .inc(); - process - .start(IcBoundaryProcess { - ic_binary_dir: ic_binary_dir.to_path_buf(), - replica_version: replica_version.clone(), - domain_name, - crypto_config, - env, - }) - .map_err(|e| { - OrchestratorError::IoError("Error when attempting to start ic-boundary".into(), e) - }) -} - -/// Stop the current boundary node process. 
-pub(crate) fn stop_ic_boundary( - process: &mut dyn ProcessRunner, -) -> OrchestratorResult<()> { - process.stop().map_err(|e| { - OrchestratorError::IoError("Error when attempting to stop ic-boundary".into(), e) - }) } #[async_trait] @@ -1237,6 +1097,7 @@ fn report_master_public_key_changed_metric( mod tests { use crate::catch_up_package_provider::LocalCUPReader; use crate::catch_up_package_provider::tests::mock_tls_config; + use crate::processes::ReplicaProcessConfig; use super::*; use assert_matches::assert_matches; @@ -1307,19 +1168,37 @@ mod tests { }; use tempfile::{TempDir, tempdir}; + impl MultipleProcessesManager { + pub(crate) fn replica_manager(&self) -> Arc> { + self.replica_manager.clone() + } + } + impl Upgrade { pub fn subnet_assignment(&self) -> SubnetAssignment { *self.subnet_assignment.read().unwrap() } pub fn is_replica_running(&self) -> bool { - let process_manager = self.process_manager.lock().unwrap(); - >::is_running(&process_manager) + self.processes_manager + .read() + .unwrap() + .replica_manager() + .read() + .unwrap() + .process_runner + .is_running() } pub fn is_ic_boundary_running(&self) -> bool { - let process_manager = self.process_manager.lock().unwrap(); - >::is_running(&process_manager) + self.processes_manager + .read() + .unwrap() + .ic_boundary_manager() + .read() + .unwrap() + .process_runner + .is_running() } } @@ -1642,47 +1521,6 @@ mod tests { let metrics = Arc::new(OrchestratorMetrics::new(&MetricsRegistry::new())); - let ic_binary_dir = dir.join("ic_binary"); - std::fs::create_dir_all(&ic_binary_dir).unwrap(); - let ic_boundary_env_file = dir.join("ic-boundary.env"); - std::fs::write(&ic_boundary_env_file, b"TEST_KEY=TEST_VALUE").unwrap(); - - let crypto_config = CryptoConfig::default(); - - let process_manager = Arc::new(Mutex::new(ProcessManager::new( - Box::new(FakeRunner::new()), - Box::new(FakeRunner::new()), - ))); - // Start the replica process if the test scenario indicates so - if 
test_scenario.were_child_processes_started_previously() { - process_manager - .lock() - .unwrap() - .start(ReplicaProcess { - ic_binary_dir: ic_binary_dir.clone(), - replica_version: current_replica_version.clone(), - cup_path: PathBuf::new(), - config_file: PathBuf::new(), - subnet_id: SUBNET_1, - }) - .unwrap(); - if matches!(subnet_type, SubnetType::CloudEngine) { - process_manager - .lock() - .unwrap() - .start(IcBoundaryProcess { - ic_binary_dir: ic_binary_dir.clone(), - replica_version: current_replica_version.clone(), - domain_name: "domain.name".to_string(), - crypto_config: serde_json::to_string(&crypto_config).unwrap(), - env: HashMap::new(), - }) - .unwrap(); - } - } - - let manageboot_runner = Box::new(FakeManagebootRunner); - let cup_dir = dir.join("cups"); std::fs::create_dir_all(&cup_dir).unwrap(); if let Some(local_cup) = has_local_cup { @@ -1704,9 +1542,65 @@ mod tests { node_id, ); - let subnet_assignment = Arc::new(RwLock::new(initial_subnet_assignment)); - let replica_config_file = dir.join("ic.json5"); + let ic_binary_dir = dir.join("ic_binary"); + std::fs::create_dir_all(&ic_binary_dir).unwrap(); + let ic_boundary_env_file = dir.join("ic-boundary.env"); + std::fs::write(&ic_boundary_env_file, b"TEST_KEY=TEST_VALUE").unwrap(); + + let crypto_config = CryptoConfig::default(); + + let replica_process_config = ReplicaProcessConfig { + ic_binary_dir: ic_binary_dir.clone(), + cup_path: cup_file, + replica_config_file: replica_config_file.clone(), + }; + let ic_boundary_process_config = IcBoundaryProcessConfig { + ic_binary_dir: ic_binary_dir.clone(), + ic_boundary_env_file, + crypto_config, + }; + + let processes_manager = MultipleProcessesManager::new( + replica_process_config.clone(), + ic_boundary_process_config.clone(), + Arc::clone(®istry), + Arc::clone(&metrics), + logger.clone(), + ); + // Start the replica process if the test scenario indicates so + if test_scenario.were_child_processes_started_previously() { + processes_manager + 
.replica_manager() + .write() + .unwrap() + .process_runner + .start(ReplicaProcess::new( + replica_process_config, + current_replica_version.clone(), + SUBNET_1, + )) + .unwrap(); + if matches!(subnet_type, SubnetType::CloudEngine) { + // Simulate ic-boundary already running by faking a start. + processes_manager + .ic_boundary_manager() + .write() + .unwrap() + .process_runner + .start(IcBoundaryProcess::new( + ic_boundary_process_config, + current_replica_version.clone(), + "domain.name".to_string(), + )) + .unwrap(); + } + } + let processes_manager = Arc::new(RwLock::new(processes_manager)); + + let manageboot_runner = Box::new(FakeManagebootRunner); + + let subnet_assignment = Arc::new(RwLock::new(initial_subnet_assignment)); let mut registry_replicator = MockRegistryReplicatorForUpgrade::new(); registry_replicator @@ -1735,21 +1629,18 @@ mod tests { let mut upgrade_loop = Upgrade::new( registry, metrics, - process_manager, + processes_manager, manageboot_runner, cup_provider, subnet_assignment, current_replica_version.clone(), replica_config_file, node_id, - ic_binary_dir, - ic_boundary_env_file, Arc::new(registry_replicator), release_content_dir, logger, orchestrator_data_dir, None, - crypto_config, ) .await; From 932808e860e5d3fcd8cb8abc5c738f35f551d06e Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Wed, 17 Jun 2026 08:45:09 +0000 Subject: [PATCH 05/42] feat: add ic-gateway --- Cargo.Bazel.json.lock | 15 +- bazel/rust.MODULE.bazel | 4 + ic-os/components/guestos.bzl | 1 + ic-os/components/guestos/ic-replica.service | 2 +- ic-os/components/guestos/share/ic-gateway.env | 4 + ic-os/guestos/defs.bzl | 1 + rs/ic_os/release/BUILD.bazel | 1 + rs/orchestrator/src/args.rs | 4 + rs/orchestrator/src/dashboard.rs | 9 + rs/orchestrator/src/orchestrator.rs | 12 +- rs/orchestrator/src/processes.rs | 179 +++++++++++++++--- rs/orchestrator/src/upgrade.rs | 108 +++++++---- rs/tests/consensus/orchestrator/BUILD.bazel | 1 + .../cloud_engine_ic_gateway_test.rs | 20 +- 14 
files changed, 292 insertions(+), 69 deletions(-) create mode 100644 ic-os/components/guestos/share/ic-gateway.env diff --git a/Cargo.Bazel.json.lock b/Cargo.Bazel.json.lock index 96ba8cd0936b..68d3c4f1bde1 100644 --- a/Cargo.Bazel.json.lock +++ b/Cargo.Bazel.json.lock @@ -1,5 +1,5 @@ { - "checksum": "7f96e5c133c203870fc997d52e080f72044bd53f1530c5332ba2230232e22d68", + "checksum": "ff7f44505ebc5f13c6c3d5c53d2dffc80669e43460f22816aff45ab32ec31178", "crates": { "abnf 0.12.0": { "name": "abnf", @@ -37763,6 +37763,18 @@ ] } } + }, + { + "Binary": { + "crate_name": "ic-gateway", + "crate_root": "src/main.rs", + "srcs": { + "allow_empty": true, + "include": [ + "**/*.rs" + ] + } + } } ], "library_target_name": "ic_gateway", @@ -100978,6 +100990,7 @@ }, "binary_crates": [ "canbench 0.4.1", + "ic-gateway 0.2.0", "ic-wasm 0.9.11", "metrics-proxy 0.1.0" ], diff --git a/bazel/rust.MODULE.bazel b/bazel/rust.MODULE.bazel index ce659d446c2c..4dec0002247d 100644 --- a/bazel/rust.MODULE.bazel +++ b/bazel/rust.MODULE.bazel @@ -2179,6 +2179,10 @@ crate.annotation( crate = "metrics-proxy", gen_binaries = ["metrics-proxy"], ) +crate.annotation( + crate = "ic-gateway", + gen_binaries = ["ic-gateway"], +) crate.splicing_config( resolver_version = "2", ) diff --git a/ic-os/components/guestos.bzl b/ic-os/components/guestos.bzl index a074c864aaf4..bfe19faf31fc 100644 --- a/ic-os/components/guestos.bzl +++ b/ic-os/components/guestos.bzl @@ -53,6 +53,7 @@ def component_files(mode): Label("guestos/remote-attestation-server.service"): "/etc/systemd/system/remote-attestation-server.service", Label("guestos/generate-ic-config/generate-ic-config.service"): "/etc/systemd/system/generate-ic-config.service", Label("guestos/share/ic-boundary.env"): "/opt/ic/share/ic-boundary.env", + Label("guestos/share/ic-gateway.env"): "/opt/ic/share/ic-gateway.env", Label("guestos/share/nns_public_key.pem"): "/opt/ic/share/nns_public_key.pem", # init diff --git a/ic-os/components/guestos/ic-replica.service 
b/ic-os/components/guestos/ic-replica.service index 5d360ee96673..7657e20e4e4b 100644 --- a/ic-os/components/guestos/ic-replica.service +++ b/ic-os/components/guestos/ic-replica.service @@ -20,7 +20,7 @@ User=ic-replica Environment=RUST_BACKTRACE=1 Environment=RUST_MIN_STACK=8192000 -ExecStart=/opt/ic/bin/orchestrator --replica-binary-dir /var/lib/ic/data/images --cup-dir /var/lib/ic/data/cups --replica-config-file /run/ic-node/config/ic.json5 --ic-boundary-env-file /opt/ic/share/ic-boundary.env --enable-provisional-registration --ic-binary-directory /opt/ic/bin --orchestrator-data-directory /var/lib/ic/data/orchestrator --version-file /opt/ic/share/version.txt +ExecStart=/opt/ic/bin/orchestrator --replica-binary-dir /var/lib/ic/data/images --cup-dir /var/lib/ic/data/cups --replica-config-file /run/ic-node/config/ic.json5 --ic-boundary-env-file /opt/ic/share/ic-boundary.env --ic-gateway-env-file /opt/ic/share/ic-gateway.env --enable-provisional-registration --ic-binary-directory /opt/ic/bin --orchestrator-data-directory /var/lib/ic/data/orchestrator --version-file /opt/ic/share/version.txt LimitNOFILE=16777216 Restart=always RestartSec=10 diff --git a/ic-os/components/guestos/share/ic-gateway.env b/ic-os/components/guestos/share/ic-gateway.env new file mode 100644 index 000000000000..a40c1845ed05 --- /dev/null +++ b/ic-os/components/guestos/share/ic-gateway.env @@ -0,0 +1,4 @@ +LISTEN_PLAIN=[::]:80 +LISTEN_INSECURE_SERVE_HTTP_ONLY=true +IC_URL=http://127.0.0.1:8080 +DOMAIN=gateway.icp diff --git a/ic-os/guestos/defs.bzl b/ic-os/guestos/defs.bzl index 19ee4989fdcd..880484ad4cb5 100644 --- a/ic-os/guestos/defs.bzl +++ b/ic-os/guestos/defs.bzl @@ -40,6 +40,7 @@ def image_deps(mode, malicious = False): "//publish/binaries:orchestrator": "/opt/ic/bin/orchestrator:0755", # Replica process manager, required by the IC protocol (upgrades, node addition, etc). 
("//publish/malicious:replica" if malicious else "//publish/binaries:replica"): "/opt/ic/bin/replica:0755", # Main protocol binary, required by the IC protocol. Installs the malicious replica iff set only in test builds. "//publish/binaries:ic-boundary": "/opt/ic/bin/ic-boundary:0755", # API boundary node binary, required by the IC protocol. The same GuestOS is used both for the replica and API boundary nodes. + "//rs/ic_os/release:ic-gateway": "/opt/ic/bin/ic-gateway:0755", # IC-gateway binary, required by cloud engine nodes, who run it as a sidecar to the replica. "//publish/binaries:ic-consensus-pool-util": "/opt/ic/bin/ic-consensus-pool-util:0755", # May be used during recoveries to export/import consensus pool artifacts. "//publish/binaries:ic-recovery": "/opt/ic/bin/ic-recovery:0755", # Required for performing subnet recoveries on the node directly. "//publish/binaries:state-tool": "/opt/ic/bin/state-tool:0755", # May be used during recoveries for calculating the state hash and inspecting the state more generally. 
diff --git a/rs/ic_os/release/BUILD.bazel b/rs/ic_os/release/BUILD.bazel index 27a70f3f07f0..1277aec21795 100644 --- a/rs/ic_os/release/BUILD.bazel +++ b/rs/ic_os/release/BUILD.bazel @@ -18,6 +18,7 @@ OBJECTS = { "metrics-proxy": "@crate_index//:metrics-proxy__metrics-proxy", "nss_icos": "//rs/ic_os/networking/nss_icos", "custom_metrics": "//rs/ic_os/metrics/custom_metrics:custom_metrics_bin", + "ic-gateway": "@crate_index//:ic-gateway__ic-gateway", } [release_strip_binary( diff --git a/rs/orchestrator/src/args.rs b/rs/orchestrator/src/args.rs index 5cbb135b8875..c3cc2f381da7 100644 --- a/rs/orchestrator/src/args.rs +++ b/rs/orchestrator/src/args.rs @@ -31,6 +31,10 @@ pub struct OrchestratorArgs { #[clap(long)] pub(crate) ic_boundary_env_file: PathBuf, + /// The path to the IC gateway environment file + #[clap(long)] + pub(crate) ic_gateway_env_file: PathBuf, + /// The path to the Replica binary location containing the following in case /// of guest OS deployment: version.txt, manageboot.sh, replica, /// install-upgrade.sh diff --git a/rs/orchestrator/src/dashboard.rs b/rs/orchestrator/src/dashboard.rs index c340e86c9ed1..9e8faeba4988 100644 --- a/rs/orchestrator/src/dashboard.rs +++ b/rs/orchestrator/src/dashboard.rs @@ -46,6 +46,7 @@ impl Dashboard for OrchestratorDashboard { subnet id: {}\n\ replica process id: {}\n\ ic-boundary process id: {}\n\ + ic-gateway process id: {}\n\ replica version: {}\n\ host os version: {}\n\ scheduled upgrade: {}\n\ @@ -63,6 +64,7 @@ impl Dashboard for OrchestratorDashboard { self.get_subnet_id(), self.get_replica_pid(), self.get_ic_boundary_pid(), + self.get_ic_gateway_pid(), self.replica_version, self.hostos_version .as_ref() @@ -150,6 +152,13 @@ impl OrchestratorDashboard { } } + fn get_ic_gateway_pid(&self) -> String { + match self.processes_manager.read().unwrap().get_ic_gateway_pid() { + Some(pid) => pid.to_string(), + None => "None".to_string(), + } + } + fn get_subnet_id(&self) -> String { match 
*self.subnet_assignment.read().unwrap() { SubnetAssignment::Assigned(id) => id.to_string(), diff --git a/rs/orchestrator/src/orchestrator.rs b/rs/orchestrator/src/orchestrator.rs index 77dbaae21b3b..20818c54dde9 100644 --- a/rs/orchestrator/src/orchestrator.rs +++ b/rs/orchestrator/src/orchestrator.rs @@ -7,7 +7,10 @@ use crate::{ hostos_upgrade::HostosUpgrader, ipv4_network::Ipv4Configurator, metrics::OrchestratorMetrics, - processes::{IcBoundaryProcessConfig, MultipleProcessesManager, ReplicaProcessConfig}, + processes::{ + IcBoundaryProcessConfig, IcGatewayProcessConfig, MultipleProcessesManager, + ReplicaProcessConfig, + }, registration::NodeRegistration, registry_helper::RegistryHelper, ssh_access_manager::SshAccessManager, @@ -269,10 +272,15 @@ impl Orchestrator { ic_boundary_env_file: args.ic_boundary_env_file.clone(), crypto_config: config.crypto.clone(), }; + let ic_gateway_process_config = IcGatewayProcessConfig { + ic_binary_dir: ic_binary_directory.clone(), + ic_gateway_env_file: args.ic_gateway_env_file.clone(), + }; let processes_manager = Arc::new(RwLock::new(MultipleProcessesManager::new( replica_process_config, - ic_boundary_process_config.clone(), + ic_boundary_process_config, + ic_gateway_process_config, Arc::clone(®istry), Arc::clone(&metrics), logger.clone(), diff --git a/rs/orchestrator/src/processes.rs b/rs/orchestrator/src/processes.rs index 8c8c37524e45..f6a94ba2a0ea 100644 --- a/rs/orchestrator/src/processes.rs +++ b/rs/orchestrator/src/processes.rs @@ -36,7 +36,7 @@ pub(crate) struct ReplicaProcess { } impl ReplicaProcess { - fn new( + pub(crate) fn new( config: ReplicaProcessConfig, replica_version: ReplicaVersion, subnet_id: SubnetId, @@ -144,7 +144,7 @@ pub(crate) struct IcBoundaryProcess { } impl IcBoundaryProcess { - fn new( + pub(crate) fn new( process_config: IcBoundaryProcessConfig, replica_version: ReplicaVersion, domain_name: String, @@ -201,7 +201,7 @@ pub(crate) struct IcBoundaryManager { pub process_runner: Box + Sync>, 
process_config: IcBoundaryProcessConfig, registry: Arc, - current_domain_name: Option, + pub current_domain_name: Option, metrics: Arc, logger: ReplicaLogger, } @@ -285,6 +285,107 @@ impl IcBoundaryManager { } } +// --------------------------------------------------------------------------- +// IcGatewayProcess +// --------------------------------------------------------------------------- + +#[derive(Clone)] +pub(crate) struct IcGatewayProcessConfig { + pub ic_binary_dir: PathBuf, + pub ic_gateway_env_file: PathBuf, +} + +pub(crate) struct IcGatewayProcess { + ic_binary_dir: PathBuf, + replica_version: ReplicaVersion, + env: HashMap, +} + +impl IcGatewayProcess { + pub(crate) fn new( + process_config: IcGatewayProcessConfig, + replica_version: ReplicaVersion, + ) -> OrchestratorResult { + let env = match env_file_reader::read_file(&process_config.ic_gateway_env_file) { + Ok(env) => env + .into_iter() + .map(|(k, v)| (OsString::from(k), OsString::from(v))) + .collect(), + Err(e) => { + return Err(OrchestratorError::IoError( + "unable to read ic-gateway environment variables".to_string(), + e, + )); + } + }; + + Ok(Self { + ic_binary_dir: process_config.ic_binary_dir, + replica_version, + env, + }) + } +} + +impl Process for IcGatewayProcess { + const NAME: &'static str = "ic-gateway"; + type Version = ReplicaVersion; + + fn get_version(&self) -> &Self::Version { + &self.replica_version + } + fn get_binary(&self) -> PathBuf { + self.ic_binary_dir.join(Self::NAME) + } + fn get_args(&self) -> Vec { + vec![] + } + fn get_env(&self) -> HashMap { + self.env.clone() + } +} + +pub(crate) struct IcGatewayManager { + pub process_runner: Box + Sync>, + process_config: IcGatewayProcessConfig, + metrics: Arc, + logger: ReplicaLogger, +} + +impl IcGatewayManager { + pub(crate) fn new( + process_config: IcGatewayProcessConfig, + metrics: Arc, + logger: ReplicaLogger, + ) -> Self { + let process_runner = Box::new(SingleProcessRunner::new(logger.clone())); + Self { + 
process_runner, + process_config, + metrics, + logger, + } + } + + fn ensure_ic_gateway_running( + &mut self, + replica_version: &ReplicaVersion, + ) -> OrchestratorResult<()> { + start_orchestrator_process( + &mut *self.process_runner, + IcGatewayProcess::new(self.process_config.clone(), replica_version.clone())?, + &self.metrics, + &self.logger, + ) + } + + fn stop_ic_gateway(&mut self) -> OrchestratorResult<()> { + self.process_runner.stop().map_err(|e| { + OrchestratorError::IoError("Error when attempting to stop ic-gateway".to_string(), e) + }) + } +} + // --------------------------------------------------------------------------- // MultipleProcessManager // @@ -296,6 +397,7 @@ impl IcBoundaryManager { pub(crate) struct MultipleProcessesManager { replica_manager: Arc>, ic_boundary_manager: Arc>, + ic_gateway_manager: Arc>, registry: Arc, } @@ -303,6 +405,7 @@ impl MultipleProcessesManager { pub(crate) fn new( replica_process_config: ReplicaProcessConfig, ic_boundary_process_config: IcBoundaryProcessConfig, + ic_gateway_process_config: IcGatewayProcessConfig, registry: Arc, metrics: Arc, logger: ReplicaLogger, @@ -315,6 +418,11 @@ impl MultipleProcessesManager { let ic_boundary_manager = Arc::new(RwLock::new(IcBoundaryManager::new( ic_boundary_process_config, registry.clone(), + metrics.clone(), + logger.clone(), + ))); + let ic_gateway_manager = Arc::new(RwLock::new(IcGatewayManager::new( + ic_gateway_process_config, metrics, logger, ))); @@ -322,14 +430,29 @@ impl MultipleProcessesManager { Self { replica_manager, ic_boundary_manager, + ic_gateway_manager, registry, } } + // Used in tests to assert the state of the managed processes. + #[cfg(test)] + pub(crate) fn replica_manager(&self) -> Arc> { + self.replica_manager.clone() + } + + // Used in tests to assert the state of the managed processes, but also in production code to + // share the `ic-boundary` process with `BoundaryNodeManager`. 
pub(crate) fn ic_boundary_manager(&self) -> Arc> { self.ic_boundary_manager.clone() } + // Used in tests to assert the state of the managed processes. + #[cfg(test)] + pub(crate) fn ic_gateway_manager(&self) -> Arc> { + self.ic_gateway_manager.clone() + } + pub(crate) fn get_replica_pid(&self) -> Option { self.replica_manager .read() @@ -346,20 +469,30 @@ impl MultipleProcessesManager { .get_pid() } + pub(crate) fn get_ic_gateway_pid(&self) -> Option { + self.ic_gateway_manager + .read() + .unwrap() + .process_runner + .get_pid() + } + /// Start all processes appropriate for this node. /// /// Always starts the replica. For cloud-engine subnet nodes it also - /// starts ic-boundary, restarting it if the domain name has changed. + /// starts ic-boundary, restarting it if the domain name has changed, + /// and ic-gateway. pub(crate) fn start_all( &mut self, replica_version: &ReplicaVersion, subnet_id: SubnetId, registry_version: RegistryVersion, ) -> OrchestratorResult<()> { - self.replica_manager - .write() - .unwrap() - .ensure_replica_running(replica_version, subnet_id)?; + let mut replica_manager = self.replica_manager.write().unwrap(); + let mut ic_boundary_manager = self.ic_boundary_manager.write().unwrap(); + let mut ic_gateway_manager = self.ic_gateway_manager.write().unwrap(); + + replica_manager.ensure_replica_running(replica_version, subnet_id)?; // Cloud-engine nodes run ic-boundary as a sidecar. match self.registry.get_subnet_type(subnet_id, registry_version)? 
{ @@ -368,19 +501,15 @@ impl MultipleProcessesManager { | Some(SubnetType::Application) | Some(SubnetType::System) | Some(SubnetType::VerifiedApplication) => { - self.ic_boundary_manager - .write() - .unwrap() - .stop_ic_boundary()?; + ic_boundary_manager.stop_ic_boundary()?; + ic_gateway_manager.stop_ic_gateway()?; } Some(SubnetType::CloudEngine) => { - self.ic_boundary_manager - .write() - .unwrap() - .ensure_ic_boundary_running_and_restarted_on_domain_change( - replica_version, - registry_version, - ); + ic_boundary_manager.ensure_ic_boundary_running_and_restarted_on_domain_change( + replica_version, + registry_version, + ); + ic_gateway_manager.ensure_ic_gateway_running(replica_version)?; } } @@ -394,11 +523,13 @@ impl MultipleProcessesManager { /// Stop every managed process. pub(crate) fn stop_all(&mut self) -> OrchestratorResult<()> { - self.replica_manager.write().unwrap().stop_replica()?; - self.ic_boundary_manager - .write() - .unwrap() - .stop_ic_boundary()?; + let mut replica_manager = self.replica_manager.write().unwrap(); + let mut ic_boundary_manager = self.ic_boundary_manager.write().unwrap(); + let mut ic_gateway_manager = self.ic_gateway_manager.write().unwrap(); + + replica_manager.stop_replica()?; + ic_boundary_manager.stop_ic_boundary()?; + ic_gateway_manager.stop_ic_gateway()?; Ok(()) } diff --git a/rs/orchestrator/src/upgrade.rs b/rs/orchestrator/src/upgrade.rs index 7a0eea7faedd..31a9d7510fc2 100644 --- a/rs/orchestrator/src/upgrade.rs +++ b/rs/orchestrator/src/upgrade.rs @@ -1097,10 +1097,15 @@ fn report_master_public_key_changed_metric( mod tests { use crate::catch_up_package_provider::LocalCUPReader; use crate::catch_up_package_provider::tests::mock_tls_config; - use crate::processes::ReplicaProcessConfig; + use crate::process_manager::{Process, ProcessRunner}; + use crate::processes::{ + IcBoundaryProcess, IcBoundaryProcessConfig, IcGatewayProcess, IcGatewayProcessConfig, + ReplicaProcess, ReplicaProcessConfig, + }; use super::*; use 
assert_matches::assert_matches; + use ic_config::crypto::CryptoConfig; use ic_crypto_test_utils_canister_threshold_sigs::{ CanisterThresholdSigTestEnvironment, IDkgParticipants, generate_key_transcript, }; @@ -1121,7 +1126,9 @@ mod tests { use ic_protobuf::registry::subnet::v1::{CatchUpPackageContents, InitialNiDkgTranscriptRecord}; use ic_protobuf::registry::unassigned_nodes_config::v1::UnassignedNodesConfigRecord; use ic_protobuf::registry::{ - node::v1::NodeRecord, replica_version::v1::ReplicaVersionRecord, subnet::v1::SubnetRecord, + node::v1::NodeRecord, + replica_version::v1::ReplicaVersionRecord, + subnet::v1::{SubnetRecord, SubnetType}, }; use ic_protobuf::types::v1 as pb; use ic_registry_client_fake::FakeRegistryClient; @@ -1161,18 +1168,14 @@ mod tests { use rstest::rstest; use slog::Level; use std::{ - collections::{BTreeMap, BTreeSet, HashMap}, + collections::{BTreeMap, BTreeSet}, ffi::OsStr, path::Path, process::Output, }; use tempfile::{TempDir, tempdir}; - impl MultipleProcessesManager { - pub(crate) fn replica_manager(&self) -> Arc> { - self.replica_manager.clone() - } - } + const DOMAIN_NAME: &str = "domain.name"; impl Upgrade { pub fn subnet_assignment(&self) -> SubnetAssignment { @@ -1200,10 +1203,21 @@ mod tests { .process_runner .is_running() } + + pub fn is_ic_gateway_running(&self) -> bool { + self.processes_manager + .read() + .unwrap() + .ic_gateway_manager() + .read() + .unwrap() + .process_runner + .is_running() + } } /// Fake runner that tracks running state without spawning a real process. - /// Used as a drop-in for `SingleProcessRunner

` inside `ProcessManager`. + /// Used as a drop-in for `SingleProcessRunner

` inside process managers. pub(crate) struct FakeRunner { running: bool, } @@ -1522,6 +1536,7 @@ mod tests { let metrics = Arc::new(OrchestratorMetrics::new(&MetricsRegistry::new())); let cup_dir = dir.join("cups"); + let cup_file = cup_dir.join("cup.types.v1.CatchUpPackage.pb"); std::fs::create_dir_all(&cup_dir).unwrap(); if let Some(local_cup) = has_local_cup { let cup = make_local_cup( @@ -1530,7 +1545,6 @@ mod tests { local_cup.registry_version, ); let cup_proto = pb::CatchUpPackage::from(cup); - let cup_file = cup_dir.join("cup.types.v1.CatchUpPackage.pb"); std::fs::write(&cup_file, cup_proto.encode_to_vec()).unwrap(); } let cup_provider = CatchUpPackageProvider::new( @@ -1547,6 +1561,8 @@ mod tests { std::fs::create_dir_all(&ic_binary_dir).unwrap(); let ic_boundary_env_file = dir.join("ic-boundary.env"); std::fs::write(&ic_boundary_env_file, b"TEST_KEY=TEST_VALUE").unwrap(); + let ic_gateway_env_file = dir.join("ic-gateway.env"); + std::fs::write(&ic_gateway_env_file, b"TEST_KEY=TEST_VALUE").unwrap(); let crypto_config = CryptoConfig::default(); @@ -1560,20 +1576,30 @@ mod tests { ic_boundary_env_file, crypto_config, }; - + let ic_gateway_process_config = IcGatewayProcessConfig { + ic_binary_dir: ic_binary_dir.clone(), + ic_gateway_env_file, + }; let processes_manager = MultipleProcessesManager::new( replica_process_config.clone(), ic_boundary_process_config.clone(), + ic_gateway_process_config.clone(), Arc::clone(®istry), Arc::clone(&metrics), logger.clone(), ); + let replica_manager_lock = processes_manager.replica_manager(); + let mut replica_manager = replica_manager_lock.write().unwrap(); + replica_manager.process_runner = Box::new(FakeRunner::new()); + let ic_boundary_manager_lock = processes_manager.ic_boundary_manager(); + let mut ic_boundary_manager = ic_boundary_manager_lock.write().unwrap(); + ic_boundary_manager.process_runner = Box::new(FakeRunner::new()); + let ic_gateway_manager_lock = processes_manager.ic_gateway_manager(); + let mut 
ic_gateway_manager = ic_gateway_manager_lock.write().unwrap(); + ic_gateway_manager.process_runner = Box::new(FakeRunner::new()); // Start the replica process if the test scenario indicates so if test_scenario.were_child_processes_started_previously() { - processes_manager - .replica_manager() - .write() - .unwrap() + replica_manager .process_runner .start(ReplicaProcess::new( replica_process_config, @@ -1582,17 +1608,30 @@ mod tests { )) .unwrap(); if matches!(subnet_type, SubnetType::CloudEngine) { - // Simulate ic-boundary already running by faking a start. - processes_manager - .ic_boundary_manager() - .write() - .unwrap() + // After starting ic-boundary, its manager sets its internal domain name, so set it + // here to simulate a previous start of ic-boundary. + ic_boundary_manager.current_domain_name = Some(DOMAIN_NAME.to_string()); + // Simulate ic-boundary and ic-gateway already running by faking a start. + ic_boundary_manager .process_runner - .start(IcBoundaryProcess::new( - ic_boundary_process_config, - current_replica_version.clone(), - "domain.name".to_string(), - )) + .start( + IcBoundaryProcess::new( + ic_boundary_process_config, + current_replica_version.clone(), + DOMAIN_NAME.to_string(), + ) + .unwrap(), + ) + .unwrap(); + ic_gateway_manager + .process_runner + .start( + IcGatewayProcess::new( + ic_gateway_process_config, + current_replica_version.clone(), + ) + .unwrap(), + ) .unwrap(); } } @@ -1805,7 +1844,7 @@ mod tests { let mut node_record = NodeRecord::default(); if matches!(self.subnet_type, SubnetType::CloudEngine) { // Nodes in Cloud engines must have a domain name to start ic-boundary - node_record.domain = Some("domain.name".to_string()); + node_record.domain = Some(DOMAIN_NAME.to_string()); } add_node_record_to_provider( &data_provider, @@ -2455,6 +2494,7 @@ mod tests { if matches!(self.subnet_type, SubnetType::CloudEngine) { assert_has_started(ReplicaProcess::NAME); assert_has_started(IcBoundaryProcess::NAME); + 
assert_has_started(IcGatewayProcess::NAME); } else { assert_has_started(ReplicaProcess::NAME); }; @@ -2462,6 +2502,7 @@ mod tests { let assert_has_not_started_new_processes = || { assert_has_not_started(ReplicaProcess::NAME); assert_has_not_started(IcBoundaryProcess::NAME); + assert_has_not_started(IcGatewayProcess::NAME); }; match &self.has_local_cup { Some(local_cup) => { @@ -2480,12 +2521,14 @@ mod tests { { assert_has_started(ReplicaProcess::NAME); assert_has_started(IcBoundaryProcess::NAME); + assert_has_started(IcGatewayProcess::NAME); } (Some((registry_cup, _)), _, _) if registry_cup.height >= local_cup.height => { assert_has_started(ReplicaProcess::NAME); assert_has_not_started(IcBoundaryProcess::NAME); + assert_has_not_started(IcGatewayProcess::NAME); } (_, true, _) => { assert_has_not_started_new_processes(); @@ -2742,6 +2785,11 @@ mod tests { | Ok(OrchestratorControlFlow::Leaving(_)) ) || test_scenario.were_child_processes_started_previously()) ); + // - The ic-gateway process is running <=> the ic-boundary process is running + assert_eq!( + upgrade_loop.is_ic_gateway_running(), + upgrade_loop.is_ic_boundary_running(), + ); // - As an assigned node: if new_subnet_assignment != SubnetAssignment::Unassigned { // - If the replicator has not yet replicated all versions before init, then we should never @@ -2821,10 +2869,8 @@ mod tests { #[values( None, Some(RegistryVersion::from(5)), - Some(RegistryVersion::from(10)), Some(RegistryVersion::from(50)), - Some(RegistryVersion::from(100)), - Some(RegistryVersion::from(150)) + Some(RegistryVersion::from(100)) )] is_leaving: Option, #[values(false, true)] does_upgrade: bool, @@ -2833,10 +2879,8 @@ mod tests { RegistryVersion::from(3), RegistryVersion::from(5), RegistryVersion::from(10), - RegistryVersion::from(75), RegistryVersion::from(100), - RegistryVersion::from(150), - RegistryVersion::from(175) + RegistryVersion::from(150) )] upgrade_registry_version: RegistryVersion, #[values(false, true)] 
upgrade_is_recalled: bool, diff --git a/rs/tests/consensus/orchestrator/BUILD.bazel b/rs/tests/consensus/orchestrator/BUILD.bazel index 920936097c74..908b6505795d 100644 --- a/rs/tests/consensus/orchestrator/BUILD.bazel +++ b/rs/tests/consensus/orchestrator/BUILD.bazel @@ -130,6 +130,7 @@ system_test( "@crate_index//:reqwest", "@crate_index//:serde_cbor", "@crate_index//:slog", + "@crate_index//:url", ], ) diff --git a/rs/tests/consensus/orchestrator/cloud_engine_ic_gateway_test.rs b/rs/tests/consensus/orchestrator/cloud_engine_ic_gateway_test.rs index 4bfadb591912..5a77677c1005 100644 --- a/rs/tests/consensus/orchestrator/cloud_engine_ic_gateway_test.rs +++ b/rs/tests/consensus/orchestrator/cloud_engine_ic_gateway_test.rs @@ -21,7 +21,7 @@ Every cloud engine node reports a healthy status on port 80. end::catalog[] */ -use anyhow::{Context, Result, bail}; +use anyhow::{Result, bail}; use ic_registry_subnet_type::SubnetType; use ic_system_test_driver::driver::group::SystemTestGroup; use ic_system_test_driver::driver::ic::{InternetComputer, Subnet}; @@ -35,6 +35,7 @@ use ic_system_test_driver::{retry_with_msg_async, systest}; use ic_types::messages::{HttpStatusResponse, ReplicaHealthStatus}; use slog::{Logger, info}; use std::time::Duration; +use url::Url; /// Port on which `ic-gateway` exposes the public API of a cloud engine node. /// @@ -44,6 +45,9 @@ use std::time::Duration; /// network by the cloud-engine firewall rules). const IC_GATEWAY_PORT: u16 = 80; +/// Domain name that all `ic-gateway`s are configured to serve. +const IC_GATEWAY_DOMAIN: &str = "gateway.icp"; + /// Number of nodes in the cloud engine subnet under test. const CLOUD_ENGINE_NODES: usize = 4; @@ -118,14 +122,8 @@ fn test(env: TestEnv) { /// This mirrors the driver's standard health check (`status_is_healthy`), but /// retargets it from port 8080 to port 80. 
async fn await_healthy_on_ic_gateway(node: &IcNodeSnapshot, logger: &Logger) -> Result<()> { - // `get_public_url` yields the replica's URL on port 8080; rewrite the port to - // reach the co-located `ic-gateway` instead. - let mut url = node.get_public_url(); - url.set_port(Some(IC_GATEWAY_PORT)) - .map_err(|_| anyhow::anyhow!("failed to set port {IC_GATEWAY_PORT} on {url}"))?; - let status_url = url - .join("api/v2/status") - .expect("failed to join status path"); + let status_url = Url::parse(&format!("http://{IC_GATEWAY_DOMAIN}/api/v2/status")) + .expect("failed to parse status URL"); retry_with_msg_async!( format!( @@ -138,6 +136,10 @@ async fn await_healthy_on_ic_gateway(node: &IcNodeSnapshot, logger: &Logger) -> || async { let response = reqwest::Client::builder() .timeout(STATUS_REQUEST_TIMEOUT) + .resolve( + IC_GATEWAY_DOMAIN, + (node.get_ip_addr(), IC_GATEWAY_PORT).into(), + ) .build() .expect("cannot build a reqwest client") .get(status_url.clone()) From de65d146076bc96d017dd3c74f7fd7fc489b6fdc Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Wed, 17 Jun 2026 09:31:23 +0000 Subject: [PATCH 06/42] feat: run ic-gateway only for now --- .../components/guestos/share/ic-boundary.env | 2 +- rs/orchestrator/src/boundary_node.rs | 10 +- rs/orchestrator/src/dashboard.rs | 9 -- rs/orchestrator/src/orchestrator.rs | 23 ++-- rs/orchestrator/src/processes.rs | 108 +++++---------- rs/orchestrator/src/upgrade.rs | 128 +++--------------- 6 files changed, 75 insertions(+), 205 deletions(-) diff --git a/ic-os/components/guestos/share/ic-boundary.env b/ic-os/components/guestos/share/ic-boundary.env index 1791e4a9905a..c23bfa9de7f2 100644 --- a/ic-os/components/guestos/share/ic-boundary.env +++ b/ic-os/components/guestos/share/ic-boundary.env @@ -1,7 +1,7 @@ LISTEN_HTTPS_PORT="443" TLS_CERT_PATH="/var/lib/ic/data/ic-boundary-tls.crt" TLS_PKEY_PATH="/var/lib/ic/data/ic-boundary-tls.key" -TLS_ACME_CREDENTIALS_PATH="/var/lib/ic/data" # Check that it's fine (it should be) 
+TLS_ACME_CREDENTIALS_PATH="/var/lib/ic/data" HTTP_CLIENT_TIMEOUT_CONNECT="3s" REGISTRY_LOCAL_STORE_PATH="/var/lib/ic/data/ic_registry_local_store" RATE_LIMIT_GENERIC_CANISTER_ID="u637p-5aaaa-aaaaq-qaaca-cai" diff --git a/rs/orchestrator/src/boundary_node.rs b/rs/orchestrator/src/boundary_node.rs index 121ea787c9ed..9c3c14dfb80d 100644 --- a/rs/orchestrator/src/boundary_node.rs +++ b/rs/orchestrator/src/boundary_node.rs @@ -3,11 +3,11 @@ use crate::{ }; use ic_logger::{ReplicaLogger, warn}; use ic_types::{NodeId, ReplicaVersion}; -use std::sync::{Arc, RwLock}; +use std::sync::Arc; pub(crate) struct BoundaryNodeManager { registry: Arc, - process_manager: Arc>, + process_manager: IcBoundaryManager, version: ReplicaVersion, node_id: NodeId, logger: ReplicaLogger, @@ -16,7 +16,7 @@ pub(crate) struct BoundaryNodeManager { impl BoundaryNodeManager { pub(crate) fn new( registry: Arc, - process_manager: Arc>, + process_manager: IcBoundaryManager, version: ReplicaVersion, node_id: NodeId, logger: ReplicaLogger, @@ -48,8 +48,6 @@ impl BoundaryNodeManager { // serving requests while the orchestrator is downloading the new image in most cases. 
} else { self.process_manager - .write() - .unwrap() .ensure_ic_boundary_running_and_restarted_on_domain_change( &self.version, registry_version, @@ -58,7 +56,7 @@ impl BoundaryNodeManager { } // BN should not be active Err(OrchestratorError::ApiBoundaryNodeMissingError(_, _)) => { - if let Err(err) = self.process_manager.write().unwrap().stop_ic_boundary() { + if let Err(err) = self.process_manager.stop_ic_boundary() { warn!(self.logger, "Failed to stop Boundary Node: {}", err); } } diff --git a/rs/orchestrator/src/dashboard.rs b/rs/orchestrator/src/dashboard.rs index 9e8faeba4988..0ec22e5b6a86 100644 --- a/rs/orchestrator/src/dashboard.rs +++ b/rs/orchestrator/src/dashboard.rs @@ -45,7 +45,6 @@ impl Dashboard for OrchestratorDashboard { last poll's certified time: {}\n\ subnet id: {}\n\ replica process id: {}\n\ - ic-boundary process id: {}\n\ ic-gateway process id: {}\n\ replica version: {}\n\ host os version: {}\n\ @@ -63,7 +62,6 @@ impl Dashboard for OrchestratorDashboard { self.get_last_poll_certified_time(), self.get_subnet_id(), self.get_replica_pid(), - self.get_ic_boundary_pid(), self.get_ic_gateway_pid(), self.replica_version, self.hostos_version @@ -145,13 +143,6 @@ impl OrchestratorDashboard { } } - fn get_ic_boundary_pid(&self) -> String { - match self.processes_manager.read().unwrap().get_ic_boundary_pid() { - Some(pid) => pid.to_string(), - None => "None".to_string(), - } - } - fn get_ic_gateway_pid(&self) -> String { match self.processes_manager.read().unwrap().get_ic_gateway_pid() { Some(pid) => pid.to_string(), diff --git a/rs/orchestrator/src/orchestrator.rs b/rs/orchestrator/src/orchestrator.rs index 20818c54dde9..65493553018a 100644 --- a/rs/orchestrator/src/orchestrator.rs +++ b/rs/orchestrator/src/orchestrator.rs @@ -8,8 +8,8 @@ use crate::{ ipv4_network::Ipv4Configurator, metrics::OrchestratorMetrics, processes::{ - IcBoundaryProcessConfig, IcGatewayProcessConfig, MultipleProcessesManager, - ReplicaProcessConfig, + IcBoundaryManager, 
IcBoundaryProcessConfig, IcGatewayProcessConfig, + MultipleProcessesManager, ReplicaProcessConfig, }, registration::NodeRegistration, registry_helper::RegistryHelper, @@ -267,11 +267,6 @@ impl Orchestrator { cup_path: local_cup_reader.get_cup_path(), replica_config_file: args.replica_config_file.clone(), }; - let ic_boundary_process_config = IcBoundaryProcessConfig { - ic_binary_dir: ic_binary_directory.clone(), - ic_boundary_env_file: args.ic_boundary_env_file.clone(), - crypto_config: config.crypto.clone(), - }; let ic_gateway_process_config = IcGatewayProcessConfig { ic_binary_dir: ic_binary_directory.clone(), ic_gateway_env_file: args.ic_gateway_env_file.clone(), @@ -279,7 +274,6 @@ impl Orchestrator { let processes_manager = Arc::new(RwLock::new(MultipleProcessesManager::new( replica_process_config, - ic_boundary_process_config, ic_gateway_process_config, Arc::clone(®istry), Arc::clone(&metrics), @@ -345,9 +339,20 @@ impl Orchestrator { ), }; + let ic_boundary_process_config = IcBoundaryProcessConfig { + ic_binary_dir: ic_binary_directory.clone(), + ic_boundary_env_file: args.ic_boundary_env_file.clone(), + crypto_config: config.crypto.clone(), + }; + let ic_boundary_manager = IcBoundaryManager::new( + ic_boundary_process_config, + Arc::clone(®istry), + Arc::clone(&metrics), + logger.clone(), + ); let boundary_node = BoundaryNodeManager::new( Arc::clone(®istry), - processes_manager.read().unwrap().ic_boundary_manager(), + ic_boundary_manager, replica_version.clone(), node_id, logger.clone(), diff --git a/rs/orchestrator/src/processes.rs b/rs/orchestrator/src/processes.rs index f6a94ba2a0ea..9ab557bbfaf0 100644 --- a/rs/orchestrator/src/processes.rs +++ b/rs/orchestrator/src/processes.rs @@ -13,7 +13,7 @@ use std::{ collections::HashMap, ffi::OsString, path::PathBuf, - sync::{Arc, RwLock}, + sync::Arc, }; // --------------------------------------------------------------------------- @@ -395,41 +395,38 @@ impl IcGatewayManager { // 
--------------------------------------------------------------------------- pub(crate) struct MultipleProcessesManager { - replica_manager: Arc>, - ic_boundary_manager: Arc>, - ic_gateway_manager: Arc>, + replica_manager: ReplicaManager, + ic_gateway_manager: IcGatewayManager, registry: Arc, } impl MultipleProcessesManager { + #[cfg(test)] + pub(crate) fn new_for_test( + replica_manager: ReplicaManager, + ic_gateway_manager: IcGatewayManager, + registry: Arc, + ) -> Self { + Self { + replica_manager, + ic_gateway_manager, + registry, + } + } + pub(crate) fn new( replica_process_config: ReplicaProcessConfig, - ic_boundary_process_config: IcBoundaryProcessConfig, ic_gateway_process_config: IcGatewayProcessConfig, registry: Arc, metrics: Arc, logger: ReplicaLogger, ) -> Self { - let replica_manager = Arc::new(RwLock::new(ReplicaManager::new( - replica_process_config, - metrics.clone(), - logger.clone(), - ))); - let ic_boundary_manager = Arc::new(RwLock::new(IcBoundaryManager::new( - ic_boundary_process_config, - registry.clone(), - metrics.clone(), - logger.clone(), - ))); - let ic_gateway_manager = Arc::new(RwLock::new(IcGatewayManager::new( - ic_gateway_process_config, - metrics, - logger, - ))); + let replica_manager = + ReplicaManager::new(replica_process_config, metrics.clone(), logger.clone()); + let ic_gateway_manager = IcGatewayManager::new(ic_gateway_process_config, metrics, logger); Self { replica_manager, - ic_boundary_manager, ic_gateway_manager, registry, } @@ -437,79 +434,49 @@ impl MultipleProcessesManager { // Used in tests to assert the state of the managed processes. #[cfg(test)] - pub(crate) fn replica_manager(&self) -> Arc> { - self.replica_manager.clone() - } - - // Used in tests to assert the state of the managed processes, but also in production code to - // share the `ic-boundary` process with `BoundaryNodeManager`. 
- pub(crate) fn ic_boundary_manager(&self) -> Arc> { - self.ic_boundary_manager.clone() + pub(crate) fn is_replica_running(&self) -> bool { + self.replica_manager.process_runner.is_running() } // Used in tests to assert the state of the managed processes. #[cfg(test)] - pub(crate) fn ic_gateway_manager(&self) -> Arc> { - self.ic_gateway_manager.clone() + pub(crate) fn is_ic_gateway_running(&self) -> bool { + self.ic_gateway_manager.process_runner.is_running() } pub(crate) fn get_replica_pid(&self) -> Option { - self.replica_manager - .read() - .unwrap() - .process_runner - .get_pid() - } - - pub(crate) fn get_ic_boundary_pid(&self) -> Option { - self.ic_boundary_manager - .read() - .unwrap() - .process_runner - .get_pid() + self.replica_manager.process_runner.get_pid() } pub(crate) fn get_ic_gateway_pid(&self) -> Option { - self.ic_gateway_manager - .read() - .unwrap() - .process_runner - .get_pid() + self.ic_gateway_manager.process_runner.get_pid() } /// Start all processes appropriate for this node. /// /// Always starts the replica. For cloud-engine subnet nodes it also - /// starts ic-boundary, restarting it if the domain name has changed, - /// and ic-gateway. + /// starts ic-gateway. pub(crate) fn start_all( &mut self, replica_version: &ReplicaVersion, subnet_id: SubnetId, registry_version: RegistryVersion, ) -> OrchestratorResult<()> { - let mut replica_manager = self.replica_manager.write().unwrap(); - let mut ic_boundary_manager = self.ic_boundary_manager.write().unwrap(); - let mut ic_gateway_manager = self.ic_gateway_manager.write().unwrap(); - - replica_manager.ensure_replica_running(replica_version, subnet_id)?; + self.replica_manager + .ensure_replica_running(replica_version, subnet_id)?; - // Cloud-engine nodes run ic-boundary as a sidecar. + // Cloud-engine nodes run ic-gateway as a sidecar. match self.registry.get_subnet_type(subnet_id, registry_version)? 
{ None | Some(SubnetType::Unspecified) | Some(SubnetType::Application) | Some(SubnetType::System) | Some(SubnetType::VerifiedApplication) => { - ic_boundary_manager.stop_ic_boundary()?; - ic_gateway_manager.stop_ic_gateway()?; + self.ic_gateway_manager.stop_ic_gateway()?; } Some(SubnetType::CloudEngine) => { - ic_boundary_manager.ensure_ic_boundary_running_and_restarted_on_domain_change( - replica_version, - registry_version, - ); - ic_gateway_manager.ensure_ic_gateway_running(replica_version)?; + self.ic_gateway_manager + .ensure_ic_gateway_running(replica_version)?; } } @@ -518,18 +485,13 @@ impl MultipleProcessesManager { /// Stop the replica process. pub(crate) fn stop_replica(&mut self) -> OrchestratorResult<()> { - self.replica_manager.write().unwrap().stop_replica() + self.replica_manager.stop_replica() } /// Stop every managed process. pub(crate) fn stop_all(&mut self) -> OrchestratorResult<()> { - let mut replica_manager = self.replica_manager.write().unwrap(); - let mut ic_boundary_manager = self.ic_boundary_manager.write().unwrap(); - let mut ic_gateway_manager = self.ic_gateway_manager.write().unwrap(); - - replica_manager.stop_replica()?; - ic_boundary_manager.stop_ic_boundary()?; - ic_gateway_manager.stop_ic_gateway()?; + self.replica_manager.stop_replica()?; + self.ic_gateway_manager.stop_ic_gateway()?; Ok(()) } diff --git a/rs/orchestrator/src/upgrade.rs b/rs/orchestrator/src/upgrade.rs index 31a9d7510fc2..c8f417625347 100644 --- a/rs/orchestrator/src/upgrade.rs +++ b/rs/orchestrator/src/upgrade.rs @@ -1099,13 +1099,12 @@ mod tests { use crate::catch_up_package_provider::tests::mock_tls_config; use crate::process_manager::{Process, ProcessRunner}; use crate::processes::{ - IcBoundaryProcess, IcBoundaryProcessConfig, IcGatewayProcess, IcGatewayProcessConfig, - ReplicaProcess, ReplicaProcessConfig, + IcGatewayManager, IcGatewayProcess, IcGatewayProcessConfig, ReplicaManager, ReplicaProcess, + ReplicaProcessConfig, }; use super::*; use 
assert_matches::assert_matches; - use ic_config::crypto::CryptoConfig; use ic_crypto_test_utils_canister_threshold_sigs::{ CanisterThresholdSigTestEnvironment, IDkgParticipants, generate_key_transcript, }; @@ -1126,15 +1125,14 @@ mod tests { use ic_protobuf::registry::subnet::v1::{CatchUpPackageContents, InitialNiDkgTranscriptRecord}; use ic_protobuf::registry::unassigned_nodes_config::v1::UnassignedNodesConfigRecord; use ic_protobuf::registry::{ - node::v1::NodeRecord, replica_version::v1::ReplicaVersionRecord, subnet::v1::{SubnetRecord, SubnetType}, }; use ic_protobuf::types::v1 as pb; use ic_registry_client_fake::FakeRegistryClient; use ic_registry_keys::{ - ROOT_SUBNET_ID_KEY, make_catch_up_package_contents_key, make_node_record_key, - make_replica_version_key, make_subnet_record_key, make_unassigned_nodes_config_record_key, + ROOT_SUBNET_ID_KEY, make_catch_up_package_contents_key, make_replica_version_key, + make_subnet_record_key, make_unassigned_nodes_config_record_key, }; use ic_registry_proto_data_provider::ProtoRegistryDataProvider; use ic_test_utilities_consensus::fake::{Fake, FakeContent}; @@ -1175,44 +1173,20 @@ mod tests { }; use tempfile::{TempDir, tempdir}; - const DOMAIN_NAME: &str = "domain.name"; - impl Upgrade { pub fn subnet_assignment(&self) -> SubnetAssignment { *self.subnet_assignment.read().unwrap() } pub fn is_replica_running(&self) -> bool { - self.processes_manager - .read() - .unwrap() - .replica_manager() - .read() - .unwrap() - .process_runner - .is_running() - } - - pub fn is_ic_boundary_running(&self) -> bool { - self.processes_manager - .read() - .unwrap() - .ic_boundary_manager() - .read() - .unwrap() - .process_runner - .is_running() + self.processes_manager.read().unwrap().is_replica_running() } pub fn is_ic_gateway_running(&self) -> bool { self.processes_manager .read() .unwrap() - .ic_gateway_manager() - .read() - .unwrap() - .process_runner - .is_running() + .is_ic_gateway_running() } } @@ -1361,21 +1335,6 @@ mod tests { 
make_cup_with_summary(height, summary_payload) } - fn add_node_record_to_provider( - data_provider: &ProtoRegistryDataProvider, - registry_version: RegistryVersion, - node_id: NodeId, - node_record: NodeRecord, - ) { - data_provider - .add( - &make_node_record_key(node_id), - registry_version, - Some(node_record), - ) - .unwrap(); - } - fn add_root_subnet_id_to_provider( data_provider: &ProtoRegistryDataProvider, registry_version: RegistryVersion, @@ -1559,43 +1518,29 @@ mod tests { let replica_config_file = dir.join("ic.json5"); let ic_binary_dir = dir.join("ic_binary"); std::fs::create_dir_all(&ic_binary_dir).unwrap(); - let ic_boundary_env_file = dir.join("ic-boundary.env"); - std::fs::write(&ic_boundary_env_file, b"TEST_KEY=TEST_VALUE").unwrap(); let ic_gateway_env_file = dir.join("ic-gateway.env"); std::fs::write(&ic_gateway_env_file, b"TEST_KEY=TEST_VALUE").unwrap(); - let crypto_config = CryptoConfig::default(); - let replica_process_config = ReplicaProcessConfig { ic_binary_dir: ic_binary_dir.clone(), cup_path: cup_file, replica_config_file: replica_config_file.clone(), }; - let ic_boundary_process_config = IcBoundaryProcessConfig { - ic_binary_dir: ic_binary_dir.clone(), - ic_boundary_env_file, - crypto_config, - }; + let mut replica_manager = ReplicaManager::new( + replica_process_config.clone(), + Arc::clone(&metrics), + logger.clone(), + ); + replica_manager.process_runner = Box::new(FakeRunner::new()); let ic_gateway_process_config = IcGatewayProcessConfig { ic_binary_dir: ic_binary_dir.clone(), ic_gateway_env_file, }; - let processes_manager = MultipleProcessesManager::new( - replica_process_config.clone(), - ic_boundary_process_config.clone(), + let mut ic_gateway_manager = IcGatewayManager::new( ic_gateway_process_config.clone(), - Arc::clone(®istry), Arc::clone(&metrics), logger.clone(), ); - let replica_manager_lock = processes_manager.replica_manager(); - let mut replica_manager = replica_manager_lock.write().unwrap(); - 
replica_manager.process_runner = Box::new(FakeRunner::new()); - let ic_boundary_manager_lock = processes_manager.ic_boundary_manager(); - let mut ic_boundary_manager = ic_boundary_manager_lock.write().unwrap(); - ic_boundary_manager.process_runner = Box::new(FakeRunner::new()); - let ic_gateway_manager_lock = processes_manager.ic_gateway_manager(); - let mut ic_gateway_manager = ic_gateway_manager_lock.write().unwrap(); ic_gateway_manager.process_runner = Box::new(FakeRunner::new()); // Start the replica process if the test scenario indicates so if test_scenario.were_child_processes_started_previously() { @@ -1608,21 +1553,7 @@ mod tests { )) .unwrap(); if matches!(subnet_type, SubnetType::CloudEngine) { - // After starting ic-boundary, its manager sets its internal domain name, so set it - // here to simulate a previous start of ic-boundary. - ic_boundary_manager.current_domain_name = Some(DOMAIN_NAME.to_string()); - // Simulate ic-boundary and ic-gateway already running by faking a start. - ic_boundary_manager - .process_runner - .start( - IcBoundaryProcess::new( - ic_boundary_process_config, - current_replica_version.clone(), - DOMAIN_NAME.to_string(), - ) - .unwrap(), - ) - .unwrap(); + // Simulate ic-gateway already running by faking a start. 
ic_gateway_manager .process_runner .start( @@ -1635,7 +1566,11 @@ mod tests { .unwrap(); } } - let processes_manager = Arc::new(RwLock::new(processes_manager)); + let processes_manager = Arc::new(RwLock::new(MultipleProcessesManager::new_for_test( + replica_manager, + ic_gateway_manager, + Arc::clone(®istry), + ))); let manageboot_runner = Box::new(FakeManagebootRunner); @@ -1841,18 +1776,6 @@ mod tests { fn setup_registry(&self) -> Arc { let data_provider = Arc::new(ProtoRegistryDataProvider::new()); - let mut node_record = NodeRecord::default(); - if matches!(self.subnet_type, SubnetType::CloudEngine) { - // Nodes in Cloud engines must have a domain name to start ic-boundary - node_record.domain = Some(DOMAIN_NAME.to_string()); - } - add_node_record_to_provider( - &data_provider, - RegistryVersion::from(1), - self.node_id, - node_record, - ); - // NNS subnet let nns_subnet_id = SUBNET_42; add_root_subnet_id_to_provider(&data_provider, RegistryVersion::from(1), nns_subnet_id); @@ -2493,7 +2416,6 @@ mod tests { let assert_has_started_new_processes = || { if matches!(self.subnet_type, SubnetType::CloudEngine) { assert_has_started(ReplicaProcess::NAME); - assert_has_started(IcBoundaryProcess::NAME); assert_has_started(IcGatewayProcess::NAME); } else { assert_has_started(ReplicaProcess::NAME); @@ -2501,7 +2423,6 @@ mod tests { }; let assert_has_not_started_new_processes = || { assert_has_not_started(ReplicaProcess::NAME); - assert_has_not_started(IcBoundaryProcess::NAME); assert_has_not_started(IcGatewayProcess::NAME); }; match &self.has_local_cup { @@ -2520,14 +2441,12 @@ mod tests { if registry_cup.height >= local_cup.height => { assert_has_started(ReplicaProcess::NAME); - assert_has_started(IcBoundaryProcess::NAME); assert_has_started(IcGatewayProcess::NAME); } (Some((registry_cup, _)), _, _) if registry_cup.height >= local_cup.height => { assert_has_started(ReplicaProcess::NAME); - assert_has_not_started(IcBoundaryProcess::NAME); 
assert_has_not_started(IcGatewayProcess::NAME); } (_, true, _) => { @@ -2773,10 +2692,10 @@ mod tests { | Ok(OrchestratorControlFlow::Leaving(_)) ) || test_scenario.were_child_processes_started_previously()) ); - // - The ic-boundary process is running <=> same as the replica process, but only for Cloud + // - The ic-gateway process is running <=> same as the replica process, but only for Cloud // Engine subnets assert_eq!( - upgrade_loop.is_ic_boundary_running(), + upgrade_loop.is_ic_gateway_running(), matches!(test_scenario.subnet_type, SubnetType::CloudEngine) && matches!(new_subnet_assignment, SubnetAssignment::Assigned(_)) && (matches!( @@ -2785,11 +2704,6 @@ mod tests { | Ok(OrchestratorControlFlow::Leaving(_)) ) || test_scenario.were_child_processes_started_previously()) ); - // - The ic-gateway process is running <=> the ic-boundary process is running - assert_eq!( - upgrade_loop.is_ic_gateway_running(), - upgrade_loop.is_ic_boundary_running(), - ); // - As an assigned node: if new_subnet_assignment != SubnetAssignment::Unassigned { // - If the replicator has not yet replicated all versions before init, then we should never From 01b936210125582ca60b9b54ab0462c859f3b16f Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Wed, 17 Jun 2026 09:52:56 +0000 Subject: [PATCH 07/42] refactor: minor refactor --- rs/orchestrator/src/process_manager.rs | 2 +- rs/orchestrator/src/upgrade.rs | 37 +++++++++++--------------- 2 files changed, 17 insertions(+), 22 deletions(-) diff --git a/rs/orchestrator/src/process_manager.rs b/rs/orchestrator/src/process_manager.rs index d02137152e4e..0725dba1bf87 100644 --- a/rs/orchestrator/src/process_manager.rs +++ b/rs/orchestrator/src/process_manager.rs @@ -61,7 +61,7 @@ pub(crate) trait ProcessRunner: Send { fn get_pid(&self) -> Option; } -/// Runs a single versioned [`Process`], implementing [`ProcessRunner

`]. +/// Runs a single versioned [`Process`] by implementing [`ProcessRunner

`]. pub(crate) struct SingleProcessRunner { process: Option

, pid_cell: PIDCell, diff --git a/rs/orchestrator/src/upgrade.rs b/rs/orchestrator/src/upgrade.rs index c8f417625347..699a07fccaae 100644 --- a/rs/orchestrator/src/upgrade.rs +++ b/rs/orchestrator/src/upgrade.rs @@ -1542,7 +1542,7 @@ mod tests { logger.clone(), ); ic_gateway_manager.process_runner = Box::new(FakeRunner::new()); - // Start the replica process if the test scenario indicates so + // Start the child processes if the test scenario indicates so if test_scenario.were_child_processes_started_previously() { replica_manager .process_runner @@ -1553,7 +1553,6 @@ mod tests { )) .unwrap(); if matches!(subnet_type, SubnetType::CloudEngine) { - // Simulate ic-gateway already running by faking a start. ic_gateway_manager .process_runner .start( @@ -2396,9 +2395,9 @@ mod tests { assert_has_removed_state(); } - // Returns whether the child processes should be running after the upgrade loop. + // Returns whether the replica process should be running after the upgrade loop. // Additionally asserts whether the orchestrator has started *new* child processes - fn should_child_processes_be_running(&self, logs: Vec) -> bool { + fn should_replica_process_be_running(&self, logs: Vec) -> bool { let logs_assert = LogEntriesAssert::assert_that(logs); let assert_has_started = |process_name: &str| { logs_assert.has_only_one_message_containing( @@ -2414,12 +2413,10 @@ mod tests { ); }; let assert_has_started_new_processes = || { + assert_has_started(ReplicaProcess::NAME); if matches!(self.subnet_type, SubnetType::CloudEngine) { - assert_has_started(ReplicaProcess::NAME); assert_has_started(IcGatewayProcess::NAME); - } else { - assert_has_started(ReplicaProcess::NAME); - }; + } }; let assert_has_not_started_new_processes = || { assert_has_not_started(ReplicaProcess::NAME); @@ -2647,8 +2644,18 @@ mod tests { // Check whether the replica process is running or not assert_eq!( upgrade_loop.is_replica_running(), - test_scenario.should_child_processes_be_running(logs), + 
test_scenario.should_replica_process_be_running(logs), ); + if matches!(test_scenario.subnet_type, SubnetType::CloudEngine) { + // For Cloud Engine subnets, ic-gateway should always be running when the replica is + assert_eq!( + upgrade_loop.is_ic_gateway_running(), + upgrade_loop.is_replica_running() + ); + } else { + // For other subnets, ic-gateway should never be running + assert!(!upgrade_loop.is_ic_gateway_running()); + } // Asserting further invariants: // - Consistent flow/subnet assignment: @@ -2692,18 +2699,6 @@ mod tests { | Ok(OrchestratorControlFlow::Leaving(_)) ) || test_scenario.were_child_processes_started_previously()) ); - // - The ic-gateway process is running <=> same as the replica process, but only for Cloud - // Engine subnets - assert_eq!( - upgrade_loop.is_ic_gateway_running(), - matches!(test_scenario.subnet_type, SubnetType::CloudEngine) - && matches!(new_subnet_assignment, SubnetAssignment::Assigned(_)) - && (matches!( - flow_result, - Ok(OrchestratorControlFlow::Assigned(_)) - | Ok(OrchestratorControlFlow::Leaving(_)) - ) || test_scenario.were_child_processes_started_previously()) - ); // - As an assigned node: if new_subnet_assignment != SubnetAssignment::Unassigned { // - If the replicator has not yet replicated all versions before init, then we should never From 2af29356bbad491dfcf10de4e5f23f783a3f80b7 Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Wed, 17 Jun 2026 09:56:33 +0000 Subject: [PATCH 08/42] docs: remove gov-team change --- rs/registry/admin/bin/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rs/registry/admin/bin/main.rs b/rs/registry/admin/bin/main.rs index 6f3e393d85b8..ca93e30ef416 100644 --- a/rs/registry/admin/bin/main.rs +++ b/rs/registry/admin/bin/main.rs @@ -3351,7 +3351,7 @@ impl ProposalPayload for ProposeToUpdateFirewallRule /// Sub-command to get all firewall rules for a given scope. 
#[derive(Parser)] struct GetFirewallRulesCmd { - /// The scope to apply new rules at (can be "global", "replica_nodes", "api_boundary_nodes", "subnet(id)", or "node(id)") + /// The scope to apply new rules at (can be "global", "replica_nodes", "subnet(id)", or "node(id)") pub scope: FirewallRulesScope, } From d6ee4b5dde064dbd9ae8110b99471d46975a57a7 Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Wed, 17 Jun 2026 12:23:34 +0000 Subject: [PATCH 09/42] refactor: deduplicate/generalize code --- rs/orchestrator/src/boundary_node.rs | 15 +- rs/orchestrator/src/process_manager.rs | 44 +-- rs/orchestrator/src/processes.rs | 412 ++++++++++++------------- rs/orchestrator/src/upgrade.rs | 70 ++--- 4 files changed, 254 insertions(+), 287 deletions(-) diff --git a/rs/orchestrator/src/boundary_node.rs b/rs/orchestrator/src/boundary_node.rs index 9c3c14dfb80d..473344ab23be 100644 --- a/rs/orchestrator/src/boundary_node.rs +++ b/rs/orchestrator/src/boundary_node.rs @@ -1,5 +1,5 @@ use crate::{ - error::OrchestratorError, processes::IcBoundaryManager, registry_helper::RegistryHelper, + error::OrchestratorError, process_manager::Process, processes::{IcBoundaryManager, IcBoundaryProcess}, registry_helper::RegistryHelper }; use ic_logger::{ReplicaLogger, warn}; use ic_types::{NodeId, ReplicaVersion}; @@ -49,21 +49,26 @@ impl BoundaryNodeManager { } else { self.process_manager .ensure_ic_boundary_running_and_restarted_on_domain_change( - &self.version, + self.version.clone(), registry_version, ); } } // BN should not be active Err(OrchestratorError::ApiBoundaryNodeMissingError(_, _)) => { - if let Err(err) = self.process_manager.stop_ic_boundary() { - warn!(self.logger, "Failed to stop Boundary Node: {}", err); + if let Err(err) = self.process_manager.stop() { + warn!( + self.logger, + "Failed to stop {}: {}", + IcBoundaryProcess::NAME, + err + ); } } // Failing to read the registry Err(err) => warn!( self.logger, - "Failed to fetch Boundary Node version: {}", err + "Failed to fetch 
API Boundary Node version: {}", err ), } } diff --git a/rs/orchestrator/src/process_manager.rs b/rs/orchestrator/src/process_manager.rs index 0725dba1bf87..710c560d7b30 100644 --- a/rs/orchestrator/src/process_manager.rs +++ b/rs/orchestrator/src/process_manager.rs @@ -12,10 +12,7 @@ use std::{ sync::{Arc, Mutex}, }; -use crate::{ - error::{OrchestratorError, OrchestratorResult}, - metrics::OrchestratorMetrics, -}; +use crate::error::OrchestratorResult; type PIDCell = Arc>>; @@ -32,6 +29,18 @@ pub(crate) trait Process { /// We only impose that we can check that versions are equal and have /// a debug representation type Version: Eq + Debug; + /// Static configuration of the process, such as the path to the binary + /// and static arguments. + type Config; + /// Dynamic arguments of the process, such as the subnet ID for the replica + /// (which could change across the orchestrator's lifetime). + type Args; + + /// Build a new instance of the process with the given configuration and + /// arguments. + fn build(config: &Self::Config, args: Self::Args) -> OrchestratorResult + where + Self: Sized; /// Return the version of the [`Process`] fn get_version(&self) -> &Self::Version; @@ -216,30 +225,3 @@ fn wait_on_exit( let _pid = pid_cell.lock().unwrap().take(); } } - -// start_orchestrator_process — common "check-running / log / metric / start" -// logic -// -// Any new process type automatically benefits from this without duplicating -// the boilerplate. -pub(crate) fn start_orchestrator_process( - runner: &mut dyn ProcessRunner

, - process: P, - metrics: &OrchestratorMetrics, - logger: &ReplicaLogger, -) -> OrchestratorResult<()> { - if runner.is_running() { - return Ok(()); - } - info!(logger, "Starting new {} process", P::NAME); - metrics - .processes_start_attempts - .with_label_values(&[P::NAME]) - .inc(); - runner.start(process).map_err(|e| { - OrchestratorError::IoError( - format!("Error when attempting to start {} process", P::NAME), - e, - ) - }) -} diff --git a/rs/orchestrator/src/processes.rs b/rs/orchestrator/src/processes.rs index 9ab557bbfaf0..f4d5b99ec2a3 100644 --- a/rs/orchestrator/src/processes.rs +++ b/rs/orchestrator/src/processes.rs @@ -1,20 +1,15 @@ use crate::{ error::{OrchestratorError, OrchestratorResult}, metrics::OrchestratorMetrics, - process_manager::{Process, ProcessRunner, SingleProcessRunner, start_orchestrator_process}, + process_manager::{Process, ProcessRunner, SingleProcessRunner}, registry_helper::RegistryHelper, }; use ic_config::crypto::CryptoConfig; -use ic_logger::{ReplicaLogger, warn}; +use ic_logger::{ReplicaLogger, info, warn}; use ic_protobuf::registry::subnet::v1::SubnetType; use ic_types::{RegistryVersion, ReplicaVersion, SubnetId}; use nix::unistd::Pid; -use std::{ - collections::HashMap, - ffi::OsString, - path::PathBuf, - sync::Arc, -}; +use std::{collections::HashMap, ffi::OsString, path::PathBuf, sync::Arc}; // --------------------------------------------------------------------------- // ReplicaProcess @@ -35,25 +30,24 @@ pub(crate) struct ReplicaProcess { subnet_id: SubnetId, } -impl ReplicaProcess { - pub(crate) fn new( - config: ReplicaProcessConfig, - replica_version: ReplicaVersion, - subnet_id: SubnetId, - ) -> Self { - Self { - ic_binary_dir: config.ic_binary_dir, - replica_version, - cup_path: config.cup_path, - replica_config_file: config.replica_config_file, - subnet_id, - } - } -} - impl Process for ReplicaProcess { const NAME: &'static str = "replica"; type Version = ReplicaVersion; + type Config = ReplicaProcessConfig; + 
type Args = (ReplicaVersion, SubnetId); + + fn build( + config: &Self::Config, + (replica_version, subnet_id): Self::Args, + ) -> OrchestratorResult { + Ok(Self { + ic_binary_dir: config.ic_binary_dir.clone(), + replica_version, + cup_path: config.cup_path.clone(), + replica_config_file: config.replica_config_file.clone(), + subnet_id, + }) + } fn get_version(&self) -> &Self::Version { &self.replica_version @@ -78,52 +72,6 @@ impl Process for ReplicaProcess { } } -pub(crate) struct ReplicaManager { - pub process_runner: Box + Sync>, - process_config: ReplicaProcessConfig, - metrics: Arc, - logger: ReplicaLogger, -} - -impl ReplicaManager { - pub(crate) fn new( - process_config: ReplicaProcessConfig, - metrics: Arc, - logger: ReplicaLogger, - ) -> Self { - let process_runner = Box::new(SingleProcessRunner::new(logger.clone())); - Self { - process_runner, - process_config, - metrics, - logger, - } - } - - fn ensure_replica_running( - &mut self, - replica_version: &ReplicaVersion, - subnet_id: SubnetId, - ) -> OrchestratorResult<()> { - start_orchestrator_process( - &mut *self.process_runner, - ReplicaProcess::new( - self.process_config.clone(), - replica_version.clone(), - subnet_id, - ), - &self.metrics, - &self.logger, - ) - } - - fn stop_replica(&mut self) -> OrchestratorResult<()> { - self.process_runner.stop().map_err(|e| { - OrchestratorError::IoError("Error when attempting to stop replica".to_string(), e) - }) - } -} - // --------------------------------------------------------------------------- // IcBoundaryProcess // --------------------------------------------------------------------------- @@ -143,13 +91,17 @@ pub(crate) struct IcBoundaryProcess { env: HashMap, } -impl IcBoundaryProcess { - pub(crate) fn new( - process_config: IcBoundaryProcessConfig, - replica_version: ReplicaVersion, - domain_name: String, +impl Process for IcBoundaryProcess { + const NAME: &'static str = "ic-boundary"; + type Version = ReplicaVersion; + type Config = 
IcBoundaryProcessConfig; + type Args = (ReplicaVersion, String); + + fn build( + config: &Self::Config, + (replica_version, domain_name): Self::Args, ) -> OrchestratorResult { - let env = match env_file_reader::read_file(&process_config.ic_boundary_env_file) { + let env = match env_file_reader::read_file(&config.ic_boundary_env_file) { Ok(env) => env .into_iter() .map(|(k, v)| (OsString::from(k), OsString::from(v))) @@ -161,22 +113,17 @@ impl IcBoundaryProcess { )); } }; - let crypto_config = serde_json::to_string(&process_config.crypto_config) + let crypto_config = serde_json::to_string(&config.crypto_config) .map_err(OrchestratorError::SerializeCryptoConfigError)?; Ok(Self { - ic_binary_dir: process_config.ic_binary_dir, + ic_binary_dir: config.ic_binary_dir.clone(), replica_version, domain_name, crypto_config, env, }) } -} - -impl Process for IcBoundaryProcess { - const NAME: &'static str = "ic-boundary"; - type Version = ReplicaVersion; fn get_version(&self) -> &Self::Version { &self.replica_version @@ -197,94 +144,6 @@ impl Process for IcBoundaryProcess { } } -pub(crate) struct IcBoundaryManager { - pub process_runner: Box + Sync>, - process_config: IcBoundaryProcessConfig, - registry: Arc, - pub current_domain_name: Option, - metrics: Arc, - logger: ReplicaLogger, -} - -impl IcBoundaryManager { - pub(crate) fn new( - process_config: IcBoundaryProcessConfig, - registry: Arc, - metrics: Arc, - logger: ReplicaLogger, - ) -> Self { - let process_runner = Box::new(SingleProcessRunner::new(logger.clone())); - Self { - process_runner, - process_config, - registry, - current_domain_name: None, - metrics, - logger, - } - } - - pub(crate) fn ensure_ic_boundary_running_and_restarted_on_domain_change( - &mut self, - replica_version: &ReplicaVersion, - registry_version: RegistryVersion, - ) { - match self.registry.get_node_domain_name(registry_version) { - Ok(Some(domain_name)) => { - // stop ic-boundary when the domain name changes and start it again. 
- if Some(&domain_name) != self.current_domain_name.as_ref() - && let Err(err) = self.stop_ic_boundary() - { - warn!(self.logger, "Failed to stop ic-boundary: {}", err); - } - - // make sure the ic-boundary is running - if let Err(err) = self.ensure_ic_boundary_running(replica_version, &domain_name) { - warn!(self.logger, "Failed to start ic-boundary: {}", err); - } - - self.current_domain_name = Some(domain_name); - } - // ic-boundary should not start when the node doesn't have a domain name - Ok(None) => { - warn!( - self.logger, - "There is no domain associated with the node, while this is a requirement for the API boundary node. Shutting ic-boundary down." - ); - if let Err(err) = self.stop_ic_boundary() { - warn!(self.logger, "Failed to stop Boundary Node: {}", err); - } - self.current_domain_name = None; - } - // Failing to read the registry - Err(err) => warn!(self.logger, "Failed to fetch domain name: {}", err), - } - } - - fn ensure_ic_boundary_running( - &mut self, - replica_version: &ReplicaVersion, - domain_name: &str, - ) -> OrchestratorResult<()> { - start_orchestrator_process( - &mut *self.process_runner, - IcBoundaryProcess::new( - self.process_config.clone(), - replica_version.clone(), - domain_name.to_string(), - )?, - &self.metrics, - &self.logger, - ) - } - - pub(crate) fn stop_ic_boundary(&mut self) -> OrchestratorResult<()> { - self.process_runner.stop().map_err(|e| { - OrchestratorError::IoError("Error when attempting to stop ic-boundary".to_string(), e) - }) - } -} - // --------------------------------------------------------------------------- // IcGatewayProcess // --------------------------------------------------------------------------- @@ -301,12 +160,14 @@ pub(crate) struct IcGatewayProcess { env: HashMap, } -impl IcGatewayProcess { - pub(crate) fn new( - process_config: IcGatewayProcessConfig, - replica_version: ReplicaVersion, - ) -> OrchestratorResult { - let env = match 
env_file_reader::read_file(&process_config.ic_gateway_env_file) { +impl Process for IcGatewayProcess { + const NAME: &'static str = "ic-gateway"; + type Version = ReplicaVersion; + type Config = IcGatewayProcessConfig; + type Args = ReplicaVersion; + + fn build(config: &Self::Config, replica_version: Self::Args) -> OrchestratorResult { + let env = match env_file_reader::read_file(&config.ic_gateway_env_file) { Ok(env) => env .into_iter() .map(|(k, v)| (OsString::from(k), OsString::from(v))) @@ -320,16 +181,11 @@ impl IcGatewayProcess { }; Ok(Self { - ic_binary_dir: process_config.ic_binary_dir, + ic_binary_dir: config.ic_binary_dir.clone(), replica_version, env, }) } -} - -impl Process for IcGatewayProcess { - const NAME: &'static str = "ic-gateway"; - type Version = ReplicaVersion; fn get_version(&self) -> &Self::Version { &self.replica_version @@ -345,66 +201,191 @@ impl Process for IcGatewayProcess { } } -pub(crate) struct IcGatewayManager { - pub process_runner: Box + Sync>, - process_config: IcGatewayProcessConfig, +// --------------------------------------------------------------------------- +// ProcessManager

+// +// This struct offers common boilerplate functionality logic to ensure a process +// is running (logging, metrics) and to stop it, converting errors to [`OrchestratorError`] +// in both cases. +// --------------------------------------------------------------------------- + +pub(crate) struct ProcessManager { + process_runner: Box + Sync>, + process_config: P::Config, metrics: Arc, logger: ReplicaLogger, } -impl IcGatewayManager { +impl ProcessManager

{ + /// Used in tests to inject a mock ProcessRunner. + #[cfg(test)] + pub(crate) fn new_for_test( + process_runner: Box + Sync>, + process_config: P::Config, + metrics: Arc, + logger: ReplicaLogger, + ) -> Self { + Self { + process_runner, + process_config, + metrics, + logger, + } + } + pub(crate) fn new( - process_config: IcGatewayProcessConfig, + process_config: P::Config, metrics: Arc, logger: ReplicaLogger, ) -> Self { let process_runner = Box::new(SingleProcessRunner::new(logger.clone())); Self { - process_runner, process_config, + process_runner, metrics, logger, } } - fn ensure_ic_gateway_running( - &mut self, - replica_version: &ReplicaVersion, - ) -> OrchestratorResult<()> { - start_orchestrator_process( - &mut *self.process_runner, - IcGatewayProcess::new(self.process_config.clone(), replica_version.clone())?, - &self.metrics, - &self.logger, - ) + pub(crate) fn ensure_running(&mut self, args: P::Args) -> OrchestratorResult<()> { + if self.process_runner.is_running() { + return Ok(()); + } + let process = P::build(&self.process_config, args)?; + info!(self.logger, "Starting new {} process", P::NAME); + self.metrics + .processes_start_attempts + .with_label_values(&[P::NAME]) + .inc(); + self.process_runner.start(process).map_err(|e| { + OrchestratorError::IoError( + format!("Error when attempting to start {} process", P::NAME), + e, + ) + }) } - fn stop_ic_gateway(&mut self) -> OrchestratorResult<()> { + pub(crate) fn stop(&mut self) -> OrchestratorResult<()> { self.process_runner.stop().map_err(|e| { - OrchestratorError::IoError("Error when attempting to stop ic-gateway".to_string(), e) + OrchestratorError::IoError( + format!("Error when attempting to stop the {} process", P::NAME), + e, + ) }) } } +// --------------------------------------------------------------------------- +// IcBoundaryManager +// +// Wrapper around ProcessManager which contains additional +// logic to stop and restart the process when the node's domain name changes +// in the 
registry. +// --------------------------------------------------------------------------- + +pub(crate) struct IcBoundaryManager { + inner: ProcessManager, + registry: Arc, + current_domain_name: Option, + logger: ReplicaLogger, +} + +impl IcBoundaryManager { + pub(crate) fn new( + config: ::Config, + registry: Arc, + metrics: Arc, + logger: ReplicaLogger, + ) -> Self { + let inner = ProcessManager::new(config, metrics, logger.clone()); + Self { + inner, + registry, + current_domain_name: None, + logger, + } + } + + pub(crate) fn ensure_ic_boundary_running_and_restarted_on_domain_change( + &mut self, + replica_version: ReplicaVersion, + registry_version: RegistryVersion, + ) { + match self.registry.get_node_domain_name(registry_version) { + Ok(Some(domain_name)) => { + // stop ic-boundary when the domain name changes and start it again. + if Some(&domain_name) != self.current_domain_name.as_ref() + && let Err(err) = self.inner.stop() + { + warn!( + self.logger, + "Failed to stop {}: {}", + IcBoundaryProcess::NAME, + err + ); + } + + // make sure ic-boundary is running + if let Err(err) = self + .inner + .ensure_running((replica_version, domain_name.clone())) + { + warn!( + self.logger, + "Failed to start {}: {}", + IcBoundaryProcess::NAME, + err + ); + } + + self.current_domain_name = Some(domain_name); + } + // ic-boundary should not start when the node doesn't have a domain name + Ok(None) => { + warn!( + self.logger, + "There is no domain associated with the node, while this is a requirement for the API boundary node. 
Shutting {} down.", + IcBoundaryProcess::NAME + ); + if let Err(err) = self.inner.stop() { + warn!( + self.logger, + "Failed to stop {}: {}", + IcBoundaryProcess::NAME, + err + ); + } + self.current_domain_name = None; + } + // Failing to read the registry + Err(err) => warn!(self.logger, "Failed to fetch domain name: {}", err), + } + } + + pub(crate) fn stop(&mut self) -> OrchestratorResult<()> { + self.inner.stop() + } +} + // --------------------------------------------------------------------------- // MultipleProcessManager // -// This struct manages all processes that the orchestrator is responsible for, +// This struct manages all processes that the upgrade loop is responsible for, // providing a single entry point for starting and stopping them according to // the node's configuration in the registry. // --------------------------------------------------------------------------- pub(crate) struct MultipleProcessesManager { - replica_manager: ReplicaManager, - ic_gateway_manager: IcGatewayManager, + replica_manager: ProcessManager, + ic_gateway_manager: ProcessManager, registry: Arc, } impl MultipleProcessesManager { #[cfg(test)] pub(crate) fn new_for_test( - replica_manager: ReplicaManager, - ic_gateway_manager: IcGatewayManager, + replica_manager: ProcessManager, + ic_gateway_manager: ProcessManager, registry: Arc, ) -> Self { Self { @@ -422,8 +403,8 @@ impl MultipleProcessesManager { logger: ReplicaLogger, ) -> Self { let replica_manager = - ReplicaManager::new(replica_process_config, metrics.clone(), logger.clone()); - let ic_gateway_manager = IcGatewayManager::new(ic_gateway_process_config, metrics, logger); + ProcessManager::new(replica_process_config, metrics.clone(), logger.clone()); + let ic_gateway_manager = ProcessManager::new(ic_gateway_process_config, metrics, logger); Self { replica_manager, @@ -458,12 +439,12 @@ impl MultipleProcessesManager { /// starts ic-gateway. 
pub(crate) fn start_all( &mut self, - replica_version: &ReplicaVersion, + replica_version: ReplicaVersion, subnet_id: SubnetId, registry_version: RegistryVersion, ) -> OrchestratorResult<()> { self.replica_manager - .ensure_replica_running(replica_version, subnet_id)?; + .ensure_running((replica_version.clone(), subnet_id))?; // Cloud-engine nodes run ic-gateway as a sidecar. match self.registry.get_subnet_type(subnet_id, registry_version)? { @@ -472,11 +453,10 @@ impl MultipleProcessesManager { | Some(SubnetType::Application) | Some(SubnetType::System) | Some(SubnetType::VerifiedApplication) => { - self.ic_gateway_manager.stop_ic_gateway()?; + self.ic_gateway_manager.stop()?; } Some(SubnetType::CloudEngine) => { - self.ic_gateway_manager - .ensure_ic_gateway_running(replica_version)?; + self.ic_gateway_manager.ensure_running(replica_version)?; } } @@ -485,13 +465,13 @@ impl MultipleProcessesManager { /// Stop the replica process. pub(crate) fn stop_replica(&mut self) -> OrchestratorResult<()> { - self.replica_manager.stop_replica() + self.replica_manager.stop() } /// Stop every managed process. pub(crate) fn stop_all(&mut self) -> OrchestratorResult<()> { - self.replica_manager.stop_replica()?; - self.ic_gateway_manager.stop_ic_gateway()?; + self.replica_manager.stop()?; + self.ic_gateway_manager.stop()?; Ok(()) } diff --git a/rs/orchestrator/src/upgrade.rs b/rs/orchestrator/src/upgrade.rs index 699a07fccaae..65c79cf4deb4 100644 --- a/rs/orchestrator/src/upgrade.rs +++ b/rs/orchestrator/src/upgrade.rs @@ -372,7 +372,7 @@ impl Upgrade { // This will start new child processes if any of them is not running self.ensure_children_are_running( - &self.replica_version, + self.replica_version.clone(), subnet_id, latest_registry_version, )?; @@ -609,7 +609,7 @@ impl Upgrade { /// Start all child processes appropriate for this node. 
fn ensure_children_are_running( &self, - replica_version: &ReplicaVersion, + replica_version: ReplicaVersion, subnet_id: SubnetId, registry_version: RegistryVersion, ) -> OrchestratorResult<()> { @@ -1099,7 +1099,7 @@ mod tests { use crate::catch_up_package_provider::tests::mock_tls_config; use crate::process_manager::{Process, ProcessRunner}; use crate::processes::{ - IcGatewayManager, IcGatewayProcess, IcGatewayProcessConfig, ReplicaManager, ReplicaProcess, + IcGatewayProcess, IcGatewayProcessConfig, ProcessManager, ReplicaProcess, ReplicaProcessConfig, }; @@ -1495,7 +1495,7 @@ mod tests { let metrics = Arc::new(OrchestratorMetrics::new(&MetricsRegistry::new())); let cup_dir = dir.join("cups"); - let cup_file = cup_dir.join("cup.types.v1.CatchUpPackage.pb"); + let cup_path = cup_dir.join("cup.types.v1.CatchUpPackage.pb"); std::fs::create_dir_all(&cup_dir).unwrap(); if let Some(local_cup) = has_local_cup { let cup = make_local_cup( @@ -1504,7 +1504,7 @@ mod tests { local_cup.registry_version, ); let cup_proto = pb::CatchUpPackage::from(cup); - std::fs::write(&cup_file, cup_proto.encode_to_vec()).unwrap(); + std::fs::write(&cup_path, cup_proto.encode_to_vec()).unwrap(); } let cup_provider = CatchUpPackageProvider::new( registry.clone(), @@ -1521,43 +1521,33 @@ mod tests { let ic_gateway_env_file = dir.join("ic-gateway.env"); std::fs::write(&ic_gateway_env_file, b"TEST_KEY=TEST_VALUE").unwrap(); + let mut replica_runner = Box::new(FakeRunner::new()); let replica_process_config = ReplicaProcessConfig { ic_binary_dir: ic_binary_dir.clone(), - cup_path: cup_file, + cup_path, replica_config_file: replica_config_file.clone(), }; - let mut replica_manager = ReplicaManager::new( - replica_process_config.clone(), - Arc::clone(&metrics), - logger.clone(), - ); - replica_manager.process_runner = Box::new(FakeRunner::new()); + let mut ic_gateway_runner = Box::new(FakeRunner::new()); let ic_gateway_process_config = IcGatewayProcessConfig { - ic_binary_dir: 
ic_binary_dir.clone(), + ic_binary_dir, ic_gateway_env_file, }; - let mut ic_gateway_manager = IcGatewayManager::new( - ic_gateway_process_config.clone(), - Arc::clone(&metrics), - logger.clone(), - ); - ic_gateway_manager.process_runner = Box::new(FakeRunner::new()); // Start the child processes if the test scenario indicates so if test_scenario.were_child_processes_started_previously() { - replica_manager - .process_runner - .start(ReplicaProcess::new( - replica_process_config, - current_replica_version.clone(), - SUBNET_1, - )) + replica_runner + .start( + ReplicaProcess::build( + &replica_process_config, + (current_replica_version.clone(), SUBNET_1), + ) + .unwrap(), + ) .unwrap(); if matches!(subnet_type, SubnetType::CloudEngine) { - ic_gateway_manager - .process_runner + ic_gateway_runner .start( - IcGatewayProcess::new( - ic_gateway_process_config, + IcGatewayProcess::build( + &ic_gateway_process_config, current_replica_version.clone(), ) .unwrap(), @@ -1566,8 +1556,18 @@ mod tests { } } let processes_manager = Arc::new(RwLock::new(MultipleProcessesManager::new_for_test( - replica_manager, - ic_gateway_manager, + ProcessManager::new_for_test( + replica_runner, + replica_process_config, + Arc::clone(&metrics), + logger.clone(), + ), + ProcessManager::new_for_test( + ic_gateway_runner, + ic_gateway_process_config, + Arc::clone(&metrics), + logger.clone(), + ), Arc::clone(&registry), ))); @@ -1606,7 +1606,7 @@ mod tests { manageboot_runner, cup_provider, subnet_assignment, - current_replica_version.clone(), + current_replica_version, replica_config_file, node_id, Arc::new(registry_replicator), @@ -2625,8 +2625,8 @@ mod tests { // Check presence/absence of local CUP, including its height, which // tests the recovery case where the recovery CUP would overwrite the // local CUP - let cup_file = tmp_path.join("cups").join("cup.types.v1.CatchUpPackage.pb"); - let local_cup_height = std::fs::read(cup_file) + let cup_path = 
tmp_path.join("cups").join("cup.types.v1.CatchUpPackage.pb"); + let local_cup_height = std::fs::read(cup_path) .map(|bytes| { CatchUpPackage::try_from(&pb::CatchUpPackage::decode(&bytes[..]).unwrap()) .unwrap() From e6309263ceaa892de754373315189ef9c055f823 Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Wed, 17 Jun 2026 12:40:36 +0000 Subject: [PATCH 10/42] feat: do not attempt to stop when not running. Log + Metrics when stopping --- rs/orchestrator/src/metrics.rs | 8 +++++++- rs/orchestrator/src/processes.rs | 14 ++++++++++++-- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/rs/orchestrator/src/metrics.rs b/rs/orchestrator/src/metrics.rs index b343437fdaac..66dfccaab41d 100644 --- a/rs/orchestrator/src/metrics.rs +++ b/rs/orchestrator/src/metrics.rs @@ -20,6 +20,7 @@ pub(crate) struct OrchestratorMetrics { pub(crate) fstrim_duration: IntGauge, pub(crate) critical_error_task_failed: IntCounterVec, pub(crate) processes_start_attempts: IntCounterVec, + pub(crate) processes_stop_attempts: IntCounterVec, } #[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Debug, EnumIter, AsRefStr)] @@ -109,7 +110,12 @@ impl OrchestratorMetrics { ), processes_start_attempts: metrics_registry.int_counter_vec( "orchestrator_processes_start_attempts_total", - "Number of times the replica process was attempted to be started", + "Number of times a process was attempted to be started", + &["process_name"], + ), + processes_stop_attempts: metrics_registry.int_counter_vec( + "orchestrator_processes_stop_attempts_total", + "Number of times a process was attempted to be stopped", &["process_name"], ), } diff --git a/rs/orchestrator/src/processes.rs b/rs/orchestrator/src/processes.rs index f4d5b99ec2a3..50dcf208b7a4 100644 --- a/rs/orchestrator/src/processes.rs +++ b/rs/orchestrator/src/processes.rs @@ -205,8 +205,8 @@ impl Process for IcGatewayProcess { // ProcessManager

// // This struct offers common boilerplate functionality logic to ensure a process -// is running (logging, metrics) and to stop it, converting errors to [`OrchestratorError`] -// in both cases. +// is running and to stop it, converting errors to [`OrchestratorError`], logging +// them, and updating metrics. // --------------------------------------------------------------------------- pub(crate) struct ProcessManager { @@ -251,6 +251,7 @@ impl ProcessManager

{ if self.process_runner.is_running() { return Ok(()); } + let process = P::build(&self.process_config, args)?; info!(self.logger, "Starting new {} process", P::NAME); self.metrics @@ -266,6 +267,15 @@ impl ProcessManager

{ } pub(crate) fn stop(&mut self) -> OrchestratorResult<()> { + if !self.process_runner.is_running() { + return Ok(()); + } + + info!(self.logger, "Stopping {} process", P::NAME); + self.metrics + .processes_stop_attempts + .with_label_values(&[P::NAME]) + .inc(); self.process_runner.stop().map_err(|e| { OrchestratorError::IoError( format!("Error when attempting to stop the {} process", P::NAME), From e50142eee2fee8d76797dc0dc78037e1ef6eebf0 Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Wed, 17 Jun 2026 13:05:48 +0000 Subject: [PATCH 11/42] feat: propagate orchestrator's changed metric's name --- rs/tests/consensus/cup_explorer_test.rs | 2 +- .../consensus/orchestrator/node_reassignment_test.rs | 2 +- rs/tests/consensus/replica_determinism_test.rs | 2 +- .../sr_app_failover_nodes_enable_chain_keys_test.rs | 2 +- .../subnet_recovery/sr_app_failover_nodes_test.rs | 2 +- .../sr_app_failover_nodes_with_chain_keys_test.rs | 4 +++- .../sr_app_large_no_upgrade_with_chain_keys_test.rs | 2 +- .../sr_app_no_upgrade_enable_chain_keys_test.rs | 2 +- .../subnet_recovery/sr_app_no_upgrade_local_test.rs | 2 +- .../sr_app_no_upgrade_provision_write_access_test.rs | 2 +- .../subnet_recovery/sr_app_no_upgrade_test.rs | 2 +- .../sr_app_no_upgrade_with_chain_keys_test.rs | 2 +- .../sr_app_same_nodes_enable_chain_keys_test.rs | 2 +- .../subnet_recovery/sr_app_same_nodes_test.rs | 2 +- .../sr_app_same_nodes_with_chain_keys_test.rs | 2 +- .../subnet_recovery/sr_nns_failover_nodes_test.rs | 2 +- rs/tests/consensus/subnet_splitting_test.rs | 2 +- rs/tests/driver/src/driver/group.rs | 10 ++++++++-- 18 files changed, 27 insertions(+), 19 deletions(-) diff --git a/rs/tests/consensus/cup_explorer_test.rs b/rs/tests/consensus/cup_explorer_test.rs index c2b7eb32d61c..f0ac0f5633dc 100644 --- a/rs/tests/consensus/cup_explorer_test.rs +++ b/rs/tests/consensus/cup_explorer_test.rs @@ -220,7 +220,7 @@ fn main() -> Result<()> { .add_test(systest!(test)) // The replica is restarted when the 
orchestrator observes the recovery CUP in the registry .update_orchestrator_metrics_to_check( - "orchestrator_replica_process_start_attempts_total", + r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#, 2, ) .execute_from_args()?; diff --git a/rs/tests/consensus/orchestrator/node_reassignment_test.rs b/rs/tests/consensus/orchestrator/node_reassignment_test.rs index b13945a3f708..7b3ab3f80942 100644 --- a/rs/tests/consensus/orchestrator/node_reassignment_test.rs +++ b/rs/tests/consensus/orchestrator/node_reassignment_test.rs @@ -361,7 +361,7 @@ fn main() -> Result<()> { // Some nodes change subnets twice in which case the replica process would be started // three times. .update_orchestrator_metrics_to_check( - "orchestrator_replica_process_start_attempts_total", + r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#, 3, ) .execute_from_args()?; diff --git a/rs/tests/consensus/replica_determinism_test.rs b/rs/tests/consensus/replica_determinism_test.rs index 13afa101aad2..afe87384a7ee 100644 --- a/rs/tests/consensus/replica_determinism_test.rs +++ b/rs/tests/consensus/replica_determinism_test.rs @@ -177,7 +177,7 @@ fn main() -> Result<()> { // it crashes again briefly during the catch-up process after the divergence. Consider reducing // this number if the underlying issue has been resolved. 
.update_orchestrator_metrics_to_check( - "orchestrator_replica_process_start_attempts_total", + r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#, 3, ) // One of the nodes has a corrupted state which could cause a panic in the replica like: diff --git a/rs/tests/consensus/subnet_recovery/sr_app_failover_nodes_enable_chain_keys_test.rs b/rs/tests/consensus/subnet_recovery/sr_app_failover_nodes_enable_chain_keys_test.rs index beb193772642..4091ce3bdfb6 100644 --- a/rs/tests/consensus/subnet_recovery/sr_app_failover_nodes_enable_chain_keys_test.rs +++ b/rs/tests/consensus/subnet_recovery/sr_app_failover_nodes_enable_chain_keys_test.rs @@ -12,7 +12,7 @@ fn main() -> Result<()> { .with_setup(setup) .add_test(systest!(test)) // The replica binary is "broken" and restarted by the orchestrator multiple times - .remove_metrics_to_check("orchestrator_replica_process_start_attempts_total") + .remove_metrics_to_check(r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#) .execute_from_args()?; Ok(()) } diff --git a/rs/tests/consensus/subnet_recovery/sr_app_failover_nodes_test.rs b/rs/tests/consensus/subnet_recovery/sr_app_failover_nodes_test.rs index 601cf91af225..8c2bcaf3dfd6 100644 --- a/rs/tests/consensus/subnet_recovery/sr_app_failover_nodes_test.rs +++ b/rs/tests/consensus/subnet_recovery/sr_app_failover_nodes_test.rs @@ -11,7 +11,7 @@ fn main() -> Result<()> { .with_setup(setup) .add_test(systest!(test; CupCorruption::CorruptedIncludingInvalidNiDkgId)) // The replica binary is "broken" and restarted by the orchestrator multiple times - .remove_metrics_to_check("orchestrator_replica_process_start_attempts_total") + .remove_metrics_to_check(r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#) // The test corrupts the CUPs, so it's expected that the following error metric will be // non-zero. 
.remove_metrics_to_check("orchestrator_cup_deserialization_failed_total") diff --git a/rs/tests/consensus/subnet_recovery/sr_app_failover_nodes_with_chain_keys_test.rs b/rs/tests/consensus/subnet_recovery/sr_app_failover_nodes_with_chain_keys_test.rs index 7e2a5eaa3933..9fbca5c945e4 100644 --- a/rs/tests/consensus/subnet_recovery/sr_app_failover_nodes_with_chain_keys_test.rs +++ b/rs/tests/consensus/subnet_recovery/sr_app_failover_nodes_with_chain_keys_test.rs @@ -13,7 +13,9 @@ fn main() -> Result<()> { .with_setup(setup) .add_test(systest!(test)) // The replica binary is "broken" and restarted by the orchestrator multiple times - .remove_metrics_to_check("orchestrator_replica_process_start_attempts_total") + .remove_metrics_to_check( + r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#, + ) .execute_from_args()?; Ok(()) } diff --git a/rs/tests/consensus/subnet_recovery/sr_app_large_no_upgrade_with_chain_keys_test.rs b/rs/tests/consensus/subnet_recovery/sr_app_large_no_upgrade_with_chain_keys_test.rs index 3f16021bb298..9ac3f4e8538f 100644 --- a/rs/tests/consensus/subnet_recovery/sr_app_large_no_upgrade_with_chain_keys_test.rs +++ b/rs/tests/consensus/subnet_recovery/sr_app_large_no_upgrade_with_chain_keys_test.rs @@ -14,7 +14,7 @@ fn main() -> Result<()> { .with_timeout_per_test(Duration::from_secs(50 * 60)) .add_test(systest!(test)) // The replica binary is "broken" and restarted by the orchestrator multiple times - .remove_metrics_to_check("orchestrator_replica_process_start_attempts_total") + .remove_metrics_to_check(r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#) .execute_from_args()?; Ok(()) } diff --git a/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_enable_chain_keys_test.rs b/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_enable_chain_keys_test.rs index f3fd7327abf5..1e6e214b566e 100644 --- a/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_enable_chain_keys_test.rs +++ 
b/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_enable_chain_keys_test.rs @@ -13,7 +13,7 @@ fn main() -> Result<()> { .with_setup(setup) .add_test(systest!(test; CupCorruption::NotCorrupted)) // The replica binary is "broken" and restarted by the orchestrator multiple times - .remove_metrics_to_check("orchestrator_replica_process_start_attempts_total") + .remove_metrics_to_check(r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#) .execute_from_args()?; Ok(()) } diff --git a/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_local_test.rs b/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_local_test.rs index a6b2bbb21d48..1e9b25b3ae10 100644 --- a/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_local_test.rs +++ b/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_local_test.rs @@ -11,7 +11,7 @@ fn main() -> Result<()> { .with_setup(setup) .add_test(systest!(test)) // The replica binary is "broken" and restarted by the orchestrator multiple times - .remove_metrics_to_check("orchestrator_replica_process_start_attempts_total") + .remove_metrics_to_check(r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#) .execute_from_args()?; Ok(()) } diff --git a/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_provision_write_access_test.rs b/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_provision_write_access_test.rs index ddf55e86284d..58181fac8402 100644 --- a/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_provision_write_access_test.rs +++ b/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_provision_write_access_test.rs @@ -11,7 +11,7 @@ fn main() -> Result<()> { .with_setup(setup) .add_test(systest!(test)) // The replica binary is "broken" and restarted by the orchestrator multiple times - .remove_metrics_to_check("orchestrator_replica_process_start_attempts_total") + .remove_metrics_to_check(r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#) .execute_from_args()?; 
Ok(()) } diff --git a/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_test.rs b/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_test.rs index f60f22b1ee8d..5da42d2ff937 100644 --- a/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_test.rs +++ b/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_test.rs @@ -11,7 +11,7 @@ fn main() -> Result<()> { .with_setup(setup) .add_test(systest!(test)) // The replica binary is "broken" and restarted by the orchestrator multiple times - .remove_metrics_to_check("orchestrator_replica_process_start_attempts_total") + .remove_metrics_to_check(r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#) .execute_from_args()?; Ok(()) } diff --git a/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_with_chain_keys_test.rs b/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_with_chain_keys_test.rs index 3da5ef6fc587..bee3a1ab7bcf 100644 --- a/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_with_chain_keys_test.rs +++ b/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_with_chain_keys_test.rs @@ -18,7 +18,7 @@ fn main() -> Result<()> { // non-zero. 
.remove_metrics_to_check("orchestrator_cup_deserialization_failed_total") // The replica binary is "broken" and restarted by the orchestrator multiple times - .remove_metrics_to_check("orchestrator_replica_process_start_attempts_total") + .remove_metrics_to_check(r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#) .execute_from_args()?; Ok(()) } diff --git a/rs/tests/consensus/subnet_recovery/sr_app_same_nodes_enable_chain_keys_test.rs b/rs/tests/consensus/subnet_recovery/sr_app_same_nodes_enable_chain_keys_test.rs index 911f483b4046..65677cd61890 100644 --- a/rs/tests/consensus/subnet_recovery/sr_app_same_nodes_enable_chain_keys_test.rs +++ b/rs/tests/consensus/subnet_recovery/sr_app_same_nodes_enable_chain_keys_test.rs @@ -12,7 +12,7 @@ fn main() -> Result<()> { .with_setup(setup) .add_test(systest!(test)) // The replica binary is "broken" and restarted by the orchestrator multiple times - .remove_metrics_to_check("orchestrator_replica_process_start_attempts_total") + .remove_metrics_to_check(r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#) .execute_from_args()?; Ok(()) } diff --git a/rs/tests/consensus/subnet_recovery/sr_app_same_nodes_test.rs b/rs/tests/consensus/subnet_recovery/sr_app_same_nodes_test.rs index 548910bdc5fb..06b769dfda5a 100644 --- a/rs/tests/consensus/subnet_recovery/sr_app_same_nodes_test.rs +++ b/rs/tests/consensus/subnet_recovery/sr_app_same_nodes_test.rs @@ -11,7 +11,7 @@ fn main() -> Result<()> { .with_setup(setup) .add_test(systest!(test; CupCorruption::NotCorrupted)) // The replica binary is "broken" and restarted by the orchestrator multiple times - .remove_metrics_to_check("orchestrator_replica_process_start_attempts_total") + .remove_metrics_to_check(r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#) .execute_from_args()?; Ok(()) } diff --git a/rs/tests/consensus/subnet_recovery/sr_app_same_nodes_with_chain_keys_test.rs 
b/rs/tests/consensus/subnet_recovery/sr_app_same_nodes_with_chain_keys_test.rs index 621aa18ff680..de8ca693b769 100644 --- a/rs/tests/consensus/subnet_recovery/sr_app_same_nodes_with_chain_keys_test.rs +++ b/rs/tests/consensus/subnet_recovery/sr_app_same_nodes_with_chain_keys_test.rs @@ -13,7 +13,7 @@ fn main() -> Result<()> { .with_setup(setup) .add_test(systest!(test)) // The replica binary is "broken" and restarted by the orchestrator multiple times - .remove_metrics_to_check("orchestrator_replica_process_start_attempts_total") + .remove_metrics_to_check(r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#) .execute_from_args()?; Ok(()) } diff --git a/rs/tests/consensus/subnet_recovery/sr_nns_failover_nodes_test.rs b/rs/tests/consensus/subnet_recovery/sr_nns_failover_nodes_test.rs index 8f1617c01882..f9f14c117d57 100644 --- a/rs/tests/consensus/subnet_recovery/sr_nns_failover_nodes_test.rs +++ b/rs/tests/consensus/subnet_recovery/sr_nns_failover_nodes_test.rs @@ -62,7 +62,7 @@ fn main() -> Result<()> { .with_setup(setup) .add_test(systest!(test)) // The replica binary is "broken" and restarted by the orchestrator multiple times - .remove_metrics_to_check("orchestrator_replica_process_start_attempts_total") + .remove_metrics_to_check(r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#) .execute_from_args()?; Ok(()) } diff --git a/rs/tests/consensus/subnet_splitting_test.rs b/rs/tests/consensus/subnet_splitting_test.rs index ded0db973349..f49c72c06a5c 100644 --- a/rs/tests/consensus/subnet_splitting_test.rs +++ b/rs/tests/consensus/subnet_splitting_test.rs @@ -449,7 +449,7 @@ fn main() -> Result<()> { .add_test(systest!(subnet_splitting_test)) // The replica is restarted when the orchestrator observes the recovery CUP in the registry .update_orchestrator_metrics_to_check( - "orchestrator_replica_process_start_attempts_total", + r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#, 2, ) 
.execute_from_args() diff --git a/rs/tests/driver/src/driver/group.rs b/rs/tests/driver/src/driver/group.rs index 35746188c385..3fa103c38ea8 100644 --- a/rs/tests/driver/src/driver/group.rs +++ b/rs/tests/driver/src/driver/group.rs @@ -663,8 +663,14 @@ fn default_orchestrator_metrics() -> BTreeMap<&'static str, u64> { ("orchestrator_cup_deserialization_failed_total", 0), ("orchestrator_state_removal_failed_total", 0), ("orchestrator_tasks_failed_total", 0), - // TODO: adapt me (and all system tests that use me) - ("orchestrator_replica_process_start_attempts_total", 1), + ( + r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#, + 1, + ), + ( + r#"orchestrator_processes_start_attempts_total{process_name="ic-gateway"}"#, + 1, + ), ]) } From 11138a74381f99d13583733f601882e8e7d18cf8 Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Wed, 17 Jun 2026 13:19:11 +0000 Subject: [PATCH 12/42] re-trigger CI From b73a95bb500251e754834f07d3899e1e183df4e3 Mon Sep 17 00:00:00 2001 From: IDX GitHub Automation Date: Wed, 17 Jun 2026 13:24:06 +0000 Subject: [PATCH 13/42] Automatically fixing code for linting and formatting issues --- rs/orchestrator/src/boundary_node.rs | 5 ++++- .../sr_app_failover_nodes_enable_chain_keys_test.rs | 4 +++- .../consensus/subnet_recovery/sr_app_failover_nodes_test.rs | 4 +++- .../sr_app_large_no_upgrade_with_chain_keys_test.rs | 4 +++- .../sr_app_no_upgrade_enable_chain_keys_test.rs | 4 +++- .../subnet_recovery/sr_app_no_upgrade_local_test.rs | 4 +++- .../sr_app_no_upgrade_provision_write_access_test.rs | 4 +++- rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_test.rs | 4 +++- .../sr_app_no_upgrade_with_chain_keys_test.rs | 4 +++- .../sr_app_same_nodes_enable_chain_keys_test.rs | 4 +++- rs/tests/consensus/subnet_recovery/sr_app_same_nodes_test.rs | 4 +++- .../sr_app_same_nodes_with_chain_keys_test.rs | 4 +++- .../consensus/subnet_recovery/sr_nns_failover_nodes_test.rs | 4 +++- 13 files changed, 40 insertions(+), 13 deletions(-) 
diff --git a/rs/orchestrator/src/boundary_node.rs b/rs/orchestrator/src/boundary_node.rs index 473344ab23be..314fa312ebcc 100644 --- a/rs/orchestrator/src/boundary_node.rs +++ b/rs/orchestrator/src/boundary_node.rs @@ -1,5 +1,8 @@ use crate::{ - error::OrchestratorError, process_manager::Process, processes::{IcBoundaryManager, IcBoundaryProcess}, registry_helper::RegistryHelper + error::OrchestratorError, + process_manager::Process, + processes::{IcBoundaryManager, IcBoundaryProcess}, + registry_helper::RegistryHelper, }; use ic_logger::{ReplicaLogger, warn}; use ic_types::{NodeId, ReplicaVersion}; diff --git a/rs/tests/consensus/subnet_recovery/sr_app_failover_nodes_enable_chain_keys_test.rs b/rs/tests/consensus/subnet_recovery/sr_app_failover_nodes_enable_chain_keys_test.rs index 4091ce3bdfb6..7b21db3ddebf 100644 --- a/rs/tests/consensus/subnet_recovery/sr_app_failover_nodes_enable_chain_keys_test.rs +++ b/rs/tests/consensus/subnet_recovery/sr_app_failover_nodes_enable_chain_keys_test.rs @@ -12,7 +12,9 @@ fn main() -> Result<()> { .with_setup(setup) .add_test(systest!(test)) // The replica binary is "broken" and restarted by the orchestrator multiple times - .remove_metrics_to_check(r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#) + .remove_metrics_to_check( + r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#, + ) .execute_from_args()?; Ok(()) } diff --git a/rs/tests/consensus/subnet_recovery/sr_app_failover_nodes_test.rs b/rs/tests/consensus/subnet_recovery/sr_app_failover_nodes_test.rs index 8c2bcaf3dfd6..9c7a7678d0be 100644 --- a/rs/tests/consensus/subnet_recovery/sr_app_failover_nodes_test.rs +++ b/rs/tests/consensus/subnet_recovery/sr_app_failover_nodes_test.rs @@ -11,7 +11,9 @@ fn main() -> Result<()> { .with_setup(setup) .add_test(systest!(test; CupCorruption::CorruptedIncludingInvalidNiDkgId)) // The replica binary is "broken" and restarted by the orchestrator multiple times - 
.remove_metrics_to_check(r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#) + .remove_metrics_to_check( + r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#, + ) // The test corrupts the CUPs, so it's expected that the following error metric will be // non-zero. .remove_metrics_to_check("orchestrator_cup_deserialization_failed_total") diff --git a/rs/tests/consensus/subnet_recovery/sr_app_large_no_upgrade_with_chain_keys_test.rs b/rs/tests/consensus/subnet_recovery/sr_app_large_no_upgrade_with_chain_keys_test.rs index 9ac3f4e8538f..37f98e15c797 100644 --- a/rs/tests/consensus/subnet_recovery/sr_app_large_no_upgrade_with_chain_keys_test.rs +++ b/rs/tests/consensus/subnet_recovery/sr_app_large_no_upgrade_with_chain_keys_test.rs @@ -14,7 +14,9 @@ fn main() -> Result<()> { .with_timeout_per_test(Duration::from_secs(50 * 60)) .add_test(systest!(test)) // The replica binary is "broken" and restarted by the orchestrator multiple times - .remove_metrics_to_check(r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#) + .remove_metrics_to_check( + r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#, + ) .execute_from_args()?; Ok(()) } diff --git a/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_enable_chain_keys_test.rs b/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_enable_chain_keys_test.rs index 1e6e214b566e..4e178e0cb6c2 100644 --- a/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_enable_chain_keys_test.rs +++ b/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_enable_chain_keys_test.rs @@ -13,7 +13,9 @@ fn main() -> Result<()> { .with_setup(setup) .add_test(systest!(test; CupCorruption::NotCorrupted)) // The replica binary is "broken" and restarted by the orchestrator multiple times - .remove_metrics_to_check(r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#) + .remove_metrics_to_check( + 
r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#, + ) .execute_from_args()?; Ok(()) } diff --git a/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_local_test.rs b/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_local_test.rs index 1e9b25b3ae10..bf5b509e0725 100644 --- a/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_local_test.rs +++ b/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_local_test.rs @@ -11,7 +11,9 @@ fn main() -> Result<()> { .with_setup(setup) .add_test(systest!(test)) // The replica binary is "broken" and restarted by the orchestrator multiple times - .remove_metrics_to_check(r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#) + .remove_metrics_to_check( + r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#, + ) .execute_from_args()?; Ok(()) } diff --git a/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_provision_write_access_test.rs b/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_provision_write_access_test.rs index 58181fac8402..1dc50a36807d 100644 --- a/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_provision_write_access_test.rs +++ b/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_provision_write_access_test.rs @@ -11,7 +11,9 @@ fn main() -> Result<()> { .with_setup(setup) .add_test(systest!(test)) // The replica binary is "broken" and restarted by the orchestrator multiple times - .remove_metrics_to_check(r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#) + .remove_metrics_to_check( + r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#, + ) .execute_from_args()?; Ok(()) } diff --git a/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_test.rs b/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_test.rs index 5da42d2ff937..a0e1180a6be8 100644 --- a/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_test.rs +++ b/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_test.rs @@ -11,7 +11,9 
@@ fn main() -> Result<()> { .with_setup(setup) .add_test(systest!(test)) // The replica binary is "broken" and restarted by the orchestrator multiple times - .remove_metrics_to_check(r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#) + .remove_metrics_to_check( + r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#, + ) .execute_from_args()?; Ok(()) } diff --git a/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_with_chain_keys_test.rs b/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_with_chain_keys_test.rs index bee3a1ab7bcf..bf6ce79b2ec5 100644 --- a/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_with_chain_keys_test.rs +++ b/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_with_chain_keys_test.rs @@ -18,7 +18,9 @@ fn main() -> Result<()> { // non-zero. .remove_metrics_to_check("orchestrator_cup_deserialization_failed_total") // The replica binary is "broken" and restarted by the orchestrator multiple times - .remove_metrics_to_check(r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#) + .remove_metrics_to_check( + r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#, + ) .execute_from_args()?; Ok(()) } diff --git a/rs/tests/consensus/subnet_recovery/sr_app_same_nodes_enable_chain_keys_test.rs b/rs/tests/consensus/subnet_recovery/sr_app_same_nodes_enable_chain_keys_test.rs index 65677cd61890..a969a7801d4b 100644 --- a/rs/tests/consensus/subnet_recovery/sr_app_same_nodes_enable_chain_keys_test.rs +++ b/rs/tests/consensus/subnet_recovery/sr_app_same_nodes_enable_chain_keys_test.rs @@ -12,7 +12,9 @@ fn main() -> Result<()> { .with_setup(setup) .add_test(systest!(test)) // The replica binary is "broken" and restarted by the orchestrator multiple times - .remove_metrics_to_check(r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#) + .remove_metrics_to_check( + r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#, + ) 
.execute_from_args()?; Ok(()) } diff --git a/rs/tests/consensus/subnet_recovery/sr_app_same_nodes_test.rs b/rs/tests/consensus/subnet_recovery/sr_app_same_nodes_test.rs index 06b769dfda5a..a4e355867cf7 100644 --- a/rs/tests/consensus/subnet_recovery/sr_app_same_nodes_test.rs +++ b/rs/tests/consensus/subnet_recovery/sr_app_same_nodes_test.rs @@ -11,7 +11,9 @@ fn main() -> Result<()> { .with_setup(setup) .add_test(systest!(test; CupCorruption::NotCorrupted)) // The replica binary is "broken" and restarted by the orchestrator multiple times - .remove_metrics_to_check(r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#) + .remove_metrics_to_check( + r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#, + ) .execute_from_args()?; Ok(()) } diff --git a/rs/tests/consensus/subnet_recovery/sr_app_same_nodes_with_chain_keys_test.rs b/rs/tests/consensus/subnet_recovery/sr_app_same_nodes_with_chain_keys_test.rs index de8ca693b769..899d9b0ecc26 100644 --- a/rs/tests/consensus/subnet_recovery/sr_app_same_nodes_with_chain_keys_test.rs +++ b/rs/tests/consensus/subnet_recovery/sr_app_same_nodes_with_chain_keys_test.rs @@ -13,7 +13,9 @@ fn main() -> Result<()> { .with_setup(setup) .add_test(systest!(test)) // The replica binary is "broken" and restarted by the orchestrator multiple times - .remove_metrics_to_check(r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#) + .remove_metrics_to_check( + r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#, + ) .execute_from_args()?; Ok(()) } diff --git a/rs/tests/consensus/subnet_recovery/sr_nns_failover_nodes_test.rs b/rs/tests/consensus/subnet_recovery/sr_nns_failover_nodes_test.rs index f9f14c117d57..b55fccd84c40 100644 --- a/rs/tests/consensus/subnet_recovery/sr_nns_failover_nodes_test.rs +++ b/rs/tests/consensus/subnet_recovery/sr_nns_failover_nodes_test.rs @@ -62,7 +62,9 @@ fn main() -> Result<()> { .with_setup(setup) .add_test(systest!(test)) // The 
replica binary is "broken" and restarted by the orchestrator multiple times - .remove_metrics_to_check(r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#) + .remove_metrics_to_check( + r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#, + ) .execute_from_args()?; Ok(()) } From 8a5a43eb8ad366fa3dc13ac2d5fb226129f9d24c Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Wed, 17 Jun 2026 13:32:57 +0000 Subject: [PATCH 14/42] style: check metric of all orchestrator processes --- rs/tests/consensus/cup_explorer_test.rs | 5 +---- .../consensus/orchestrator/node_reassignment_test.rs | 5 +---- rs/tests/consensus/replica_determinism_test.rs | 5 +---- .../sr_app_failover_nodes_enable_chain_keys_test.rs | 4 +--- .../subnet_recovery/sr_app_failover_nodes_test.rs | 4 +--- .../sr_app_failover_nodes_with_chain_keys_test.rs | 4 +--- .../sr_app_large_no_upgrade_with_chain_keys_test.rs | 4 +--- .../sr_app_no_upgrade_enable_chain_keys_test.rs | 4 +--- .../subnet_recovery/sr_app_no_upgrade_local_test.rs | 4 +--- .../sr_app_no_upgrade_provision_write_access_test.rs | 4 +--- .../consensus/subnet_recovery/sr_app_no_upgrade_test.rs | 4 +--- .../sr_app_no_upgrade_with_chain_keys_test.rs | 4 +--- .../sr_app_same_nodes_enable_chain_keys_test.rs | 4 +--- .../consensus/subnet_recovery/sr_app_same_nodes_test.rs | 4 +--- .../sr_app_same_nodes_with_chain_keys_test.rs | 4 +--- .../subnet_recovery/sr_nns_failover_nodes_test.rs | 4 +--- rs/tests/consensus/subnet_splitting_test.rs | 5 +---- rs/tests/driver/src/driver/group.rs | 9 +-------- 18 files changed, 18 insertions(+), 63 deletions(-) diff --git a/rs/tests/consensus/cup_explorer_test.rs b/rs/tests/consensus/cup_explorer_test.rs index f0ac0f5633dc..d5ace6df819d 100644 --- a/rs/tests/consensus/cup_explorer_test.rs +++ b/rs/tests/consensus/cup_explorer_test.rs @@ -219,10 +219,7 @@ fn main() -> Result<()> { .with_setup(setup) .add_test(systest!(test)) // The replica is restarted when the orchestrator 
observes the recovery CUP in the registry - .update_orchestrator_metrics_to_check( - r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#, - 2, - ) + .update_orchestrator_metrics_to_check("orchestrator_processes_start_attempts_total", 2) .execute_from_args()?; Ok(()) } diff --git a/rs/tests/consensus/orchestrator/node_reassignment_test.rs b/rs/tests/consensus/orchestrator/node_reassignment_test.rs index 7b3ab3f80942..26c16a82ab0d 100644 --- a/rs/tests/consensus/orchestrator/node_reassignment_test.rs +++ b/rs/tests/consensus/orchestrator/node_reassignment_test.rs @@ -360,10 +360,7 @@ fn main() -> Result<()> { .add_test(systest!(test)) // Some nodes change subnets twice in which case the replica process would be started // three times. - .update_orchestrator_metrics_to_check( - r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#, - 3, - ) + .update_orchestrator_metrics_to_check("orchestrator_processes_start_attempts_total", 3) .execute_from_args()?; Ok(()) diff --git a/rs/tests/consensus/replica_determinism_test.rs b/rs/tests/consensus/replica_determinism_test.rs index afe87384a7ee..39097c02dfa1 100644 --- a/rs/tests/consensus/replica_determinism_test.rs +++ b/rs/tests/consensus/replica_determinism_test.rs @@ -176,10 +176,7 @@ fn main() -> Result<()> { // TODO(DSM-118): The replica may occasionally be started 3 times (instead of the usual 2) if // it crashes again briefly during the catch-up process after the divergence. Consider reducing // this number if the underlying issue has been resolved. - .update_orchestrator_metrics_to_check( - r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#, - 3, - ) + .update_orchestrator_metrics_to_check("orchestrator_processes_start_attempts_total", 3) // One of the nodes has a corrupted state which could cause a panic in the replica like: // thread 'MR Batch Processor' (1588) panicked at rs/state_manager/src/lib.rs:1036:17: // Unexpected sandbox state for canister ... 
diff --git a/rs/tests/consensus/subnet_recovery/sr_app_failover_nodes_enable_chain_keys_test.rs b/rs/tests/consensus/subnet_recovery/sr_app_failover_nodes_enable_chain_keys_test.rs index 7b21db3ddebf..1ae1902b55fc 100644 --- a/rs/tests/consensus/subnet_recovery/sr_app_failover_nodes_enable_chain_keys_test.rs +++ b/rs/tests/consensus/subnet_recovery/sr_app_failover_nodes_enable_chain_keys_test.rs @@ -12,9 +12,7 @@ fn main() -> Result<()> { .with_setup(setup) .add_test(systest!(test)) // The replica binary is "broken" and restarted by the orchestrator multiple times - .remove_metrics_to_check( - r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#, - ) + .remove_metrics_to_check("orchestrator_processes_start_attempts_total") .execute_from_args()?; Ok(()) } diff --git a/rs/tests/consensus/subnet_recovery/sr_app_failover_nodes_test.rs b/rs/tests/consensus/subnet_recovery/sr_app_failover_nodes_test.rs index 9c7a7678d0be..6674a4aa038e 100644 --- a/rs/tests/consensus/subnet_recovery/sr_app_failover_nodes_test.rs +++ b/rs/tests/consensus/subnet_recovery/sr_app_failover_nodes_test.rs @@ -11,9 +11,7 @@ fn main() -> Result<()> { .with_setup(setup) .add_test(systest!(test; CupCorruption::CorruptedIncludingInvalidNiDkgId)) // The replica binary is "broken" and restarted by the orchestrator multiple times - .remove_metrics_to_check( - r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#, - ) + .remove_metrics_to_check("orchestrator_processes_start_attempts_total") // The test corrupts the CUPs, so it's expected that the following error metric will be // non-zero. 
.remove_metrics_to_check("orchestrator_cup_deserialization_failed_total") diff --git a/rs/tests/consensus/subnet_recovery/sr_app_failover_nodes_with_chain_keys_test.rs b/rs/tests/consensus/subnet_recovery/sr_app_failover_nodes_with_chain_keys_test.rs index 9fbca5c945e4..06264be5847a 100644 --- a/rs/tests/consensus/subnet_recovery/sr_app_failover_nodes_with_chain_keys_test.rs +++ b/rs/tests/consensus/subnet_recovery/sr_app_failover_nodes_with_chain_keys_test.rs @@ -13,9 +13,7 @@ fn main() -> Result<()> { .with_setup(setup) .add_test(systest!(test)) // The replica binary is "broken" and restarted by the orchestrator multiple times - .remove_metrics_to_check( - r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#, - ) + .remove_metrics_to_check("orchestrator_processes_start_attempts_total") .execute_from_args()?; Ok(()) } diff --git a/rs/tests/consensus/subnet_recovery/sr_app_large_no_upgrade_with_chain_keys_test.rs b/rs/tests/consensus/subnet_recovery/sr_app_large_no_upgrade_with_chain_keys_test.rs index 37f98e15c797..aff5d13b5d12 100644 --- a/rs/tests/consensus/subnet_recovery/sr_app_large_no_upgrade_with_chain_keys_test.rs +++ b/rs/tests/consensus/subnet_recovery/sr_app_large_no_upgrade_with_chain_keys_test.rs @@ -14,9 +14,7 @@ fn main() -> Result<()> { .with_timeout_per_test(Duration::from_secs(50 * 60)) .add_test(systest!(test)) // The replica binary is "broken" and restarted by the orchestrator multiple times - .remove_metrics_to_check( - r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#, - ) + .remove_metrics_to_check("orchestrator_processes_start_attempts_total") .execute_from_args()?; Ok(()) } diff --git a/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_enable_chain_keys_test.rs b/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_enable_chain_keys_test.rs index 4e178e0cb6c2..ea6c785d1bd6 100644 --- a/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_enable_chain_keys_test.rs +++ 
b/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_enable_chain_keys_test.rs @@ -13,9 +13,7 @@ fn main() -> Result<()> { .with_setup(setup) .add_test(systest!(test; CupCorruption::NotCorrupted)) // The replica binary is "broken" and restarted by the orchestrator multiple times - .remove_metrics_to_check( - r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#, - ) + .remove_metrics_to_check("orchestrator_processes_start_attempts_total") .execute_from_args()?; Ok(()) } diff --git a/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_local_test.rs b/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_local_test.rs index bf5b509e0725..2f0c93b1acf0 100644 --- a/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_local_test.rs +++ b/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_local_test.rs @@ -11,9 +11,7 @@ fn main() -> Result<()> { .with_setup(setup) .add_test(systest!(test)) // The replica binary is "broken" and restarted by the orchestrator multiple times - .remove_metrics_to_check( - r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#, - ) + .remove_metrics_to_check("orchestrator_processes_start_attempts_total") .execute_from_args()?; Ok(()) } diff --git a/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_provision_write_access_test.rs b/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_provision_write_access_test.rs index 1dc50a36807d..1e06e1597061 100644 --- a/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_provision_write_access_test.rs +++ b/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_provision_write_access_test.rs @@ -11,9 +11,7 @@ fn main() -> Result<()> { .with_setup(setup) .add_test(systest!(test)) // The replica binary is "broken" and restarted by the orchestrator multiple times - .remove_metrics_to_check( - r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#, - ) + .remove_metrics_to_check("orchestrator_processes_start_attempts_total") .execute_from_args()?; 
Ok(()) } diff --git a/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_test.rs b/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_test.rs index a0e1180a6be8..f58420a5e281 100644 --- a/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_test.rs +++ b/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_test.rs @@ -11,9 +11,7 @@ fn main() -> Result<()> { .with_setup(setup) .add_test(systest!(test)) // The replica binary is "broken" and restarted by the orchestrator multiple times - .remove_metrics_to_check( - r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#, - ) + .remove_metrics_to_check("orchestrator_processes_start_attempts_total") .execute_from_args()?; Ok(()) } diff --git a/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_with_chain_keys_test.rs b/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_with_chain_keys_test.rs index bf6ce79b2ec5..3b36bb38e043 100644 --- a/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_with_chain_keys_test.rs +++ b/rs/tests/consensus/subnet_recovery/sr_app_no_upgrade_with_chain_keys_test.rs @@ -18,9 +18,7 @@ fn main() -> Result<()> { // non-zero. 
.remove_metrics_to_check("orchestrator_cup_deserialization_failed_total") // The replica binary is "broken" and restarted by the orchestrator multiple times - .remove_metrics_to_check( - r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#, - ) + .remove_metrics_to_check("orchestrator_processes_start_attempts_total") .execute_from_args()?; Ok(()) } diff --git a/rs/tests/consensus/subnet_recovery/sr_app_same_nodes_enable_chain_keys_test.rs b/rs/tests/consensus/subnet_recovery/sr_app_same_nodes_enable_chain_keys_test.rs index a969a7801d4b..9591dd26d37c 100644 --- a/rs/tests/consensus/subnet_recovery/sr_app_same_nodes_enable_chain_keys_test.rs +++ b/rs/tests/consensus/subnet_recovery/sr_app_same_nodes_enable_chain_keys_test.rs @@ -12,9 +12,7 @@ fn main() -> Result<()> { .with_setup(setup) .add_test(systest!(test)) // The replica binary is "broken" and restarted by the orchestrator multiple times - .remove_metrics_to_check( - r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#, - ) + .remove_metrics_to_check("orchestrator_processes_start_attempts_total") .execute_from_args()?; Ok(()) } diff --git a/rs/tests/consensus/subnet_recovery/sr_app_same_nodes_test.rs b/rs/tests/consensus/subnet_recovery/sr_app_same_nodes_test.rs index a4e355867cf7..97249bf7418c 100644 --- a/rs/tests/consensus/subnet_recovery/sr_app_same_nodes_test.rs +++ b/rs/tests/consensus/subnet_recovery/sr_app_same_nodes_test.rs @@ -11,9 +11,7 @@ fn main() -> Result<()> { .with_setup(setup) .add_test(systest!(test; CupCorruption::NotCorrupted)) // The replica binary is "broken" and restarted by the orchestrator multiple times - .remove_metrics_to_check( - r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#, - ) + .remove_metrics_to_check("orchestrator_processes_start_attempts_total") .execute_from_args()?; Ok(()) } diff --git a/rs/tests/consensus/subnet_recovery/sr_app_same_nodes_with_chain_keys_test.rs 
b/rs/tests/consensus/subnet_recovery/sr_app_same_nodes_with_chain_keys_test.rs index 899d9b0ecc26..af83521a8e2a 100644 --- a/rs/tests/consensus/subnet_recovery/sr_app_same_nodes_with_chain_keys_test.rs +++ b/rs/tests/consensus/subnet_recovery/sr_app_same_nodes_with_chain_keys_test.rs @@ -13,9 +13,7 @@ fn main() -> Result<()> { .with_setup(setup) .add_test(systest!(test)) // The replica binary is "broken" and restarted by the orchestrator multiple times - .remove_metrics_to_check( - r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#, - ) + .remove_metrics_to_check("orchestrator_processes_start_attempts_total") .execute_from_args()?; Ok(()) } diff --git a/rs/tests/consensus/subnet_recovery/sr_nns_failover_nodes_test.rs b/rs/tests/consensus/subnet_recovery/sr_nns_failover_nodes_test.rs index b55fccd84c40..2774957df6df 100644 --- a/rs/tests/consensus/subnet_recovery/sr_nns_failover_nodes_test.rs +++ b/rs/tests/consensus/subnet_recovery/sr_nns_failover_nodes_test.rs @@ -62,9 +62,7 @@ fn main() -> Result<()> { .with_setup(setup) .add_test(systest!(test)) // The replica binary is "broken" and restarted by the orchestrator multiple times - .remove_metrics_to_check( - r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#, - ) + .remove_metrics_to_check("orchestrator_processes_start_attempts_total") .execute_from_args()?; Ok(()) } diff --git a/rs/tests/consensus/subnet_splitting_test.rs b/rs/tests/consensus/subnet_splitting_test.rs index f49c72c06a5c..737dc849c491 100644 --- a/rs/tests/consensus/subnet_splitting_test.rs +++ b/rs/tests/consensus/subnet_splitting_test.rs @@ -448,9 +448,6 @@ fn main() -> Result<()> { .with_setup(setup) .add_test(systest!(subnet_splitting_test)) // The replica is restarted when the orchestrator observes the recovery CUP in the registry - .update_orchestrator_metrics_to_check( - r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#, - 2, - ) + 
.update_orchestrator_metrics_to_check("orchestrator_processes_start_attempts_total", 2) .execute_from_args() } diff --git a/rs/tests/driver/src/driver/group.rs b/rs/tests/driver/src/driver/group.rs index 3fa103c38ea8..75c626271ab1 100644 --- a/rs/tests/driver/src/driver/group.rs +++ b/rs/tests/driver/src/driver/group.rs @@ -663,14 +663,7 @@ fn default_orchestrator_metrics() -> BTreeMap<&'static str, u64> { ("orchestrator_cup_deserialization_failed_total", 0), ("orchestrator_state_removal_failed_total", 0), ("orchestrator_tasks_failed_total", 0), - ( - r#"orchestrator_processes_start_attempts_total{process_name="replica"}"#, - 1, - ), - ( - r#"orchestrator_processes_start_attempts_total{process_name="ic-gateway"}"#, - 1, - ), + ("orchestrator_processes_start_attempts_total", 1), ]) } From fa1dd8526e05184980dd44cb36cc28144efbc932 Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Wed, 17 Jun 2026 14:59:22 +0000 Subject: [PATCH 15/42] feat: support checking metrics prefixes --- rs/tests/driver/src/driver/group.rs | 4 ++++ rs/tests/driver/src/driver/test_env_api.rs | 7 +++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/rs/tests/driver/src/driver/group.rs b/rs/tests/driver/src/driver/group.rs index 75c626271ab1..a177d43f611e 100644 --- a/rs/tests/driver/src/driver/group.rs +++ b/rs/tests/driver/src/driver/group.rs @@ -647,6 +647,8 @@ impl SystemTestSubGroup { } } +// Replica metrics to check by default. Including a prefix is supported and will match on all +// metrics with that prefix. For that reason, keep the list prefix-free. fn default_replica_metrics() -> BTreeMap<&'static str, u64> { BTreeMap::from([ ("critical_errors", 0), @@ -658,6 +660,8 @@ fn default_replica_metrics() -> BTreeMap<&'static str, u64> { ]) } +// Orchestrator metrics to check by default. Including a prefix is supported and will match on all +// metrics with that prefix. For that reason, keep the list prefix-free. 
fn default_orchestrator_metrics() -> BTreeMap<&'static str, u64> { BTreeMap::from([ ("orchestrator_cup_deserialization_failed_total", 0), diff --git a/rs/tests/driver/src/driver/test_env_api.rs b/rs/tests/driver/src/driver/test_env_api.rs index 7c06d9df1fa3..338c5d29bc91 100644 --- a/rs/tests/driver/src/driver/test_env_api.rs +++ b/rs/tests/driver/src/driver/test_env_api.rs @@ -1138,9 +1138,12 @@ impl IcNodeSnapshot { self.node_id ); for (name, value) in metrics { + // Assume the metrics to check are prefix-free. This allows specifying a metric name + // prefix to check all metrics with that prefix. let max_value = metrics_to_check - .get(name.split('(').next().unwrap()) - .copied() + .iter() + .find(|(metric_name, _)| name.starts_with(**metric_name)) + .map(|(_, max_value)| *max_value) .unwrap_or_default(); assert!( value[0] <= max_value, From 3adc57243f612936786a7da9b01389a6779ff0e1 Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Wed, 17 Jun 2026 15:05:38 +0000 Subject: [PATCH 16/42] feat: ic-gateway metrics --- ic-os/components/guestos/share/ic-gateway.env | 1 + .../config/tool/templates/ic.json5.template | 239 +++++++++--------- ...nftables_assigned_cloud_engine.conf.golden | 6 +- .../nftables_assigned_replica.conf.golden | 4 +- ...tables_unassigned_cloud_engine.conf.golden | 4 +- .../nftables_unassigned_replica.conf.golden | 4 +- 6 files changed, 130 insertions(+), 128 deletions(-) diff --git a/ic-os/components/guestos/share/ic-gateway.env b/ic-os/components/guestos/share/ic-gateway.env index a40c1845ed05..67aef4b25b29 100644 --- a/ic-os/components/guestos/share/ic-gateway.env +++ b/ic-os/components/guestos/share/ic-gateway.env @@ -1,4 +1,5 @@ LISTEN_PLAIN=[::]:80 LISTEN_INSECURE_SERVE_HTTP_ONLY=true +METRICS_LISTEN=[::]:9314 IC_URL=http://127.0.0.1:8080 DOMAIN=gateway.icp diff --git a/rs/ic_os/config/tool/templates/ic.json5.template b/rs/ic_os/config/tool/templates/ic.json5.template index 4a77df13e795..ea6a436e9014 100644 ---
a/rs/ic_os/config/tool/templates/ic.json5.template +++ b/rs/ic_os/config/tool/templates/ic.json5.template @@ -327,15 +327,22 @@ table ip6 filter {\n\ whitelisted_nodes_udp_ports_whitelist: [4100], all_nodes_tcp_ports_whitelist: [2497], all_nodes_udp_ports_whitelist: [], - ports_for_http_adapter_blacklist: [22, 2497, 4100, 7070, 8080, 9090, 9091, 9100, 19100, 19523, 19531], + ports_for_http_adapter_blacklist: [22, 2497, 4100, 7070, 8080, 9090, 9091, 9100, 9314, 9324, 19100, 19522, 19523, 19531], max_simultaneous_connections_per_ip_address: 1000, }, - boundary_node_firewall: { + cloud_engine_firewall: { config_file: "/run/ic-node/nftables-ruleset/nftables.conf", file_template: "flush ruleset\n\ \n\ table filter {\n\ + define icmp_v4_types_accept = {\n\ + destination-unreachable,\n\ + time-exceeded,\n\ + echo-request,\n\ + echo-reply,\n\ + }\n\ +\n\ set rate_limit {\n\ type ipv4_addr\n\ size 65535\n\ @@ -360,18 +367,18 @@ table filter {\n\ type filter hook input priority 0; policy drop;\n\ iif lo accept\n\ ip saddr @blackhole drop\n\ - ct state new add @rate_limit { ip saddr limit rate over 2000/minute burst 1000 packets } counter name rate_limit_v4_counter drop\n\ + ct state new add @rate_limit { ip saddr limit rate over 1000/minute burst 500 packets } counter name rate_limit_v4_counter drop\n\ # Notes about the rule below:\n\ # - The rule allows a maximum of <> persistent connections to any ip address.\n\ # - The rule drops all new connections that goes over the configured limit.\n\ ct state new add @connection_limit { ip saddr ct count over <> } counter name connection_limit_v4_counter drop\n\ - icmp type { echo-reply, destination-unreachable, source-quench, echo-request, time-exceeded } accept\n\ - ct state invalid drop\n\ - ct state { established, related } accept\n\ - ip saddr { 0.0.0.0-255.255.255.255 } ct state new tcp dport 443 accept\n\ -\n\ + icmp type $icmp_v4_types_accept accept\n\ <>\n\ <>\n\ + ct state { invalid } drop\n\ + ct state { established, 
related } accept\n\ + ip saddr { 0.0.0.0-255.255.255.255 } ct state new tcp dport 80 accept\n\ + log prefix \"Drop - default policy: \"\n\ }\n\ \n\ chain FORWARD {\n\ @@ -379,17 +386,24 @@ table filter {\n\ }\n\ \n\ chain OUTPUT {\n\ - type filter hook output priority 0; policy drop;\n\ - oif \"lo\" accept\n\ - icmp type { echo-reply, destination-unreachable, source-quench, echo-request, time-exceeded } accept\n\ - ct state invalid drop\n\ - ct state { established, related } accept\n\ - ct state new tcp dport { 53, 80, 443, 8080 } accept\n\ - ct state new udp dport { 53, 123 } accept\n\ + type filter hook output priority 0; policy accept;\n\ + meta skuid ic-http-adapter ip daddr { 127.0.0.0/8 } ct state { new } tcp dport { 1-19999 } reject # Block restricted localhost ic-http-adapter HTTPS access\n\ + <>\n\ }\n\ }\n\ \n\ table ip6 filter {\n\ + define icmp_v6_types_accept = {\n\ + destination-unreachable,\n\ + packet-too-big,\n\ + time-exceeded,\n\ + echo-request,\n\ + echo-reply,\n\ + nd-router-advert,\n\ + nd-neighbor-solicit,\n\ + nd-neighbor-advert,\n\ + }\n\ +\n\ set rate_limit {\n\ type ipv6_addr\n\ size 65535\n\ @@ -414,19 +428,25 @@ table ip6 filter {\n\ type filter hook input priority 0; policy drop;\n\ iif lo accept\n\ ip6 saddr @blackhole6 drop\n\ - ct state new add @rate_limit { ip6 saddr limit rate over 2000/minute burst 1000 packets } counter name rate_limit_v6_counter drop\n\ + ct state { invalid } drop\n\ + ct state { established, related } accept\n\ + ct state new add @rate_limit { ip6 saddr limit rate over 1000/minute burst 500 packets } counter name rate_limit_v6_counter drop\n\ # Notes about the rule below:\n\ # - The rule allows a maximum of <> persistent connections to any ip6 address.\n\ # - The rule drops all new connections that goes over the configured limit.\n\ ct state new add @connection_limit { ip6 saddr ct count over <> } counter name connection_limit_v6_counter drop\n\ - icmpv6 type { destination-unreachable, packet-too-big, 
time-exceeded, echo-request, echo-reply, nd-router-advert, nd-neighbor-solicit, nd-neighbor-advert } accept\n\ - ct state { invalid } drop\n\ - ct state { established, related } accept\n\ - ip6 saddr { {{ ipv6_prefix }} } ct state { new } tcp dport { 7070, 9091, 9100, 9324, 19531, 19100, 19522 } accept\n\ - ip6 saddr { ::-ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff } ct state new tcp dport 443 accept\n\ -\n\ + icmpv6 type $icmp_v6_types_accept accept\n\ + # DHCPv6\n\ + udp dport { 546 } accept\n\ + # TCP ports required for GuestOS functionality\n\ + ip6 saddr { {{ ipv6_prefix }} } ct state { new } tcp dport { 7070, 9090, 9091, 9100, 9314, 19531, 19100, 19522 } accept\n\ + # Allow access from HostOS metrics-proxy so GuestOS metrics-proxy can proxy certain metrics to HostOS\n\ + ip6 saddr { hostos } ct state { new } tcp dport { 42372 } accept\n\ + ip6 saddr { ::-ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff } ct state new tcp dport 80 accept\n\ + # Custom templated rules\n\ <>\n\ <>\n\ + log prefix \"Drop - default policy: \"\n\ }\n\ \n\ chain FORWARD {\n\ @@ -434,53 +454,51 @@ table ip6 filter {\n\ }\n\ \n\ chain OUTPUT {\n\ - type filter hook output priority 0; policy drop;\n\ - oif \"lo\" accept\n\ - icmpv6 type { destination-unreachable, packet-too-big, time-exceeded, echo-request, echo-reply, nd-router-solicit, nd-neighbor-solicit, nd-neighbor-advert } accept\n\ - ct state invalid drop\n\ - ct state { established, related } accept\n\ - ct state new tcp dport { 53, 80, 443, 4460, 8080 } accept\n\ - ct state new udp dport { 53, 123 } accept\n\ + type filter hook output priority 0; policy accept;\n\ + meta skuid ic-http-adapter fib daddr type local ct state { new } tcp dport { 1-19999 } reject # Block restricted local addresses ic-http-adapter HTTPS access\n\ + meta skuid ic-http-adapter ip6 daddr { 2a00:fb01:400:42::/64, 2602:fb2b:110::/48, 2602:fb2b:100::/48, 2602:fb2b:120::/48 } ct state { new } tcp dport { 1-19999 } reject # Block restricted outbound ic-http-adapter 
HTTPS access\n\ + <>\n\ }\n\ }\n", ipv4_tcp_rule_template: "ip saddr {<>} ct state { new } tcp dport {<>} <> # <>", - ipv4_udp_rule_template: "ip saddr {<>} udp dport {<>} <> # <>", ipv6_tcp_rule_template: "ip6 saddr {<>} ct state { new } tcp dport {<>} <> # <>", + ipv4_udp_rule_template: "ip saddr {<>} udp dport {<>} <> # <>", ipv6_udp_rule_template: "ip6 saddr {<>} udp dport {<>} <> # <>", + ipv4_user_output_rule_template: "meta skuid <> ip daddr {<>} ct state { new } tcp dport {<>} <> # <>", + ipv6_user_output_rule_template: "meta skuid <> ip6 daddr {<>} ct state { new } tcp dport {<>} <> # <>", default_rules: [{ - ipv4_prefixes: [], - ipv6_prefixes: [ - "2602:fb2b:120::/48", - "2602:fb2b:100::/48", - "2602:fb2b:110::/48", - "2600:c00:2:100::/64", - "2001:4c08:2003:b09::/64", - "2600:3007:4401::/48", - "2a00:fb01:400::/56", - "2a00:fb01:400:200::/64", - "2a05:d01c:e2c:a700::/56", - "2a05:d01c:d9:2b00::/56", - ], - ports: [22, 7070, 9091, 9100, 9324, 19531], - action: 1, - comment: "Default rule from template", - direction: null, + ipv4_prefixes: [], + ipv6_prefixes: [ + "2602:fb2b:120::/48", + "2602:fb2b:100::/48", + "2602:fb2b:110::/48", + "2600:c00:2:100::/64", + "2001:4c08:2003:b09::/64", + "2600:3007:4401::/48", + "2a00:fb01:400::/56", + "2a00:fb01:400:200::/64", + "2a05:d01c:e2c:a700::/56", + "2a05:d01c:d9:2b00::/56", + ], + // TODO: Need proposal to apply opening port 9314 + ports: [22, 2497, 4100, 7070, 8080, 9090, 9091, 9100, 9314, 19100, 19523, 19531], + action: 1, + comment: "Default rule from template", + direction: 1, }], - max_simultaneous_connections_per_ip_address: 400, + whitelisted_nodes_tcp_ports_whitelist: [22, 8080], + whitelisted_nodes_udp_ports_whitelist: [4100], + all_nodes_tcp_ports_whitelist: [2497], + all_nodes_udp_ports_whitelist: [], + ports_for_http_adapter_blacklist: [22, 2497, 4100, 7070, 8080, 9090, 9091, 9100, 9314, 9324, 19100, 19522, 19523, 19531], + max_simultaneous_connections_per_ip_address: 1000, }, - cloud_engine_firewall: 
{ + boundary_node_firewall: { config_file: "/run/ic-node/nftables-ruleset/nftables.conf", file_template: "flush ruleset\n\ \n\ table filter {\n\ - define icmp_v4_types_accept = {\n\ - destination-unreachable,\n\ - time-exceeded,\n\ - echo-request,\n\ - echo-reply,\n\ - }\n\ -\n\ set rate_limit {\n\ type ipv4_addr\n\ size 65535\n\ @@ -505,18 +523,18 @@ table filter {\n\ type filter hook input priority 0; policy drop;\n\ iif lo accept\n\ ip saddr @blackhole drop\n\ - ct state new add @rate_limit { ip saddr limit rate over 1000/minute burst 500 packets } counter name rate_limit_v4_counter drop\n\ + ct state new add @rate_limit { ip saddr limit rate over 2000/minute burst 1000 packets } counter name rate_limit_v4_counter drop\n\ # Notes about the rule below:\n\ # - The rule allows a maximum of <> persistent connections to any ip address.\n\ # - The rule drops all new connections that goes over the configured limit.\n\ ct state new add @connection_limit { ip saddr ct count over <> } counter name connection_limit_v4_counter drop\n\ - icmp type $icmp_v4_types_accept accept\n\ + icmp type { echo-reply, destination-unreachable, source-quench, echo-request, time-exceeded } accept\n\ + ct state invalid drop\n\ + ct state { established, related } accept\n\ + ip saddr { 0.0.0.0-255.255.255.255 } ct state new tcp dport 443 accept\n\ +\n\ <>\n\ <>\n\ - ct state { invalid } drop\n\ - ct state { established, related } accept\n\ - ip saddr { 0.0.0.0-255.255.255.255 } ct state new tcp dport 80 accept\n\ - log prefix \"Drop - default policy: \"\n\ }\n\ \n\ chain FORWARD {\n\ @@ -524,24 +542,17 @@ table filter {\n\ }\n\ \n\ chain OUTPUT {\n\ - type filter hook output priority 0; policy accept;\n\ - meta skuid ic-http-adapter ip daddr { 127.0.0.0/8 } ct state { new } tcp dport { 1-19999 } reject # Block restricted localhost ic-http-adapter HTTPS access\n\ - <>\n\ + type filter hook output priority 0; policy drop;\n\ + oif \"lo\" accept\n\ + icmp type { echo-reply, 
destination-unreachable, source-quench, echo-request, time-exceeded } accept\n\ + ct state invalid drop\n\ + ct state { established, related } accept\n\ + ct state new tcp dport { 53, 80, 443, 8080 } accept\n\ + ct state new udp dport { 53, 123 } accept\n\ }\n\ }\n\ \n\ table ip6 filter {\n\ - define icmp_v6_types_accept = {\n\ - destination-unreachable,\n\ - packet-too-big,\n\ - time-exceeded,\n\ - echo-request,\n\ - echo-reply,\n\ - nd-router-advert,\n\ - nd-neighbor-solicit,\n\ - nd-neighbor-advert,\n\ - }\n\ -\n\ set rate_limit {\n\ type ipv6_addr\n\ size 65535\n\ @@ -566,25 +577,19 @@ table ip6 filter {\n\ type filter hook input priority 0; policy drop;\n\ iif lo accept\n\ ip6 saddr @blackhole6 drop\n\ - ct state { invalid } drop\n\ - ct state { established, related } accept\n\ - ct state new add @rate_limit { ip6 saddr limit rate over 1000/minute burst 500 packets } counter name rate_limit_v6_counter drop\n\ + ct state new add @rate_limit { ip6 saddr limit rate over 2000/minute burst 1000 packets } counter name rate_limit_v6_counter drop\n\ # Notes about the rule below:\n\ # - The rule allows a maximum of <> persistent connections to any ip6 address.\n\ # - The rule drops all new connections that goes over the configured limit.\n\ ct state new add @connection_limit { ip6 saddr ct count over <> } counter name connection_limit_v6_counter drop\n\ - icmpv6 type $icmp_v6_types_accept accept\n\ - # DHCPv6\n\ - udp dport { 546 } accept\n\ - # TCP ports required for GuestOS functionality\n\ - ip6 saddr { {{ ipv6_prefix }} } ct state { new } tcp dport { 7070, 9090, 9091, 9100, 9324, 19531, 19100, 19522 } accept\n\ - # Allow access from HostOS metrics-proxy so GuestOS metrics-proxy can proxy certain metrics to HostOS\n\ - ip6 saddr { hostos } ct state { new } tcp dport { 42372 } accept\n\ - ip6 saddr { ::-ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff } ct state new tcp dport 80 accept\n\ - # Custom templated rules\n\ + icmpv6 type { destination-unreachable, packet-too-big, 
time-exceeded, echo-request, echo-reply, nd-router-advert, nd-neighbor-solicit, nd-neighbor-advert } accept\n\ + ct state { invalid } drop\n\ + ct state { established, related } accept\n\ + ip6 saddr { {{ ipv6_prefix }} } ct state { new } tcp dport { 7070, 9091, 9100, 9324, 19531, 19100, 19522 } accept\n\ + ip6 saddr { ::-ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff } ct state new tcp dport 443 accept\n\ +\n\ <>\n\ <>\n\ - log prefix \"Drop - default policy: \"\n\ }\n\ \n\ chain FORWARD {\n\ @@ -592,43 +597,39 @@ table ip6 filter {\n\ }\n\ \n\ chain OUTPUT {\n\ - type filter hook output priority 0; policy accept;\n\ - meta skuid ic-http-adapter fib daddr type local ct state { new } tcp dport { 1-19999 } reject # Block restricted local addresses ic-http-adapter HTTPS access\n\ - meta skuid ic-http-adapter ip6 daddr { 2a00:fb01:400:42::/64, 2602:fb2b:110::/48, 2602:fb2b:100::/48, 2602:fb2b:120::/48 } ct state { new } tcp dport { 1-19999 } reject # Block restricted outbound ic-http-adapter HTTPS access\n\ - <>\n\ + type filter hook output priority 0; policy drop;\n\ + oif \"lo\" accept\n\ + icmpv6 type { destination-unreachable, packet-too-big, time-exceeded, echo-request, echo-reply, nd-router-solicit, nd-neighbor-solicit, nd-neighbor-advert } accept\n\ + ct state invalid drop\n\ + ct state { established, related } accept\n\ + ct state new tcp dport { 53, 80, 443, 4460, 8080 } accept\n\ + ct state new udp dport { 53, 123 } accept\n\ }\n\ }\n", ipv4_tcp_rule_template: "ip saddr {<>} ct state { new } tcp dport {<>} <> # <>", - ipv6_tcp_rule_template: "ip6 saddr {<>} ct state { new } tcp dport {<>} <> # <>", ipv4_udp_rule_template: "ip saddr {<>} udp dport {<>} <> # <>", + ipv6_tcp_rule_template: "ip6 saddr {<>} ct state { new } tcp dport {<>} <> # <>", ipv6_udp_rule_template: "ip6 saddr {<>} udp dport {<>} <> # <>", - ipv4_user_output_rule_template: "meta skuid <> ip daddr {<>} ct state { new } tcp dport {<>} <> # <>", - ipv6_user_output_rule_template: "meta skuid <> ip6 
daddr {<>} ct state { new } tcp dport {<>} <> # <>", default_rules: [{ - ipv4_prefixes: [], - ipv6_prefixes: [ - "2602:fb2b:120::/48", - "2602:fb2b:100::/48", - "2602:fb2b:110::/48", - "2600:c00:2:100::/64", - "2001:4c08:2003:b09::/64", - "2600:3007:4401::/48", - "2a00:fb01:400::/56", - "2a00:fb01:400:200::/64", - "2a05:d01c:e2c:a700::/56", - "2a05:d01c:d9:2b00::/56", - ], - ports: [22, 2497, 4100, 7070, 8080, 9090, 9091, 9100, 9324, 19100, 19523, 19531], - action: 1, - comment: "Default rule from template", - direction: 1, + ipv4_prefixes: [], + ipv6_prefixes: [ + "2602:fb2b:120::/48", + "2602:fb2b:100::/48", + "2602:fb2b:110::/48", + "2600:c00:2:100::/64", + "2001:4c08:2003:b09::/64", + "2600:3007:4401::/48", + "2a00:fb01:400::/56", + "2a00:fb01:400:200::/64", + "2a05:d01c:e2c:a700::/56", + "2a05:d01c:d9:2b00::/56", + ], + ports: [22, 7070, 9091, 9100, 9324, 19531], + action: 1, + comment: "Default rule from template", + direction: null, }], - whitelisted_nodes_tcp_ports_whitelist: [22, 8080], - whitelisted_nodes_udp_ports_whitelist: [4100], - all_nodes_tcp_ports_whitelist: [2497], - all_nodes_udp_ports_whitelist: [], - ports_for_http_adapter_blacklist: [22, 2497, 4100, 7070, 8080, 9090, 9091, 9100, 19100, 19523, 19531], - max_simultaneous_connections_per_ip_address: 1000, + max_simultaneous_connections_per_ip_address: 400, }, registration: { diff --git a/rs/orchestrator/testdata/nftables_assigned_cloud_engine.conf.golden b/rs/orchestrator/testdata/nftables_assigned_cloud_engine.conf.golden index 88d8d33b72b5..ce7fc9b2d210 100644 --- a/rs/orchestrator/testdata/nftables_assigned_cloud_engine.conf.golden +++ b/rs/orchestrator/testdata/nftables_assigned_cloud_engine.conf.golden @@ -58,7 +58,7 @@ ip saddr {6.6.6.6} ct state { new } tcp dport {1006} accept # global chain OUTPUT { type filter hook output priority 0; policy accept; meta skuid ic-http-adapter ip daddr { 127.0.0.0/8 } ct state { new } tcp dport { 1-19999 } reject # Block restricted localhost 
ic-http-adapter HTTPS access - meta skuid ic-http-adapter ip daddr {1.1.1.1,3.0.0.3,3.0.0.4,3.0.0.5,3.0.0.6,3.0.0.7,4.0.0.4,4.0.0.5,4.0.0.6,4.0.0.7} ct state { new } tcp dport {22,2497,4100,7070,8080,9090,9091,9100,19100,19523,19531} reject # Automatic blacklisting for ic-http-adapter + meta skuid ic-http-adapter ip daddr {1.1.1.1,3.0.0.3,3.0.0.4,3.0.0.5,3.0.0.6,3.0.0.7,4.0.0.4,4.0.0.5,4.0.0.6,4.0.0.7} ct state { new } tcp dport {22,2497,4100,7070,8080,9090,9091,9100,9314,9324,19100,19522,19523,19531} reject # Automatic blacklisting for ic-http-adapter } } @@ -109,7 +109,7 @@ table ip6 filter { # DHCPv6 udp dport { 546 } accept # TCP ports required for GuestOS functionality - ip6 saddr { ::/64 } ct state { new } tcp dport { 7070, 9090, 9091, 9100, 9324, 19531, 19100, 19522 } accept + ip6 saddr { ::/64 } ct state { new } tcp dport { 7070, 9090, 9091, 9100, 9314, 19531, 19100, 19522 } accept # Allow access from HostOS metrics-proxy so GuestOS metrics-proxy can proxy certain metrics to HostOS ip6 saddr { hostos } ct state { new } tcp dport { 42372 } accept ip6 saddr { ::-ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff } ct state new tcp dport 80 accept @@ -132,6 +132,6 @@ ip6 saddr {::ffff:6.6.6.6} ct state { new } tcp dport {1006} accept # global type filter hook output priority 0; policy accept; meta skuid ic-http-adapter fib daddr type local ct state { new } tcp dport { 1-19999 } reject # Block restricted local addresses ic-http-adapter HTTPS access meta skuid ic-http-adapter ip6 daddr { 2a00:fb01:400:42::/64, 2602:fb2b:110::/48, 2602:fb2b:100::/48, 2602:fb2b:120::/48 } ct state { new } tcp dport { 1-19999 } reject # Block restricted outbound ic-http-adapter HTTPS access - meta skuid ic-http-adapter ip6 daddr {2001:db8:85a3::8a2e:1370:7334,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e5,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e6,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e7,a4c2:7f91:3db6:1e8c:5a4f:cc92:b37:6e41} ct state { new } tcp dport 
{22,2497,4100,7070,8080,9090,9091,9100,19100,19523,19531} reject # Automatic blacklisting for ic-http-adapter + meta skuid ic-http-adapter ip6 daddr {2001:db8:85a3::8a2e:1370:7334,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e5,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e6,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e7,a4c2:7f91:3db6:1e8c:5a4f:cc92:b37:6e41} ct state { new } tcp dport {22,2497,4100,7070,8080,9090,9091,9100,9314,9324,19100,19522,19523,19531} reject # Automatic blacklisting for ic-http-adapter } } diff --git a/rs/orchestrator/testdata/nftables_assigned_replica.conf.golden b/rs/orchestrator/testdata/nftables_assigned_replica.conf.golden index 9c70a94fe706..746f89d4f9d2 100644 --- a/rs/orchestrator/testdata/nftables_assigned_replica.conf.golden +++ b/rs/orchestrator/testdata/nftables_assigned_replica.conf.golden @@ -52,7 +52,7 @@ ip saddr {6.6.6.6} ct state { new } tcp dport {1006} accept # global chain OUTPUT { type filter hook output priority 0; policy accept; meta skuid ic-http-adapter ip daddr { 127.0.0.0/8 } ct state { new } tcp dport { 1-19999 } reject # Block restricted localhost ic-http-adapter HTTPS access - meta skuid ic-http-adapter ip daddr {1.1.1.1,3.0.0.3,3.0.0.4,3.0.0.5,3.0.0.6,3.0.0.7,4.0.0.4,4.0.0.5,4.0.0.6,4.0.0.7} ct state { new } tcp dport {22,2497,4100,7070,8080,9090,9091,9100,19100,19523,19531} reject # Automatic blacklisting for ic-http-adapter + meta skuid ic-http-adapter ip daddr {1.1.1.1,3.0.0.3,3.0.0.4,3.0.0.5,3.0.0.6,3.0.0.7,4.0.0.4,4.0.0.5,4.0.0.6,4.0.0.7} ct state { new } tcp dport {22,2497,4100,7070,8080,9090,9091,9100,9314,9324,19100,19522,19523,19531} reject # Automatic blacklisting for ic-http-adapter } } @@ -119,6 +119,6 @@ ip6 saddr {::ffff:6.6.6.6} ct state { new } tcp dport {1006} accept # global type filter hook output priority 0; policy accept; meta skuid ic-http-adapter fib daddr type local ct state { new } tcp dport { 1-19999 } reject # Block restricted local addresses ic-http-adapter HTTPS access meta skuid ic-http-adapter ip6 
daddr { 2a00:fb01:400:42::/64, 2602:fb2b:110::/48, 2602:fb2b:100::/48, 2602:fb2b:120::/48 } ct state { new } tcp dport { 1-19999 } reject # Block restricted outbound ic-http-adapter HTTPS access - meta skuid ic-http-adapter ip6 daddr {2001:db8:85a3::8a2e:1370:7334,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e5,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e6,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e7,a4c2:7f91:3db6:1e8c:5a4f:cc92:b37:6e41} ct state { new } tcp dport {22,2497,4100,7070,8080,9090,9091,9100,19100,19523,19531} reject # Automatic blacklisting for ic-http-adapter + meta skuid ic-http-adapter ip6 daddr {2001:db8:85a3::8a2e:1370:7334,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e5,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e6,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e7,a4c2:7f91:3db6:1e8c:5a4f:cc92:b37:6e41} ct state { new } tcp dport {22,2497,4100,7070,8080,9090,9091,9100,9314,9324,19100,19522,19523,19531} reject # Automatic blacklisting for ic-http-adapter } } diff --git a/rs/orchestrator/testdata/nftables_unassigned_cloud_engine.conf.golden b/rs/orchestrator/testdata/nftables_unassigned_cloud_engine.conf.golden index c816c5e3ba9b..f643312371a6 100644 --- a/rs/orchestrator/testdata/nftables_unassigned_cloud_engine.conf.golden +++ b/rs/orchestrator/testdata/nftables_unassigned_cloud_engine.conf.golden @@ -51,7 +51,7 @@ ip saddr {6.6.6.6} ct state { new } tcp dport {1006} accept # global chain OUTPUT { type filter hook output priority 0; policy accept; meta skuid ic-http-adapter ip daddr { 127.0.0.0/8 } ct state { new } tcp dport { 1-19999 } reject # Block restricted localhost ic-http-adapter HTTPS access - meta skuid ic-http-adapter ip daddr {1.1.1.1,3.0.0.3,3.0.0.4,3.0.0.5,3.0.0.6,3.0.0.7,4.0.0.4,4.0.0.5,4.0.0.6,4.0.0.7} ct state { new } tcp dport {22,2497,4100,7070,8080,9090,9091,9100,19100,19523,19531} reject # Automatic blacklisting for ic-http-adapter + meta skuid ic-http-adapter ip daddr {1.1.1.1,3.0.0.3,3.0.0.4,3.0.0.5,3.0.0.6,3.0.0.7,4.0.0.4,4.0.0.5,4.0.0.6,4.0.0.7} ct state { new } 
tcp dport {22,2497,4100,7070,8080,9090,9091,9100,9314,9324,19100,19522,19523,19531} reject # Automatic blacklisting for ic-http-adapter } } @@ -117,6 +117,6 @@ ip6 saddr {::ffff:6.6.6.6} ct state { new } tcp dport {1006} accept # global type filter hook output priority 0; policy accept; meta skuid ic-http-adapter fib daddr type local ct state { new } tcp dport { 1-19999 } reject # Block restricted local addresses ic-http-adapter HTTPS access meta skuid ic-http-adapter ip6 daddr { 2a00:fb01:400:42::/64, 2602:fb2b:110::/48, 2602:fb2b:100::/48, 2602:fb2b:120::/48 } ct state { new } tcp dport { 1-19999 } reject # Block restricted outbound ic-http-adapter HTTPS access - meta skuid ic-http-adapter ip6 daddr {2001:db8:85a3::8a2e:1370:7334,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e5,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e6,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e7,a4c2:7f91:3db6:1e8c:5a4f:cc92:b37:6e41} ct state { new } tcp dport {22,2497,4100,7070,8080,9090,9091,9100,19100,19523,19531} reject # Automatic blacklisting for ic-http-adapter + meta skuid ic-http-adapter ip6 daddr {2001:db8:85a3::8a2e:1370:7334,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e5,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e6,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e7,a4c2:7f91:3db6:1e8c:5a4f:cc92:b37:6e41} ct state { new } tcp dport {22,2497,4100,7070,8080,9090,9091,9100,9314,9324,19100,19522,19523,19531} reject # Automatic blacklisting for ic-http-adapter } } diff --git a/rs/orchestrator/testdata/nftables_unassigned_replica.conf.golden b/rs/orchestrator/testdata/nftables_unassigned_replica.conf.golden index ef1c0e4869e8..4424486dd75c 100644 --- a/rs/orchestrator/testdata/nftables_unassigned_replica.conf.golden +++ b/rs/orchestrator/testdata/nftables_unassigned_replica.conf.golden @@ -51,7 +51,7 @@ ip saddr {6.6.6.6} ct state { new } tcp dport {1006} accept # global chain OUTPUT { type filter hook output priority 0; policy accept; meta skuid ic-http-adapter ip daddr { 127.0.0.0/8 } ct state { new } tcp dport { 1-19999 } 
reject # Block restricted localhost ic-http-adapter HTTPS access - meta skuid ic-http-adapter ip daddr {1.1.1.1,3.0.0.3,3.0.0.4,3.0.0.5,3.0.0.6,3.0.0.7,4.0.0.4,4.0.0.5,4.0.0.6,4.0.0.7} ct state { new } tcp dport {22,2497,4100,7070,8080,9090,9091,9100,19100,19523,19531} reject # Automatic blacklisting for ic-http-adapter + meta skuid ic-http-adapter ip daddr {1.1.1.1,3.0.0.3,3.0.0.4,3.0.0.5,3.0.0.6,3.0.0.7,4.0.0.4,4.0.0.5,4.0.0.6,4.0.0.7} ct state { new } tcp dport {22,2497,4100,7070,8080,9090,9091,9100,9314,9324,19100,19522,19523,19531} reject # Automatic blacklisting for ic-http-adapter } } @@ -117,6 +117,6 @@ ip6 saddr {::ffff:6.6.6.6} ct state { new } tcp dport {1006} accept # global type filter hook output priority 0; policy accept; meta skuid ic-http-adapter fib daddr type local ct state { new } tcp dport { 1-19999 } reject # Block restricted local addresses ic-http-adapter HTTPS access meta skuid ic-http-adapter ip6 daddr { 2a00:fb01:400:42::/64, 2602:fb2b:110::/48, 2602:fb2b:100::/48, 2602:fb2b:120::/48 } ct state { new } tcp dport { 1-19999 } reject # Block restricted outbound ic-http-adapter HTTPS access - meta skuid ic-http-adapter ip6 daddr {2001:db8:85a3::8a2e:1370:7334,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e5,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e6,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e7,a4c2:7f91:3db6:1e8c:5a4f:cc92:b37:6e41} ct state { new } tcp dport {22,2497,4100,7070,8080,9090,9091,9100,19100,19523,19531} reject # Automatic blacklisting for ic-http-adapter + meta skuid ic-http-adapter ip6 daddr {2001:db8:85a3::8a2e:1370:7334,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e5,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e6,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e7,a4c2:7f91:3db6:1e8c:5a4f:cc92:b37:6e41} ct state { new } tcp dport {22,2497,4100,7070,8080,9090,9091,9100,9314,9324,19100,19522,19523,19531} reject # Automatic blacklisting for ic-http-adapter } } From 852e89171904471eaf05a440846724d48f8e365f Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Wed, 17 Jun 2026 
15:20:50 +0000 Subject: [PATCH 17/42] feat: stop ic-gateway first --- rs/orchestrator/src/processes.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rs/orchestrator/src/processes.rs b/rs/orchestrator/src/processes.rs index 50dcf208b7a4..8d860cffb9f7 100644 --- a/rs/orchestrator/src/processes.rs +++ b/rs/orchestrator/src/processes.rs @@ -480,8 +480,8 @@ impl MultipleProcessesManager { /// Stop every managed process. pub(crate) fn stop_all(&mut self) -> OrchestratorResult<()> { - self.replica_manager.stop()?; self.ic_gateway_manager.stop()?; + self.replica_manager.stop()?; Ok(()) } From 9295ec5ac4bc286737dd5de8defe9f54841adecc Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Thu, 18 Jun 2026 08:34:48 +0000 Subject: [PATCH 18/42] fix: fix image size estimates --- ic-os/guestos/envs/prod/BUILD.bazel | 6 +++--- ic-os/guestos/envs/recovery/BUILD.bazel | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ic-os/guestos/envs/prod/BUILD.bazel b/ic-os/guestos/envs/prod/BUILD.bazel index decf8bbc02c9..d1b0c0c2ff28 100644 --- a/ic-os/guestos/envs/prod/BUILD.bazel +++ b/ic-os/guestos/envs/prod/BUILD.bazel @@ -18,19 +18,19 @@ icos_images = icos_build( file_size_check( name = "disk_img_size_check", file = icos_images.disk_image, - max_file_size = 450 * 1000 * 1000, # 419 MB on 2025-03-21 + max_file_size = 475 * 1000 * 1000, # 453 MB on 2026-06-18 ) file_size_check( name = "update_img_size_check", file = icos_images.update_image, - max_file_size = 450 * 1000 * 1000, # 416 MB on 2025-03-21 + max_file_size = 475 * 1000 * 1000, # 451 MB on 2026-06-18 ) file_size_check( name = "update_img_test_size_check", file = icos_images.update_image_test, - max_file_size = 450 * 1000 * 1000, # 417 MB on 2025-06-26 + max_file_size = 475 * 1000 * 1000, # 451 MB on 2026-06-18 ) # Export checksums & build artifacts diff --git a/ic-os/guestos/envs/recovery/BUILD.bazel b/ic-os/guestos/envs/recovery/BUILD.bazel index 00d32b16d7fc..ea84ce367abf 100644 --- 
a/ic-os/guestos/envs/recovery/BUILD.bazel +++ b/ic-os/guestos/envs/recovery/BUILD.bazel @@ -21,7 +21,7 @@ icos_images = icos_build( file_size_check( name = "disk_img_size_check", file = icos_images.disk_image, - max_file_size = 450 * 1000 * 1000, # 419 MB on 2025-06-26 + max_file_size = 475 * 1000 * 1000, # 453 MB on 2026-06-18 tags = [ "manual", "no-cache", @@ -31,7 +31,7 @@ file_size_check( file_size_check( name = "update_img_size_check", file = icos_images.update_image, - max_file_size = 450 * 1000 * 1000, # 417 MB on 2025-06-26 + max_file_size = 475 * 1000 * 1000, # 451 MB on 2026-06-18 tags = [ "manual", "no-cache", @@ -41,7 +41,7 @@ file_size_check( file_size_check( name = "update_img_test_size_check", file = icos_images.update_image_test, - max_file_size = 450 * 1000 * 1000, # 417 MB on 2025-06-26 + max_file_size = 475 * 1000 * 1000, # 451 MB on 2026-06-18 tags = [ "manual", "no-cache", From 78de3f9f2db89a3df51ae42dd0ee58ede906fd40 Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Thu, 18 Jun 2026 11:44:00 +0000 Subject: [PATCH 19/42] fix: spawn child as leader of new process group --- rs/orchestrator/src/process_manager.rs | 32 +++++++++++++++++++------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/rs/orchestrator/src/process_manager.rs b/rs/orchestrator/src/process_manager.rs index 710c560d7b30..409f1b1b1ba4 100644 --- a/rs/orchestrator/src/process_manager.rs +++ b/rs/orchestrator/src/process_manager.rs @@ -8,6 +8,7 @@ use std::{ ffi::OsString, fmt::Debug, io::Result, + os::unix::process::CommandExt, path::PathBuf, sync::{Arc, Mutex}, }; @@ -104,14 +105,19 @@ impl SingleProcessRunner

{ /// running, a log message is printed. /// /// It is critical that we signal and terminate the whole - /// process group of which the [`Process`] should be the - /// leader. The process may spawn other - /// sub-processes under the same process group. For correctness - /// -- the processes may access state file paths and - /// handles -- it is important we signal the sub-processes - /// processes too. This is possible because we shall - /// restrict setgpid() in production -- by default disabled - /// by SELinux type enforcement. + /// process group of which the [`Process`] is the leader. The + /// process may spawn other sub-processes under the same process + /// group. For correctness -- the processes may access state file + /// paths and handles -- it is important we signal the sub-processes + /// too. + /// + /// We guarantee that the [`Process`] is its own process group leader + /// (so its PID equals its PGID, which is what the negation below + /// relies on) by setting its process group at spawn time via + /// `Command::process_group(0)` -- see `start`. We therefore do not + /// rely on the managed binary calling `setpgid` itself. Sub-processes + /// cannot escape the group because `setpgid` is restricted in + /// production by SELinux type enforcement. /// /// We still depend on init to handle reaping of adopted children, /// as the orchestrator has no way of adopting or even knowing the @@ -179,6 +185,16 @@ impl ProcessRunner

for SingleProcessRunner

{ let child = std::process::Command::new(process.get_binary()) .args(process.get_args()) .envs(process.get_env()) + // Put the child into a new process group of which it is the + // leader (PGID == PID). Any sub-processes it spawns inherit this + // group, which lets `kill()` reliably signal the whole group by + // negating the PID. We establish the group here, in the + // orchestrator, rather than relying on each managed binary to + // call `setpgid` itself. This is equivalent to `setpgid(0, 0)` + // run in the forked child before `exec`, while it is still in + // the orchestrator's SELinux domain -- which is permitted to set + // its own process group. + .process_group(0) .spawn()?; debug!(self.log, "Process started. Pid: {}", child.id()); self.set_pid(Pid::from_raw(child.id() as i32)); From 20e0162f4e21d79eeb9ded0c46cae7107f85f166 Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Thu, 18 Jun 2026 11:44:19 +0000 Subject: [PATCH 20/42] docs: add TODO --- rs/orchestrator/src/processes.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/rs/orchestrator/src/processes.rs b/rs/orchestrator/src/processes.rs index 8d860cffb9f7..f7946e97e14d 100644 --- a/rs/orchestrator/src/processes.rs +++ b/rs/orchestrator/src/processes.rs @@ -316,6 +316,7 @@ impl IcBoundaryManager { } } + // TODO: consider returning an error when thing fail instead of logging pub(crate) fn ensure_ic_boundary_running_and_restarted_on_domain_change( &mut self, replica_version: ReplicaVersion, From dd9deff62b855ac04529bb614f94acceac06ab62 Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Thu, 18 Jun 2026 12:31:55 +0000 Subject: [PATCH 21/42] feat: firewall changes in separate PR --- rs/config/src/config.rs | 9 - rs/config/src/firewall.rs | 51 ------ .../config/tool/templates/ic.json5.template | 165 +----------------- rs/orchestrator/src/firewall.rs | 128 +------------- rs/orchestrator/src/orchestrator.rs | 1 - rs/orchestrator/src/registry_helper.rs | 6 +- rs/orchestrator/src/upgrade.rs | 6 +- 
...nftables_assigned_cloud_engine.conf.golden | 21 +-- .../nftables_assigned_replica.conf.golden | 4 +- ...tables_unassigned_cloud_engine.conf.golden | 4 +- .../nftables_unassigned_replica.conf.golden | 4 +- rs/registry/helpers/src/node.rs | 26 +-- 12 files changed, 34 insertions(+), 391 deletions(-) diff --git a/rs/config/src/config.rs b/rs/config/src/config.rs index ddb18c6eb17e..60c3f4d8b0a1 100644 --- a/rs/config/src/config.rs +++ b/rs/config/src/config.rs @@ -11,7 +11,6 @@ use crate::{ crypto::CryptoConfig, execution_environment::Config as HypervisorConfig, firewall::BoundaryNodeConfig as BoundaryNodeFirewallConfig, - firewall::CloudEngineConfig as CloudEngineFirewallConfig, firewall::ReplicaConfig as ReplicaFirewallConfig, http_handler::Config as HttpHandlerConfig, initial_ipv4_config::IPv4Config, @@ -52,7 +51,6 @@ pub struct Config { pub message_routing: MessageRoutingConfig, pub malicious_behavior: MaliciousBehavior, pub firewall: ReplicaFirewallConfig, - pub cloud_engine_firewall: CloudEngineFirewallConfig, pub boundary_node_firewall: BoundaryNodeFirewallConfig, pub registration: RegistrationConfig, pub nns_registry_replicator: NnsRegistryReplicatorConfig, @@ -81,7 +79,6 @@ pub struct ConfigOptional { pub message_routing: Option, pub malicious_behavior: Option, pub firewall: Option, - pub cloud_engine_firewall: Option, pub boundary_node_firewall: Option, pub registration: Option, pub nns_registry_replicator: Option, @@ -115,9 +112,6 @@ impl Config { message_routing: MessageRoutingConfig::default(), malicious_behavior: MaliciousBehavior::default(), firewall: ReplicaFirewallConfig::new(parent_dir.join("replica_firewall")), - cloud_engine_firewall: CloudEngineFirewallConfig::new( - parent_dir.join("cloud_engine_firewall"), - ), boundary_node_firewall: BoundaryNodeFirewallConfig::new( parent_dir.join("boundary_node_firewall"), ), @@ -173,9 +167,6 @@ impl Config { message_routing: cfg.message_routing.unwrap_or(default.message_routing), malicious_behavior: 
cfg.malicious_behavior.unwrap_or(default.malicious_behavior), firewall: cfg.firewall.unwrap_or(default.firewall), - cloud_engine_firewall: cfg - .cloud_engine_firewall - .unwrap_or(default.cloud_engine_firewall), boundary_node_firewall: cfg .boundary_node_firewall .unwrap_or(default.boundary_node_firewall), diff --git a/rs/config/src/firewall.rs b/rs/config/src/firewall.rs index aa341447cec0..c8a0d2c76a3b 100644 --- a/rs/config/src/firewall.rs +++ b/rs/config/src/firewall.rs @@ -58,57 +58,6 @@ impl ReplicaConfig { } } -#[derive(Clone, PartialEq, Debug, Deserialize, Serialize)] -#[serde(rename_all = "snake_case")] -#[cfg_attr(test, derive(Arbitrary))] -pub struct CloudEngineConfig { - /// Path to use for storing state on the file system - #[cfg_attr(test, proptest(strategy = "any::().prop_map(PathBuf::from)"))] - pub config_file: PathBuf, - pub file_template: String, - pub ipv4_tcp_rule_template: String, - pub ipv6_tcp_rule_template: String, - pub ipv4_udp_rule_template: String, - pub ipv6_udp_rule_template: String, - pub ipv4_user_output_rule_template: String, - pub ipv6_user_output_rule_template: String, - #[cfg_attr(test, proptest(strategy = "any::().prop_map(|_x| vec![])"))] - pub default_rules: Vec, - /// Ports opened to whitelisted nodes in the network. - pub whitelisted_nodes_tcp_ports_whitelist: Vec, - pub whitelisted_nodes_udp_ports_whitelist: Vec, - /// Ports opened to all nodes in the network (including non-whitelisted). - pub all_nodes_tcp_ports_whitelist: Vec, - pub all_nodes_udp_ports_whitelist: Vec, - pub ports_for_http_adapter_blacklist: Vec, - /// We allow a maximum of `max_simultaneous_connections_per_ip_address` persistent connections to any ip address. - /// Any ip address with `max_simultaneous_connections_per_ip_address` connections will be dropped if a new connection is attempted. - pub max_simultaneous_connections_per_ip_address: u32, -} - -impl CloudEngineConfig { - /// Create a CloudEngineConfig from a given path to the config file. 
- pub fn new(config_file: PathBuf) -> Self { - Self { - config_file, - file_template: "".to_string(), - ipv4_tcp_rule_template: "".to_string(), - ipv6_tcp_rule_template: "".to_string(), - ipv4_udp_rule_template: "".to_string(), - ipv6_udp_rule_template: "".to_string(), - ipv4_user_output_rule_template: "".to_string(), - ipv6_user_output_rule_template: "".to_string(), - default_rules: vec![], - whitelisted_nodes_tcp_ports_whitelist: vec![], - whitelisted_nodes_udp_ports_whitelist: vec![], - all_nodes_tcp_ports_whitelist: vec![], - all_nodes_udp_ports_whitelist: vec![], - ports_for_http_adapter_blacklist: vec![], - max_simultaneous_connections_per_ip_address: 0, - } - } -} - #[derive(Clone, PartialEq, Debug, Deserialize, Serialize)] #[serde(rename_all = "snake_case")] #[cfg_attr(test, derive(Arbitrary))] diff --git a/rs/ic_os/config/tool/templates/ic.json5.template b/rs/ic_os/config/tool/templates/ic.json5.template index ea6a436e9014..8a863eaac3e8 100644 --- a/rs/ic_os/config/tool/templates/ic.json5.template +++ b/rs/ic_os/config/tool/templates/ic.json5.template @@ -327,170 +327,7 @@ table ip6 filter {\n\ whitelisted_nodes_udp_ports_whitelist: [4100], all_nodes_tcp_ports_whitelist: [2497], all_nodes_udp_ports_whitelist: [], - ports_for_http_adapter_blacklist: [22, 2497, 4100, 7070, 8080, 9090, 9091, 9100, 9314, 9324, 19100, 19522, 19523, 19531], - max_simultaneous_connections_per_ip_address: 1000, - }, - - cloud_engine_firewall: { - config_file: "/run/ic-node/nftables-ruleset/nftables.conf", - file_template: "flush ruleset\n\ -\n\ -table filter {\n\ - define icmp_v4_types_accept = {\n\ - destination-unreachable,\n\ - time-exceeded,\n\ - echo-request,\n\ - echo-reply,\n\ - }\n\ -\n\ - set rate_limit {\n\ - type ipv4_addr\n\ - size 65535\n\ - flags dynamic\n\ - }\n\ -\n\ - set connection_limit {\n\ - type ipv4_addr\n\ - size 65535\n\ - flags dynamic\n\ - }\n\ -\n\ - set blackhole {\n\ - type ipv4_addr\n\ - size 65535\n\ - }\n\ -\n\ - counter rate_limit_v4_counter {}\n\ 
- counter connection_limit_v4_counter {}\n\ -\n\ - chain INPUT {\n\ - type filter hook input priority 0; policy drop;\n\ - iif lo accept\n\ - ip saddr @blackhole drop\n\ - ct state new add @rate_limit { ip saddr limit rate over 1000/minute burst 500 packets } counter name rate_limit_v4_counter drop\n\ - # Notes about the rule below:\n\ - # - The rule allows a maximum of <> persistent connections to any ip address.\n\ - # - The rule drops all new connections that goes over the configured limit.\n\ - ct state new add @connection_limit { ip saddr ct count over <> } counter name connection_limit_v4_counter drop\n\ - icmp type $icmp_v4_types_accept accept\n\ - <>\n\ - <>\n\ - ct state { invalid } drop\n\ - ct state { established, related } accept\n\ - ip saddr { 0.0.0.0-255.255.255.255 } ct state new tcp dport 80 accept\n\ - log prefix \"Drop - default policy: \"\n\ - }\n\ -\n\ - chain FORWARD {\n\ - type filter hook forward priority 0; policy drop;\n\ - }\n\ -\n\ - chain OUTPUT {\n\ - type filter hook output priority 0; policy accept;\n\ - meta skuid ic-http-adapter ip daddr { 127.0.0.0/8 } ct state { new } tcp dport { 1-19999 } reject # Block restricted localhost ic-http-adapter HTTPS access\n\ - <>\n\ - }\n\ -}\n\ -\n\ -table ip6 filter {\n\ - define icmp_v6_types_accept = {\n\ - destination-unreachable,\n\ - packet-too-big,\n\ - time-exceeded,\n\ - echo-request,\n\ - echo-reply,\n\ - nd-router-advert,\n\ - nd-neighbor-solicit,\n\ - nd-neighbor-advert,\n\ - }\n\ -\n\ - set rate_limit {\n\ - type ipv6_addr\n\ - size 65535\n\ - flags dynamic\n\ - }\n\ -\n\ - set connection_limit {\n\ - type ipv6_addr\n\ - size 65535\n\ - flags dynamic\n\ - }\n\ -\n\ - set blackhole6 {\n\ - type ipv6_addr\n\ - size 65535\n\ - }\n\ -\n\ - counter rate_limit_v6_counter {}\n\ - counter connection_limit_v6_counter {}\n\ -\n\ - chain INPUT {\n\ - type filter hook input priority 0; policy drop;\n\ - iif lo accept\n\ - ip6 saddr @blackhole6 drop\n\ - ct state { invalid } drop\n\ - ct state { 
established, related } accept\n\ - ct state new add @rate_limit { ip6 saddr limit rate over 1000/minute burst 500 packets } counter name rate_limit_v6_counter drop\n\ - # Notes about the rule below:\n\ - # - The rule allows a maximum of <> persistent connections to any ip6 address.\n\ - # - The rule drops all new connections that goes over the configured limit.\n\ - ct state new add @connection_limit { ip6 saddr ct count over <> } counter name connection_limit_v6_counter drop\n\ - icmpv6 type $icmp_v6_types_accept accept\n\ - # DHCPv6\n\ - udp dport { 546 } accept\n\ - # TCP ports required for GuestOS functionality\n\ - ip6 saddr { {{ ipv6_prefix }} } ct state { new } tcp dport { 7070, 9090, 9091, 9100, 9314, 19531, 19100, 19522 } accept\n\ - # Allow access from HostOS metrics-proxy so GuestOS metrics-proxy can proxy certain metrics to HostOS\n\ - ip6 saddr { hostos } ct state { new } tcp dport { 42372 } accept\n\ - ip6 saddr { ::-ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff } ct state new tcp dport 80 accept\n\ - # Custom templated rules\n\ - <>\n\ - <>\n\ - log prefix \"Drop - default policy: \"\n\ - }\n\ -\n\ - chain FORWARD {\n\ - type filter hook forward priority 0; policy drop;\n\ - }\n\ -\n\ - chain OUTPUT {\n\ - type filter hook output priority 0; policy accept;\n\ - meta skuid ic-http-adapter fib daddr type local ct state { new } tcp dport { 1-19999 } reject # Block restricted local addresses ic-http-adapter HTTPS access\n\ - meta skuid ic-http-adapter ip6 daddr { 2a00:fb01:400:42::/64, 2602:fb2b:110::/48, 2602:fb2b:100::/48, 2602:fb2b:120::/48 } ct state { new } tcp dport { 1-19999 } reject # Block restricted outbound ic-http-adapter HTTPS access\n\ - <>\n\ - }\n\ -}\n", - ipv4_tcp_rule_template: "ip saddr {<>} ct state { new } tcp dport {<>} <> # <>", - ipv6_tcp_rule_template: "ip6 saddr {<>} ct state { new } tcp dport {<>} <> # <>", - ipv4_udp_rule_template: "ip saddr {<>} udp dport {<>} <> # <>", - ipv6_udp_rule_template: "ip6 saddr {<>} udp dport {<>} <> # 
<>", - ipv4_user_output_rule_template: "meta skuid <> ip daddr {<>} ct state { new } tcp dport {<>} <> # <>", - ipv6_user_output_rule_template: "meta skuid <> ip6 daddr {<>} ct state { new } tcp dport {<>} <> # <>", - default_rules: [{ - ipv4_prefixes: [], - ipv6_prefixes: [ - "2602:fb2b:120::/48", - "2602:fb2b:100::/48", - "2602:fb2b:110::/48", - "2600:c00:2:100::/64", - "2001:4c08:2003:b09::/64", - "2600:3007:4401::/48", - "2a00:fb01:400::/56", - "2a00:fb01:400:200::/64", - "2a05:d01c:e2c:a700::/56", - "2a05:d01c:d9:2b00::/56", - ], - // TODO: Need proposal to apply opening port 9314 - ports: [22, 2497, 4100, 7070, 8080, 9090, 9091, 9100, 9314, 19100, 19523, 19531], - action: 1, - comment: "Default rule from template", - direction: 1, - }], - whitelisted_nodes_tcp_ports_whitelist: [22, 8080], - whitelisted_nodes_udp_ports_whitelist: [4100], - all_nodes_tcp_ports_whitelist: [2497], - all_nodes_udp_ports_whitelist: [], - ports_for_http_adapter_blacklist: [22, 2497, 4100, 7070, 8080, 9090, 9091, 9100, 9314, 9324, 19100, 19522, 19523, 19531], + ports_for_http_adapter_blacklist: [22, 2497, 4100, 7070, 8080, 9090, 9091, 9100, 19100, 19523, 19531], max_simultaneous_connections_per_ip_address: 1000, }, diff --git a/rs/orchestrator/src/firewall.rs b/rs/orchestrator/src/firewall.rs index 25c8e073a9f3..0c32bca8e52f 100644 --- a/rs/orchestrator/src/firewall.rs +++ b/rs/orchestrator/src/firewall.rs @@ -5,8 +5,7 @@ use crate::{ registry_helper::RegistryHelper, }; use ic_config::firewall::{ - BoundaryNodeConfig as BoundaryNodeFirewallConfig, - CloudEngineConfig as CloudEngineFirewallConfig, ReplicaConfig as ReplicaFirewallConfig, + BoundaryNodeConfig as BoundaryNodeFirewallConfig, ReplicaConfig as ReplicaFirewallConfig, }; use ic_logger::{ReplicaLogger, debug, info, warn}; use ic_protobuf::registry::{ @@ -34,7 +33,6 @@ enum DataSource { /// The role of the node in the IC, i.e., whether it is acting as a replica or a boundary node. 
enum Role { AssignedReplica(SubnetId), - AssignedCloudEngine(SubnetId), UnassignedReplica, BoundaryNode, } @@ -50,7 +48,6 @@ pub(crate) struct Firewall { local_cup_reader: LocalCUPReader, logger: ReplicaLogger, replica_config: ReplicaFirewallConfig, - cloud_engine_config: CloudEngineFirewallConfig, boundary_node_config: BoundaryNodeFirewallConfig, compiled_config: String, last_applied_version: Arc>, @@ -65,7 +62,6 @@ impl Firewall { registry: Arc, metrics: Arc, replica_config: ReplicaFirewallConfig, - cloud_engine_config: CloudEngineFirewallConfig, boundary_node_config: BoundaryNodeFirewallConfig, local_cup_reader: LocalCUPReader, logger: ReplicaLogger, @@ -75,7 +71,6 @@ impl Firewall { metrics, local_cup_reader, replica_config, - cloud_engine_config, boundary_node_config, logger, compiled_config: Default::default(), @@ -112,17 +107,11 @@ impl Firewall { let maybe_boundary_node_record = self .registry .get_api_boundary_node_record(self.node_id, registry_version); - let maybe_subnet_id_and_type = self + let maybe_subnet_id = self .registry - .get_subnet_id_and_type_from_node_id(self.node_id, registry_version); - match (maybe_boundary_node_record, maybe_subnet_id_and_type) { - (_, Ok(Some((subnet_id, subnet_type)))) => match subnet_type { - SubnetType::Unspecified - | SubnetType::Application - | SubnetType::System - | SubnetType::VerifiedApplication => Ok(Role::AssignedReplica(subnet_id)), - SubnetType::CloudEngine => Ok(Role::AssignedCloudEngine(subnet_id)), - }, + .get_subnet_id_from_node_id(self.node_id, registry_version); + match (maybe_boundary_node_record, maybe_subnet_id) { + (_, Ok(Some(subnet_id))) => Ok(Role::AssignedReplica(subnet_id)), (Err(OrchestratorError::ApiBoundaryNodeMissingError(_, _)), Ok(None)) => { Ok(Role::UnassignedReplica) } @@ -230,7 +219,6 @@ impl Firewall { | NodeRewardType::Type3dot1 | NodeRewardType::Type1dot1, ) => true, - // TODO(CON-1720): consider accepting only from `Type4*` ( NodeRewardType::Type4 | NodeRewardType::Type4dot1 @@ 
-508,7 +496,7 @@ impl Firewall { let mut udp_rules = Vec::::new(); let firewall_scopes_to_fetch = match role { - Role::AssignedReplica(subnet_id) | Role::AssignedCloudEngine(subnet_id) => vec![ + Role::AssignedReplica(subnet_id) => vec![ FirewallRulesScope::Node(self.node_id), FirewallRulesScope::Subnet(subnet_id), FirewallRulesScope::ReplicaNodes, @@ -549,9 +537,6 @@ impl Firewall { Role::AssignedReplica(_) | Role::UnassignedReplica => { tcp_rules.append(&mut self.replica_config.default_rules.clone()); } - Role::AssignedCloudEngine(_) => { - tcp_rules.append(&mut self.cloud_engine_config.default_rules.clone()); - } Role::BoundaryNode => { tcp_rules.append(&mut self.boundary_node_config.default_rules.clone()); } @@ -564,19 +549,14 @@ impl Firewall { // Whitelisting for node IPs // In addition to any explicit firewall rules we might apply, we also ALWAYS whitelist // all nodes in the registry on the ports used by the protocol - Role::AssignedReplica(_) | Role::AssignedCloudEngine(_) | Role::UnassignedReplica => { + Role::AssignedReplica(_) | Role::UnassignedReplica => { let (more_tcp_rules, more_udp_rules) = self.get_node_whitelisting_rules(registry_version); // Insert the whitelisting rules at the top of the list (highest priority) tcp_rules = more_tcp_rules.into_iter().chain(tcp_rules).collect(); udp_rules = more_udp_rules.into_iter().chain(udp_rules).collect(); - if matches!(role, Role::AssignedCloudEngine(_)) { - self.cloud_engine_config.insert_rules(tcp_rules, udp_rules) - } else { - // matches!(role, Role::AssignedReplica(_) | Role::UnassignedReplica) - self.replica_config.insert_rules(tcp_rules, udp_rules) - } + self.replica_config.insert_rules(tcp_rules, udp_rules) } Role::BoundaryNode => { let socks_proxy_whitelisting_rules = @@ -627,7 +607,6 @@ impl Firewall { fn write_firewall_file(&self, content: &str, role: Role) -> OrchestratorResult<()> { let f = match role { Role::AssignedReplica(_) | Role::UnassignedReplica => &self.replica_config.config_file, - 
Role::AssignedCloudEngine(_) => &self.cloud_engine_config.config_file, Role::BoundaryNode => &self.boundary_node_config.config_file, }; write_string_using_tmp_file(f, content) @@ -731,76 +710,6 @@ impl FirewallConfigTemplate for ReplicaFirewallConfig { } } -impl FirewallConfigTemplate for CloudEngineFirewallConfig { - fn insert_rules(&self, tcp_rules: Vec, udp_rules: Vec) -> String { - self.file_template - .replace( - "<>", - &compile_rules( - &self.ipv4_tcp_rule_template, - &tcp_rules, - vec![ - FirewallRuleDirection::Inbound, - FirewallRuleDirection::Unspecified, - ], - ), - ) - .replace( - "<>", - &compile_rules( - &self.ipv4_udp_rule_template, - &udp_rules, - vec![ - FirewallRuleDirection::Inbound, - FirewallRuleDirection::Unspecified, - ], - ), - ) - .replace( - "<>", - &compile_rules( - &self.ipv6_tcp_rule_template, - &tcp_rules, - vec![ - FirewallRuleDirection::Inbound, - FirewallRuleDirection::Unspecified, - ], - ), - ) - .replace( - "<>", - &compile_rules( - &self.ipv6_udp_rule_template, - &udp_rules, - vec![ - FirewallRuleDirection::Inbound, - FirewallRuleDirection::Unspecified, - ], - ), - ) - .replace( - "<>", - &compile_rules( - &self.ipv4_user_output_rule_template, - &tcp_rules, - vec![FirewallRuleDirection::Outbound], - ), - ) - .replace( - "<>", - &compile_rules( - &self.ipv6_user_output_rule_template, - &tcp_rules, - vec![FirewallRuleDirection::Outbound], - ), - ) - .replace( - "<>", - &self.max_simultaneous_connections_per_ip_address.to_string(), - ) - } -} - impl FirewallConfigTemplate for BoundaryNodeFirewallConfig { fn insert_rules(&self, tcp_rules: Vec, udp_rules: Vec) -> String { self.file_template @@ -1136,7 +1045,7 @@ mod tests { #[test] fn nftables_golden_assigned_cloud_engine_test() { golden_test( - Role::AssignedCloudEngine(SUBNET_ID), + Role::AssignedReplica(SUBNET_ID), node_test_id(0), Some(NodeRewardType::Type4), NFTABLES_ASSIGNED_CLOUD_ENGINE_GOLDEN_BYTES, @@ -1251,17 +1160,12 @@ mod tests { replica_firewall_config .config_file 
.clone_from(&nftables_config_path); - let mut cloud_engine_firewall_config = config.cloud_engine_firewall.unwrap(); - cloud_engine_firewall_config - .config_file - .clone_from(&nftables_config_path); let mut boundary_node_firewall_config = config.boundary_node_firewall.unwrap(); boundary_node_firewall_config .config_file .clone_from(&nftables_config_path); let mut firewall = set_up_firewall_dependencies( replica_firewall_config, - cloud_engine_firewall_config, boundary_node_firewall_config, tmp_dir.path(), role, @@ -1332,7 +1236,6 @@ mod tests { /// Sets up all the necessary dependencies of the [`Firewall`] fn set_up_firewall_dependencies( config: ReplicaFirewallConfig, - cloud_engine_config: CloudEngineFirewallConfig, boundary_node_config: BoundaryNodeFirewallConfig, tmp_dir: &Path, role: Role, @@ -1354,7 +1257,6 @@ mod tests { registry_helper, Arc::new(OrchestratorMetrics::new(&ic_metrics::MetricsRegistry::new())), config, - cloud_engine_config, boundary_node_config, cup_reader, no_op_logger(), @@ -1575,18 +1477,6 @@ mod tests { ); subnet_ids.push(subnet_id); } - Role::AssignedCloudEngine(subnet_id) => { - let subnet_record = SubnetRecordBuilder::from(&[node]) - .with_subnet_type(SubnetType::CloudEngine) - .build(); - add_single_subnet_record( - ®istry_data_provider, - registry_version.get(), - subnet_id, - subnet_record, - ); - subnet_ids.push(subnet_id); - } Role::BoundaryNode => { add_api_boundary_node_record(®istry_data_provider, registry_version, node); } diff --git a/rs/orchestrator/src/orchestrator.rs b/rs/orchestrator/src/orchestrator.rs index 65493553018a..7862b89c2806 100644 --- a/rs/orchestrator/src/orchestrator.rs +++ b/rs/orchestrator/src/orchestrator.rs @@ -363,7 +363,6 @@ impl Orchestrator { Arc::clone(®istry), Arc::clone(&metrics), config.firewall.clone(), - config.cloud_engine_firewall.clone(), config.boundary_node_firewall.clone(), local_cup_reader.clone(), logger.clone(), diff --git a/rs/orchestrator/src/registry_helper.rs 
b/rs/orchestrator/src/registry_helper.rs index de8ae7df7c5a..ee807b597c41 100644 --- a/rs/orchestrator/src/registry_helper.rs +++ b/rs/orchestrator/src/registry_helper.rs @@ -204,13 +204,13 @@ impl RegistryHelper { Ok(ids.unwrap_or_default()) } - pub(crate) fn get_subnet_id_and_type_from_node_id( + pub(crate) fn get_subnet_id_from_node_id( &self, node_id: NodeId, version: RegistryVersion, - ) -> OrchestratorResult> { + ) -> OrchestratorResult> { self.registry_client - .get_subnet_id_and_type_from_node_id(node_id, version) + .get_subnet_id_from_node_id(node_id, version) .map_err(OrchestratorError::RegistryClientError) } diff --git a/rs/orchestrator/src/upgrade.rs b/rs/orchestrator/src/upgrade.rs index 65c79cf4deb4..b31d8e7b7e91 100644 --- a/rs/orchestrator/src/upgrade.rs +++ b/rs/orchestrator/src/upgrade.rs @@ -722,10 +722,8 @@ fn get_subnet_id(registry: &dyn RegistryClient, cup: &CatchUpPackage) -> Result< .iter() .next() .ok_or("No nodes in current transcript committee found")?; - match registry - .get_subnet_id_and_type_from_node_id(*node_id, dkg_summary.registry_version) - { - Ok(Some((subnet_id, _subnet_type))) => Ok(subnet_id), + match registry.get_subnet_id_from_node_id(*node_id, dkg_summary.registry_version) { + Ok(Some(subnet_id)) => Ok(subnet_id), other => Err(format!( "Couldn't get the subnet id from the registry for node {:?} at registry version {}: {:?}", node_id, dkg_summary.registry_version, other diff --git a/rs/orchestrator/testdata/nftables_assigned_cloud_engine.conf.golden b/rs/orchestrator/testdata/nftables_assigned_cloud_engine.conf.golden index ce7fc9b2d210..c15ea8397ac8 100644 --- a/rs/orchestrator/testdata/nftables_assigned_cloud_engine.conf.golden +++ b/rs/orchestrator/testdata/nftables_assigned_cloud_engine.conf.golden @@ -20,18 +20,12 @@ table filter { flags dynamic } - set blackhole { - type ipv4_addr - size 65535 - } - counter rate_limit_v4_counter {} counter connection_limit_v4_counter {} chain INPUT { type filter hook input priority 0; 
policy drop; iif lo accept - ip saddr @blackhole drop ct state new add @rate_limit { ip saddr limit rate over 1000/minute burst 500 packets } counter name rate_limit_v4_counter drop # Notes about the rule below: # - The rule allows a maximum of 1000 persistent connections to any ip address. @@ -46,8 +40,8 @@ ip saddr {4.4.4.4} ct state { new } tcp dport {1004} accept # replica_nodes ip saddr {6.6.6.6} ct state { new } tcp dport {1006} accept # global ip saddr {1.1.1.1,3.0.0.3,3.0.0.4,3.0.0.5,3.0.0.6,3.0.0.7,4.0.0.4,4.0.0.5,4.0.0.6,4.0.0.7} udp dport {4100} accept # Automatic whitelisted nodes whitelisting ct state { invalid } drop + # - The rule accepts all established and related connections. It's required for the IPv4 connectivity check. ct state { established, related } accept - ip saddr { 0.0.0.0-255.255.255.255 } ct state new tcp dport 80 accept log prefix "Drop - default policy: " } @@ -58,7 +52,7 @@ ip saddr {6.6.6.6} ct state { new } tcp dport {1006} accept # global chain OUTPUT { type filter hook output priority 0; policy accept; meta skuid ic-http-adapter ip daddr { 127.0.0.0/8 } ct state { new } tcp dport { 1-19999 } reject # Block restricted localhost ic-http-adapter HTTPS access - meta skuid ic-http-adapter ip daddr {1.1.1.1,3.0.0.3,3.0.0.4,3.0.0.5,3.0.0.6,3.0.0.7,4.0.0.4,4.0.0.5,4.0.0.6,4.0.0.7} ct state { new } tcp dport {22,2497,4100,7070,8080,9090,9091,9100,9314,9324,19100,19522,19523,19531} reject # Automatic blacklisting for ic-http-adapter + meta skuid ic-http-adapter ip daddr {1.1.1.1,3.0.0.3,3.0.0.4,3.0.0.5,3.0.0.6,3.0.0.7,4.0.0.4,4.0.0.5,4.0.0.6,4.0.0.7} ct state { new } tcp dport {22,2497,4100,7070,8080,9090,9091,9100,19100,19523,19531} reject # Automatic blacklisting for ic-http-adapter } } @@ -86,18 +80,12 @@ table ip6 filter { flags dynamic } - set blackhole6 { - type ipv6_addr - size 65535 - } - counter rate_limit_v6_counter {} counter connection_limit_v6_counter {} chain INPUT { type filter hook input priority 0; policy drop; iif lo 
accept - ip6 saddr @blackhole6 drop ct state { invalid } drop ct state { established, related } accept ct state new add @rate_limit { ip6 saddr limit rate over 1000/minute burst 500 packets } counter name rate_limit_v6_counter drop @@ -109,10 +97,9 @@ table ip6 filter { # DHCPv6 udp dport { 546 } accept # TCP ports required for GuestOS functionality - ip6 saddr { ::/64 } ct state { new } tcp dport { 7070, 9090, 9091, 9100, 9314, 19531, 19100, 19522 } accept + ip6 saddr { ::/64 } ct state { new } tcp dport { 7070, 9090, 9091, 9100, 19531, 19100, 19522 } accept # Allow access from HostOS metrics-proxy so GuestOS metrics-proxy can proxy certain metrics to HostOS ip6 saddr { hostos } ct state { new } tcp dport { 42372 } accept - ip6 saddr { ::-ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff } ct state new tcp dport 80 accept # Custom templated rules ip6 saddr {2001:db8:85a3::8a2e:1370:7334,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e5,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e6,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e7,a4c2:7f91:3db6:1e8c:5a4f:cc92:b37:6e41} ct state { new } tcp dport {22,8080} accept # Automatic whitelisted nodes whitelisting ip6 saddr {2001:db8:85a3::8a2e:1370:7334,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e5,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e6,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e7,a4c2:7f91:3db6:1e8c:5a4f:cc92:b37:6e41} ct state { new } tcp dport {2497} accept # Automatic all nodes whitelisting @@ -132,6 +119,6 @@ ip6 saddr {::ffff:6.6.6.6} ct state { new } tcp dport {1006} accept # global type filter hook output priority 0; policy accept; meta skuid ic-http-adapter fib daddr type local ct state { new } tcp dport { 1-19999 } reject # Block restricted local addresses ic-http-adapter HTTPS access meta skuid ic-http-adapter ip6 daddr { 2a00:fb01:400:42::/64, 2602:fb2b:110::/48, 2602:fb2b:100::/48, 2602:fb2b:120::/48 } ct state { new } tcp dport { 1-19999 } reject # Block restricted outbound ic-http-adapter HTTPS access - meta skuid ic-http-adapter ip6 daddr 
{2001:db8:85a3::8a2e:1370:7334,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e5,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e6,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e7,a4c2:7f91:3db6:1e8c:5a4f:cc92:b37:6e41} ct state { new } tcp dport {22,2497,4100,7070,8080,9090,9091,9100,9314,9324,19100,19522,19523,19531} reject # Automatic blacklisting for ic-http-adapter + meta skuid ic-http-adapter ip6 daddr {2001:db8:85a3::8a2e:1370:7334,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e5,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e6,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e7,a4c2:7f91:3db6:1e8c:5a4f:cc92:b37:6e41} ct state { new } tcp dport {22,2497,4100,7070,8080,9090,9091,9100,19100,19523,19531} reject # Automatic blacklisting for ic-http-adapter } } diff --git a/rs/orchestrator/testdata/nftables_assigned_replica.conf.golden b/rs/orchestrator/testdata/nftables_assigned_replica.conf.golden index 746f89d4f9d2..9c70a94fe706 100644 --- a/rs/orchestrator/testdata/nftables_assigned_replica.conf.golden +++ b/rs/orchestrator/testdata/nftables_assigned_replica.conf.golden @@ -52,7 +52,7 @@ ip saddr {6.6.6.6} ct state { new } tcp dport {1006} accept # global chain OUTPUT { type filter hook output priority 0; policy accept; meta skuid ic-http-adapter ip daddr { 127.0.0.0/8 } ct state { new } tcp dport { 1-19999 } reject # Block restricted localhost ic-http-adapter HTTPS access - meta skuid ic-http-adapter ip daddr {1.1.1.1,3.0.0.3,3.0.0.4,3.0.0.5,3.0.0.6,3.0.0.7,4.0.0.4,4.0.0.5,4.0.0.6,4.0.0.7} ct state { new } tcp dport {22,2497,4100,7070,8080,9090,9091,9100,9314,9324,19100,19522,19523,19531} reject # Automatic blacklisting for ic-http-adapter + meta skuid ic-http-adapter ip daddr {1.1.1.1,3.0.0.3,3.0.0.4,3.0.0.5,3.0.0.6,3.0.0.7,4.0.0.4,4.0.0.5,4.0.0.6,4.0.0.7} ct state { new } tcp dport {22,2497,4100,7070,8080,9090,9091,9100,19100,19523,19531} reject # Automatic blacklisting for ic-http-adapter } } @@ -119,6 +119,6 @@ ip6 saddr {::ffff:6.6.6.6} ct state { new } tcp dport {1006} accept # global type filter hook output 
priority 0; policy accept; meta skuid ic-http-adapter fib daddr type local ct state { new } tcp dport { 1-19999 } reject # Block restricted local addresses ic-http-adapter HTTPS access meta skuid ic-http-adapter ip6 daddr { 2a00:fb01:400:42::/64, 2602:fb2b:110::/48, 2602:fb2b:100::/48, 2602:fb2b:120::/48 } ct state { new } tcp dport { 1-19999 } reject # Block restricted outbound ic-http-adapter HTTPS access - meta skuid ic-http-adapter ip6 daddr {2001:db8:85a3::8a2e:1370:7334,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e5,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e6,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e7,a4c2:7f91:3db6:1e8c:5a4f:cc92:b37:6e41} ct state { new } tcp dport {22,2497,4100,7070,8080,9090,9091,9100,9314,9324,19100,19522,19523,19531} reject # Automatic blacklisting for ic-http-adapter + meta skuid ic-http-adapter ip6 daddr {2001:db8:85a3::8a2e:1370:7334,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e5,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e6,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e7,a4c2:7f91:3db6:1e8c:5a4f:cc92:b37:6e41} ct state { new } tcp dport {22,2497,4100,7070,8080,9090,9091,9100,19100,19523,19531} reject # Automatic blacklisting for ic-http-adapter } } diff --git a/rs/orchestrator/testdata/nftables_unassigned_cloud_engine.conf.golden b/rs/orchestrator/testdata/nftables_unassigned_cloud_engine.conf.golden index f643312371a6..c816c5e3ba9b 100644 --- a/rs/orchestrator/testdata/nftables_unassigned_cloud_engine.conf.golden +++ b/rs/orchestrator/testdata/nftables_unassigned_cloud_engine.conf.golden @@ -51,7 +51,7 @@ ip saddr {6.6.6.6} ct state { new } tcp dport {1006} accept # global chain OUTPUT { type filter hook output priority 0; policy accept; meta skuid ic-http-adapter ip daddr { 127.0.0.0/8 } ct state { new } tcp dport { 1-19999 } reject # Block restricted localhost ic-http-adapter HTTPS access - meta skuid ic-http-adapter ip daddr {1.1.1.1,3.0.0.3,3.0.0.4,3.0.0.5,3.0.0.6,3.0.0.7,4.0.0.4,4.0.0.5,4.0.0.6,4.0.0.7} ct state { new } tcp dport 
{22,2497,4100,7070,8080,9090,9091,9100,9314,9324,19100,19522,19523,19531} reject # Automatic blacklisting for ic-http-adapter + meta skuid ic-http-adapter ip daddr {1.1.1.1,3.0.0.3,3.0.0.4,3.0.0.5,3.0.0.6,3.0.0.7,4.0.0.4,4.0.0.5,4.0.0.6,4.0.0.7} ct state { new } tcp dport {22,2497,4100,7070,8080,9090,9091,9100,19100,19523,19531} reject # Automatic blacklisting for ic-http-adapter } } @@ -117,6 +117,6 @@ ip6 saddr {::ffff:6.6.6.6} ct state { new } tcp dport {1006} accept # global type filter hook output priority 0; policy accept; meta skuid ic-http-adapter fib daddr type local ct state { new } tcp dport { 1-19999 } reject # Block restricted local addresses ic-http-adapter HTTPS access meta skuid ic-http-adapter ip6 daddr { 2a00:fb01:400:42::/64, 2602:fb2b:110::/48, 2602:fb2b:100::/48, 2602:fb2b:120::/48 } ct state { new } tcp dport { 1-19999 } reject # Block restricted outbound ic-http-adapter HTTPS access - meta skuid ic-http-adapter ip6 daddr {2001:db8:85a3::8a2e:1370:7334,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e5,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e6,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e7,a4c2:7f91:3db6:1e8c:5a4f:cc92:b37:6e41} ct state { new } tcp dport {22,2497,4100,7070,8080,9090,9091,9100,9314,9324,19100,19522,19523,19531} reject # Automatic blacklisting for ic-http-adapter + meta skuid ic-http-adapter ip6 daddr {2001:db8:85a3::8a2e:1370:7334,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e5,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e6,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e7,a4c2:7f91:3db6:1e8c:5a4f:cc92:b37:6e41} ct state { new } tcp dport {22,2497,4100,7070,8080,9090,9091,9100,19100,19523,19531} reject # Automatic blacklisting for ic-http-adapter } } diff --git a/rs/orchestrator/testdata/nftables_unassigned_replica.conf.golden b/rs/orchestrator/testdata/nftables_unassigned_replica.conf.golden index 4424486dd75c..ef1c0e4869e8 100644 --- a/rs/orchestrator/testdata/nftables_unassigned_replica.conf.golden +++ b/rs/orchestrator/testdata/nftables_unassigned_replica.conf.golden 
@@ -51,7 +51,7 @@ ip saddr {6.6.6.6} ct state { new } tcp dport {1006} accept # global chain OUTPUT { type filter hook output priority 0; policy accept; meta skuid ic-http-adapter ip daddr { 127.0.0.0/8 } ct state { new } tcp dport { 1-19999 } reject # Block restricted localhost ic-http-adapter HTTPS access - meta skuid ic-http-adapter ip daddr {1.1.1.1,3.0.0.3,3.0.0.4,3.0.0.5,3.0.0.6,3.0.0.7,4.0.0.4,4.0.0.5,4.0.0.6,4.0.0.7} ct state { new } tcp dport {22,2497,4100,7070,8080,9090,9091,9100,9314,9324,19100,19522,19523,19531} reject # Automatic blacklisting for ic-http-adapter + meta skuid ic-http-adapter ip daddr {1.1.1.1,3.0.0.3,3.0.0.4,3.0.0.5,3.0.0.6,3.0.0.7,4.0.0.4,4.0.0.5,4.0.0.6,4.0.0.7} ct state { new } tcp dport {22,2497,4100,7070,8080,9090,9091,9100,19100,19523,19531} reject # Automatic blacklisting for ic-http-adapter } } @@ -117,6 +117,6 @@ ip6 saddr {::ffff:6.6.6.6} ct state { new } tcp dport {1006} accept # global type filter hook output priority 0; policy accept; meta skuid ic-http-adapter fib daddr type local ct state { new } tcp dport { 1-19999 } reject # Block restricted local addresses ic-http-adapter HTTPS access meta skuid ic-http-adapter ip6 daddr { 2a00:fb01:400:42::/64, 2602:fb2b:110::/48, 2602:fb2b:100::/48, 2602:fb2b:120::/48 } ct state { new } tcp dport { 1-19999 } reject # Block restricted outbound ic-http-adapter HTTPS access - meta skuid ic-http-adapter ip6 daddr {2001:db8:85a3::8a2e:1370:7334,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e5,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e6,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e7,a4c2:7f91:3db6:1e8c:5a4f:cc92:b37:6e41} ct state { new } tcp dport {22,2497,4100,7070,8080,9090,9091,9100,9314,9324,19100,19522,19523,19531} reject # Automatic blacklisting for ic-http-adapter + meta skuid ic-http-adapter ip6 daddr {2001:db8:85a3::8a2e:1370:7334,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e5,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e6,3fda:92b7:4c1e:8a23:7d61:2f9c:ab42:19e7,a4c2:7f91:3db6:1e8c:5a4f:cc92:b37:6e41} ct state { 
new } tcp dport {22,2497,4100,7070,8080,9090,9091,9100,19100,19523,19531} reject # Automatic blacklisting for ic-http-adapter } } diff --git a/rs/registry/helpers/src/node.rs b/rs/registry/helpers/src/node.rs index 17a656b8fe60..69a9d9ee6570 100644 --- a/rs/registry/helpers/src/node.rs +++ b/rs/registry/helpers/src/node.rs @@ -1,8 +1,7 @@ use crate::deserialize_registry_value; -use crate::subnet::{SubnetListRegistry, SubnetRegistry, get_node_ids_from_subnet_record}; +use crate::subnet::{SubnetListRegistry, SubnetRegistry}; use ic_interfaces_registry::{RegistryClient, RegistryClientResult}; pub use ic_protobuf::registry::node::v1::{ConnectionEndpoint, NodeRecord}; -use ic_protobuf::registry::subnet::v1::SubnetType; use ic_registry_keys::{NODE_RECORD_KEY_PREFIX, get_node_record_node_id, make_node_record_key}; use ic_types::registry::RegistryClientError; pub use ic_types::{NodeId, RegistryVersion, SubnetId}; @@ -14,11 +13,11 @@ pub trait NodeRegistry { version: RegistryVersion, ) -> RegistryClientResult; - fn get_subnet_id_and_type_from_node_id( + fn get_subnet_id_from_node_id( &self, node_id: NodeId, version: RegistryVersion, - ) -> RegistryClientResult<(SubnetId, SubnetType)>; + ) -> RegistryClientResult; /// Returns a list of node ids that contains the id of each node that exists /// at version `version`. @@ -35,24 +34,17 @@ impl NodeRegistry for T { deserialize_registry_value::(bytes) } - fn get_subnet_id_and_type_from_node_id( + fn get_subnet_id_from_node_id( &self, node_id: NodeId, version: RegistryVersion, - ) -> RegistryClientResult<(SubnetId, SubnetType)> { + ) -> RegistryClientResult { if let Some(subnet_ids) = self.get_subnet_ids(version)? { for subnet_id in subnet_ids { - let Some(subnet_record) = self.get_subnet_record(subnet_id, version)? 
else { - continue; - }; - let node_ids = get_node_ids_from_subnet_record(&subnet_record).map_err(|err| { - RegistryClientError::DecodeError { - error: format!("get_node_ids_from_subnet_record() failed with {err}"), - } - })?; - let subnet_type = subnet_record.subnet_type(); - if node_ids.contains(&node_id) { - return Ok(Some((subnet_id, subnet_type))); + if let Some(node_ids) = self.get_node_ids_on_subnet(subnet_id, version)? + && node_ids.contains(&node_id) + { + return Ok(Some(subnet_id)); } } } From b0a493c51e8e3ac57b6307711e2a9a8ed6ec1500 Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Thu, 18 Jun 2026 14:00:11 +0000 Subject: [PATCH 22/42] feat: gate launching ic-gateway behind flag --- rs/orchestrator/src/processes.rs | 41 +++++++++++++++------ rs/orchestrator/src/upgrade.rs | 4 ++ rs/tests/consensus/orchestrator/BUILD.bazel | 4 ++ 3 files changed, 38 insertions(+), 11 deletions(-) diff --git a/rs/orchestrator/src/processes.rs b/rs/orchestrator/src/processes.rs index f7946e97e14d..5056de6f955c 100644 --- a/rs/orchestrator/src/processes.rs +++ b/rs/orchestrator/src/processes.rs @@ -386,10 +386,22 @@ impl IcBoundaryManager { // the node's configuration in the registry. // --------------------------------------------------------------------------- +/// Whether the orchestrator is currently allowed to actually launch +/// `ic-gateway`. CloudEngine nodes *should* run `ic-gateway` (see +/// [`should_run_ic_gateway`]), but the launch is gated off for now while the +/// rollout is being prepared. To trigger it later, flip this to `true` (and +/// re-enable the `cloud_engine_ic_gateway_test` system test by removing its +/// `manual` tag in `rs/tests/consensus/orchestrator/BUILD.bazel`). +const IC_GATEWAY_LAUNCH_ENABLED: bool = false; + pub(crate) struct MultipleProcessesManager { replica_manager: ProcessManager, ic_gateway_manager: ProcessManager, registry: Arc, + /// Whether this manager is allowed to actually launch `ic-gateway`. 
+ /// Sourced from [`IC_GATEWAY_LAUNCH_ENABLED`] in production; injected by + /// tests so they can exercise both gate states. + ic_gateway_launch_enabled: bool, } impl MultipleProcessesManager { @@ -398,11 +410,13 @@ impl MultipleProcessesManager { replica_manager: ProcessManager, ic_gateway_manager: ProcessManager, registry: Arc, + ic_gateway_launch_enabled: bool, ) -> Self { Self { replica_manager, ic_gateway_manager, registry, + ic_gateway_launch_enabled, } } @@ -421,6 +435,7 @@ impl MultipleProcessesManager { replica_manager, ic_gateway_manager, registry, + ic_gateway_launch_enabled: IC_GATEWAY_LAUNCH_ENABLED, } } @@ -457,17 +472,21 @@ impl MultipleProcessesManager { self.replica_manager .ensure_running((replica_version.clone(), subnet_id))?; - // Cloud-engine nodes run ic-gateway as a sidecar. - match self.registry.get_subnet_type(subnet_id, registry_version)? { - None - | Some(SubnetType::Unspecified) - | Some(SubnetType::Application) - | Some(SubnetType::System) - | Some(SubnetType::VerifiedApplication) => { - self.ic_gateway_manager.stop()?; - } - Some(SubnetType::CloudEngine) => { - self.ic_gateway_manager.ensure_running(replica_version)?; + // Cloud-engine nodes run ic-gateway as a sidecar, but only once the + // launch is enabled (see `IC_GATEWAY_LAUNCH_ENABLED`). Until then we + // keep it stopped. + if self.ic_gateway_launch_enabled { + match self.registry.get_subnet_type(subnet_id, registry_version)? 
{ + None + | Some(SubnetType::Unspecified) + | Some(SubnetType::Application) + | Some(SubnetType::System) + | Some(SubnetType::VerifiedApplication) => { + self.ic_gateway_manager.stop()?; + } + Some(SubnetType::CloudEngine) => { + self.ic_gateway_manager.ensure_running(replica_version)?; + } } } diff --git a/rs/orchestrator/src/upgrade.rs b/rs/orchestrator/src/upgrade.rs index b31d8e7b7e91..85ec2503162a 100644 --- a/rs/orchestrator/src/upgrade.rs +++ b/rs/orchestrator/src/upgrade.rs @@ -1567,6 +1567,10 @@ mod tests { logger.clone(), ), Arc::clone(®istry), + // ic-gateway launching is gated off in production, but these + // scenarios exercise the policy (CloudEngine => ic-gateway runs), + // so run them with the gate enabled. + true, ))); let manageboot_runner = Box::new(FakeManagebootRunner); diff --git a/rs/tests/consensus/orchestrator/BUILD.bazel b/rs/tests/consensus/orchestrator/BUILD.bazel index 908b6505795d..9c2f494b5a33 100644 --- a/rs/tests/consensus/orchestrator/BUILD.bazel +++ b/rs/tests/consensus/orchestrator/BUILD.bazel @@ -121,6 +121,10 @@ system_test_nns( system_test( name = "cloud_engine_ic_gateway_test", + # Excluded from automatic runs while ic-gateway launching is gated off in the + # orchestrator (see IC_GATEWAY_LAUNCH_ENABLED in + # rs/orchestrator/src/processes.rs). Remove this tag when enabling the launch. + tags = ["manual"], deps = [ # Keep sorted. 
"//rs/registry/subnet_type", From 93a758e7807b44cd88cbe19fe5a271d1437a7d0f Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Thu, 18 Jun 2026 15:12:33 +0000 Subject: [PATCH 23/42] docs --- rs/orchestrator/src/upgrade.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/rs/orchestrator/src/upgrade.rs b/rs/orchestrator/src/upgrade.rs index 85ec2503162a..15be583f536d 100644 --- a/rs/orchestrator/src/upgrade.rs +++ b/rs/orchestrator/src/upgrade.rs @@ -1567,10 +1567,7 @@ mod tests { logger.clone(), ), Arc::clone(®istry), - // ic-gateway launching is gated off in production, but these - // scenarios exercise the policy (CloudEngine => ic-gateway runs), - // so run them with the gate enabled. - true, + /* ic_gateway_launch_enabled */ true, ))); let manageboot_runner = Box::new(FakeManagebootRunner); From aefebff65d709c4befcb49ae293e9c9f048803ca Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Fri, 19 Jun 2026 08:12:16 +0000 Subject: [PATCH 24/42] docs --- rs/orchestrator/src/processes.rs | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/rs/orchestrator/src/processes.rs b/rs/orchestrator/src/processes.rs index 5056de6f955c..d6951ddb196c 100644 --- a/rs/orchestrator/src/processes.rs +++ b/rs/orchestrator/src/processes.rs @@ -316,7 +316,7 @@ impl IcBoundaryManager { } } - // TODO: consider returning an error when thing fail instead of logging + // TODO: consider returning an error when things fail instead of logging pub(crate) fn ensure_ic_boundary_running_and_restarted_on_domain_change( &mut self, replica_version: ReplicaVersion, @@ -387,11 +387,10 @@ impl IcBoundaryManager { // --------------------------------------------------------------------------- /// Whether the orchestrator is currently allowed to actually launch -/// `ic-gateway`. CloudEngine nodes *should* run `ic-gateway` (see -/// [`should_run_ic_gateway`]), but the launch is gated off for now while the -/// rollout is being prepared. 
To trigger it later, flip this to `true` (and -/// re-enable the `cloud_engine_ic_gateway_test` system test by removing its -/// `manual` tag in `rs/tests/consensus/orchestrator/BUILD.bazel`). +/// `ic-gateway`. CloudEngine nodes *should* run `ic-gateway`, but the launch +/// is gated off for now while the rollout is being prepared. To trigger it +/// later, flip this to `true` (and re-enable the `cloud_engine_ic_gateway_test` +/// system test by removing its `manual` tag). const IC_GATEWAY_LAUNCH_ENABLED: bool = false; pub(crate) struct MultipleProcessesManager { @@ -473,8 +472,8 @@ impl MultipleProcessesManager { .ensure_running((replica_version.clone(), subnet_id))?; // Cloud-engine nodes run ic-gateway as a sidecar, but only once the - // launch is enabled (see `IC_GATEWAY_LAUNCH_ENABLED`). Until then we - // keep it stopped. + // launch is enabled (see `IC_GATEWAY_LAUNCH_ENABLED`). Until then, + // ignore it. if self.ic_gateway_launch_enabled { match self.registry.get_subnet_type(subnet_id, registry_version)? 
{ None From 7ba43d7a35b8cced822fcb10300cc517058cb4e0 Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Mon, 22 Jun 2026 08:12:57 +0000 Subject: [PATCH 25/42] style: inline args.ic_binary_directory --- rs/orchestrator/src/orchestrator.rs | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/rs/orchestrator/src/orchestrator.rs b/rs/orchestrator/src/orchestrator.rs index 7862b89c2806..053eb54d7056 100644 --- a/rs/orchestrator/src/orchestrator.rs +++ b/rs/orchestrator/src/orchestrator.rs @@ -243,9 +243,8 @@ impl Orchestrator { Arc::clone(&crypto) as _, ); - let ic_binary_directory = args.ic_binary_directory; let manageboot_runner = Box::new(ManagebootRunnerImpl::new( - ic_binary_directory.join("manageboot.sh"), + args.ic_binary_directory.join("manageboot.sh"), )); // Create a read-only CUP reader that can be shared among Dashboard and Firewall @@ -263,12 +262,12 @@ impl Orchestrator { ); let replica_process_config = ReplicaProcessConfig { - ic_binary_dir: ic_binary_directory.clone(), + ic_binary_dir: args.ic_binary_directory.clone(), cup_path: local_cup_reader.get_cup_path(), replica_config_file: args.replica_config_file.clone(), }; let ic_gateway_process_config = IcGatewayProcessConfig { - ic_binary_dir: ic_binary_directory.clone(), + ic_binary_dir: args.ic_binary_directory.clone(), ic_gateway_env_file: args.ic_gateway_env_file.clone(), }; @@ -340,7 +339,7 @@ impl Orchestrator { }; let ic_boundary_process_config = IcBoundaryProcessConfig { - ic_binary_dir: ic_binary_directory.clone(), + ic_binary_dir: args.ic_binary_directory.clone(), ic_boundary_env_file: args.ic_boundary_env_file.clone(), crypto_config: config.crypto.clone(), }; @@ -371,7 +370,7 @@ impl Orchestrator { let ipv4_configurator = Ipv4Configurator::new( Arc::clone(®istry), Arc::clone(&metrics), - ic_binary_directory, + args.ic_binary_directory, logger.clone(), ); From 603cfb58b6f6e52fe91b2edbe0ca241645cbad80 Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Mon, 22 Jun 2026 
08:21:51 +0000 Subject: [PATCH 26/42] docs --- rs/orchestrator/src/process_manager.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/rs/orchestrator/src/process_manager.rs b/rs/orchestrator/src/process_manager.rs index 409f1b1b1ba4..f5472857d34c 100644 --- a/rs/orchestrator/src/process_manager.rs +++ b/rs/orchestrator/src/process_manager.rs @@ -58,6 +58,7 @@ pub(crate) trait Process { /// Trait for running a single versioned [`Process`] pub(crate) trait ProcessRunner: Send { + /// Start the given process. fn start(&mut self, process: P) -> Result<()>; /// Stop the currently running process. @@ -71,7 +72,7 @@ pub(crate) trait ProcessRunner: Send { fn get_pid(&self) -> Option; } -/// Runs a single versioned [`Process`] by implementing [`ProcessRunner

`]. +/// A [`SingleProcessRunner`] manages running a single versioned [`Process`] pub(crate) struct SingleProcessRunner { process: Option

, pid_cell: PIDCell, @@ -115,9 +116,7 @@ impl SingleProcessRunner

{ /// (so its PID equals its PGID, which is what the negation below /// relies on) by setting its process group at spawn time via /// `Command::process_group(0)` -- see `start`. We therefore do not - /// rely on the managed binary calling `setpgid` itself. Sub-processes - /// cannot escape the group because `setpgid` is restricted in - /// production by SELinux type enforcement. + /// rely on the managed binary calling `setpgid` itself. /// /// We still depend on init to handle reaping of adopted children, /// as the orchestrator has no way of adopting or even knowing the From afc74aaff5c4d3e7eaef3b28a80325343ff1e103 Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Mon, 22 Jun 2026 08:21:59 +0000 Subject: [PATCH 27/42] style: rename Fake struct --- rs/orchestrator/src/upgrade.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/rs/orchestrator/src/upgrade.rs b/rs/orchestrator/src/upgrade.rs index 15be583f536d..c349e175540f 100644 --- a/rs/orchestrator/src/upgrade.rs +++ b/rs/orchestrator/src/upgrade.rs @@ -1190,15 +1190,15 @@ mod tests { /// Fake runner that tracks running state without spawning a real process. /// Used as a drop-in for `SingleProcessRunner

` inside process managers. - pub(crate) struct FakeRunner { + pub(crate) struct FakeProcessRunner { running: bool, } - impl FakeRunner { + impl FakeProcessRunner { pub(crate) fn new() -> Self { Self { running: false } } } - impl ProcessRunner

for FakeRunner { + impl ProcessRunner

for FakeProcessRunner { fn start(&mut self, _process: P) -> std::io::Result<()> { self.running = true; Ok(()) @@ -1519,13 +1519,13 @@ mod tests { let ic_gateway_env_file = dir.join("ic-gateway.env"); std::fs::write(&ic_gateway_env_file, b"TEST_KEY=TEST_VALUE").unwrap(); - let mut replica_runner = Box::new(FakeRunner::new()); + let mut replica_runner = Box::new(FakeProcessRunner::new()); let replica_process_config = ReplicaProcessConfig { ic_binary_dir: ic_binary_dir.clone(), cup_path, replica_config_file: replica_config_file.clone(), }; - let mut ic_gateway_runner = Box::new(FakeRunner::new()); + let mut ic_gateway_runner = Box::new(FakeProcessRunner::new()); let ic_gateway_process_config = IcGatewayProcessConfig { ic_binary_dir, ic_gateway_env_file, From e12eccceb6ae9151c95a6ea94c6864e5c8511ab4 Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Mon, 22 Jun 2026 09:29:27 +0000 Subject: [PATCH 28/42] test: add IcBoundaryManager unit tests --- rs/orchestrator/src/processes.rs | 222 +++++++++++++++++++++++++++++++ 1 file changed, 222 insertions(+) diff --git a/rs/orchestrator/src/processes.rs b/rs/orchestrator/src/processes.rs index d6951ddb196c..f9ccd1ff0105 100644 --- a/rs/orchestrator/src/processes.rs +++ b/rs/orchestrator/src/processes.rs @@ -316,6 +316,21 @@ impl IcBoundaryManager { } } + // Used in tests to inject a mock ProcessManager. 
+ #[cfg(test)] + pub(crate) fn new_for_test( + inner: ProcessManager, + registry: Arc, + logger: ReplicaLogger, + ) -> Self { + Self { + inner, + registry, + current_domain_name: None, + logger, + } + } + // TODO: consider returning an error when things fail instead of logging pub(crate) fn ensure_ic_boundary_running_and_restarted_on_domain_change( &mut self, @@ -505,3 +520,210 @@ impl MultipleProcessesManager { Ok(()) } } + +#[cfg(test)] +mod tests { + use super::*; + use ic_logger::no_op_logger; + use ic_metrics::MetricsRegistry; + use ic_registry_client_fake::FakeRegistryClient; + use ic_registry_client_helpers::node_operator::NodeRecord; + use ic_registry_keys::make_node_record_key; + use ic_registry_proto_data_provider::ProtoRegistryDataProvider; + use ic_test_utilities_types::ids::NODE_1; + use std::{path::Path, sync::Mutex}; + use tempfile::tempdir; + + const REPLICA_VERSION: &str = "replica_version_0.1"; + + /// Counters recorded by [`RecordingRunner`], so tests can assert whether + /// (and how often) the managed process was started/stopped. + #[derive(Default)] + struct RunnerLog { + running: bool, + starts: usize, + stops: usize, + } + + /// A `ProcessRunner` fake that records start/stop calls instead of spawning. + struct RecordingRunner { + log: Arc>, + } + + impl ProcessRunner

for RecordingRunner { + fn start(&mut self, _process: P) -> std::io::Result<()> { + let mut log = self.log.lock().unwrap(); + log.running = true; + log.starts += 1; + Ok(()) + } + + fn stop(&mut self) -> std::io::Result<()> { + let mut log = self.log.lock().unwrap(); + log.running = false; + log.stops += 1; + Ok(()) + } + + fn is_running(&self) -> bool { + self.log.lock().unwrap().running + } + + fn get_pid(&self) -> Option { + self.log + .lock() + .unwrap() + .running + .then_some(Pid::from_raw(12345)) + } + } + + /// Builds a registry whose node record for `NODE_1` carries the given domain + /// at each listed registry version (`None` means "no domain"). + fn registry_with_node_domains(domains: &[(u64, Option<&str>)]) -> Arc { + let data_provider = Arc::new(ProtoRegistryDataProvider::new()); + for &(version, domain) in domains { + data_provider + .add( + &make_node_record_key(NODE_1), + RegistryVersion::from(version), + Some(NodeRecord { + domain: domain.map(str::to_string), + ..Default::default() + }), + ) + .unwrap(); + } + let registry_client = Arc::new(FakeRegistryClient::new(data_provider)); + registry_client.update_to_latest_version(); + Arc::new(RegistryHelper::new(NODE_1, registry_client, no_op_logger())) + } + + /// Builds an [`IcBoundaryManager`] backed by a [`RecordingRunner`], returning + /// the manager and a handle to the runner's log. 
+ fn ic_boundary_manager_for_test( + registry: Arc, + dir: &Path, + ) -> (IcBoundaryManager, Arc>) { + let log = Arc::new(Mutex::new(RunnerLog::default())); + let runner = Box::new(RecordingRunner { log: log.clone() }); + let env_file = dir.join("ic-boundary.env"); + std::fs::write(&env_file, b"TEST_KEY=TEST_VALUE").unwrap(); + let config = IcBoundaryProcessConfig { + ic_binary_dir: dir.to_path_buf(), + ic_boundary_env_file: env_file, + crypto_config: CryptoConfig::default(), + }; + let inner = ProcessManager::new_for_test( + runner, + config, + Arc::new(OrchestratorMetrics::new(&MetricsRegistry::new())), + no_op_logger(), + ); + let manager = IcBoundaryManager::new_for_test(inner, registry, no_op_logger()); + (manager, log) + } + + fn ensure(manager: &mut IcBoundaryManager, registry_version: u64) { + manager.ensure_ic_boundary_running_and_restarted_on_domain_change( + ReplicaVersion::try_from(REPLICA_VERSION).unwrap(), + RegistryVersion::from(registry_version), + ); + } + + #[test] + fn ic_boundary_not_started_when_node_has_no_domain() { + let dir = tempdir().unwrap(); + let registry = registry_with_node_domains(&[(1, None)]); + let (mut manager, log) = ic_boundary_manager_for_test(registry, dir.path()); + + ensure(&mut manager, 1); + + let log = log.lock().unwrap(); + assert!(!log.running); + assert_eq!(log.starts, 0); + assert_eq!(log.stops, 0); + assert_eq!(manager.current_domain_name, None); + } + + #[test] + fn ic_boundary_starts_when_node_has_domain() { + let dir = tempdir().unwrap(); + let registry = registry_with_node_domains(&[(1, Some("api1.example.com"))]); + let (mut manager, log) = ic_boundary_manager_for_test(registry, dir.path()); + + ensure(&mut manager, 1); + + let log = log.lock().unwrap(); + assert!(log.running); + assert_eq!(log.starts, 1); + assert_eq!(log.stops, 0); + assert_eq!( + manager.current_domain_name.as_deref(), + Some("api1.example.com") + ); + } + + #[test] + fn ic_boundary_not_restarted_when_domain_unchanged() { + let dir = 
tempdir().unwrap(); + let registry = registry_with_node_domains(&[(1, Some("api1.example.com"))]); + let (mut manager, log) = ic_boundary_manager_for_test(registry, dir.path()); + + ensure(&mut manager, 1); + ensure(&mut manager, 1); + + let log = log.lock().unwrap(); + assert!(log.running); + // Started once on the first call; the second call must not restart it. + assert_eq!(log.starts, 1); + assert_eq!(log.stops, 0); + assert_eq!( + manager.current_domain_name.as_deref(), + Some("api1.example.com") + ); + } + + #[test] + fn ic_boundary_restarted_when_domain_changes() { + let dir = tempdir().unwrap(); + let registry = registry_with_node_domains(&[ + (1, Some("api1.example.com")), + (2, Some("api2.example.com")), + ]); + let (mut manager, log) = ic_boundary_manager_for_test(registry, dir.path()); + + ensure(&mut manager, 1); + ensure(&mut manager, 2); + + let log = log.lock().unwrap(); + assert!(log.running); + // Restart on domain change: stopped once, started twice. + assert_eq!(log.starts, 2); + assert_eq!(log.stops, 1); + assert_eq!( + manager.current_domain_name.as_deref(), + Some("api2.example.com") + ); + } + + #[test] + fn ic_boundary_stopped_when_domain_is_deleted() { + let dir = tempdir().unwrap(); + let registry = registry_with_node_domains(&[(1, Some("api1.example.com")), (2, None)]); + let (mut manager, log) = ic_boundary_manager_for_test(registry, dir.path()); + + // Running with a domain ... + ensure(&mut manager, 1); + assert!(log.lock().unwrap().running); + + // ... then the domain is removed: ic-boundary must be stopped. 
+ ensure(&mut manager, 2); + + let log = log.lock().unwrap(); + assert!(!log.running); + assert_eq!(log.starts, 1); + assert_eq!(log.stops, 1); + assert_eq!(manager.current_domain_name, None); + } +} From 9ab62e604783173c73b1157eaa9b7611112c4d54 Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Mon, 22 Jun 2026 12:53:49 +0000 Subject: [PATCH 29/42] docs --- rs/orchestrator/src/processes.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rs/orchestrator/src/processes.rs b/rs/orchestrator/src/processes.rs index f9ccd1ff0105..a47ff56e63fe 100644 --- a/rs/orchestrator/src/processes.rs +++ b/rs/orchestrator/src/processes.rs @@ -394,7 +394,7 @@ impl IcBoundaryManager { } // --------------------------------------------------------------------------- -// MultipleProcessManager +// MultipleProcessesManager // // This struct manages all processes that the upgrade loop is responsible for, // providing a single entry point for starting and stopping them according to From 45aeab234bbb1840cc9abc842416c44a8edf87b4 Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Mon, 22 Jun 2026 13:11:54 +0000 Subject: [PATCH 30/42] feat: gate stopping --- rs/orchestrator/src/processes.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/rs/orchestrator/src/processes.rs b/rs/orchestrator/src/processes.rs index a47ff56e63fe..8b0cf87663f2 100644 --- a/rs/orchestrator/src/processes.rs +++ b/rs/orchestrator/src/processes.rs @@ -514,7 +514,9 @@ impl MultipleProcessesManager { /// Stop every managed process. 
pub(crate) fn stop_all(&mut self) -> OrchestratorResult<()> { - self.ic_gateway_manager.stop()?; + if self.ic_gateway_launch_enabled { + self.ic_gateway_manager.stop()?; + } self.replica_manager.stop()?; Ok(()) From 5b993db15bcc69fbee99332bbaacf235c871a5ad Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Tue, 23 Jun 2026 14:14:35 +0000 Subject: [PATCH 31/42] docs: update `ic_binary_directory` docs --- rs/orchestrator/src/args.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rs/orchestrator/src/args.rs b/rs/orchestrator/src/args.rs index c3cc2f381da7..43e35f980dc5 100644 --- a/rs/orchestrator/src/args.rs +++ b/rs/orchestrator/src/args.rs @@ -36,7 +36,7 @@ pub struct OrchestratorArgs { pub(crate) ic_gateway_env_file: PathBuf, /// The path to the Replica binary location containing the following in case - /// of guest OS deployment: version.txt, manageboot.sh, replica, + /// of guest OS deployment: replica, ic-boundary, ic-gateway, manageboot.sh, /// install-upgrade.sh #[clap(long)] pub(crate) ic_binary_directory: PathBuf, From 55e2587a3d3ba797cd551afe9ee3d8ef7a709f1d Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Tue, 23 Jun 2026 14:19:03 +0000 Subject: [PATCH 32/42] feat: update current domain name only on successes --- rs/orchestrator/src/processes.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/rs/orchestrator/src/processes.rs b/rs/orchestrator/src/processes.rs index 8b0cf87663f2..e9ae21c2b07f 100644 --- a/rs/orchestrator/src/processes.rs +++ b/rs/orchestrator/src/processes.rs @@ -362,9 +362,11 @@ impl IcBoundaryManager { IcBoundaryProcess::NAME, err ); + } else { + // Only update the current domain name if we successfully started ic-boundary, + // so that we can retry on the next call. 
+ self.current_domain_name = Some(domain_name); } - - self.current_domain_name = Some(domain_name); } // ic-boundary should not start when the node doesn't have a domain name Ok(None) => { @@ -380,8 +382,12 @@ impl IcBoundaryManager { IcBoundaryProcess::NAME, err ); + } else { + // Only clear the current domain name if we successfully stopped ic-boundary, + // so that we correctly detect we should first retry to stop it in case we get + // a new domain name in a next call. + self.current_domain_name = None; } - self.current_domain_name = None; } // Failing to read the registry Err(err) => warn!(self.logger, "Failed to fetch domain name: {}", err), From a22c0ded19fde18394fa3bff16a2554a35a33b93 Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Tue, 23 Jun 2026 14:21:45 +0000 Subject: [PATCH 33/42] feat: assert prefix-freeness --- rs/tests/driver/src/driver/test_env_api.rs | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/rs/tests/driver/src/driver/test_env_api.rs b/rs/tests/driver/src/driver/test_env_api.rs index 338c5d29bc91..f999d143a4ce 100644 --- a/rs/tests/driver/src/driver/test_env_api.rs +++ b/rs/tests/driver/src/driver/test_env_api.rs @@ -1138,8 +1138,15 @@ impl IcNodeSnapshot { self.node_id ); for (name, value) in metrics { - // Assume the metrics to check are prefix-free. This allows to specify a metric name - // prefix to check all metrics with that prefix. + // First assert the metrics to check are prefix-free. This allows to specify a + // metric name prefix to check all metrics with that prefix. + assert!( + !metrics_to_check + .keys() + .any(|other_name| other_name != name && name.starts_with(*other_name)), + "The metric `{name}` is not prefix-free with respect to the other metrics to check. \ + This is not allowed. Please specify a prefix-free set of metrics to check." 
+ ); let max_value = metrics_to_check .iter() .find(|(metric_name, _)| name.starts_with(**metric_name)) From 8f3a7403ddd0239216816a7b025e9fa41029a00b Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Tue, 23 Jun 2026 14:23:23 +0000 Subject: [PATCH 34/42] fix: fix outdated log+docs --- rs/orchestrator/src/upgrade.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rs/orchestrator/src/upgrade.rs b/rs/orchestrator/src/upgrade.rs index c349e175540f..70e594ed1740 100644 --- a/rs/orchestrator/src/upgrade.rs +++ b/rs/orchestrator/src/upgrade.rs @@ -429,9 +429,9 @@ impl Upgrade { .await .map_err(OrchestratorError::FileDownloadError)?; if let Err(e) = self.stop_children() { - // Even though we fail to stop the replica, we should still + // Even though we fail to stop child processes, we should still // replace the registry local store, so we simply issue a warning. - warn!(self.logger, "Failed to stop replica with error {:?}", e); + warn!(self.logger, "Failed to stop children with error {:?}.", e); } let new_local_store = LocalStoreImpl::new(local_store_location); self.registry_replicator From d360dacdabdad418a4f75a985fc0dae7069ceb02 Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Tue, 23 Jun 2026 15:13:02 +0000 Subject: [PATCH 35/42] fix: period --- rs/orchestrator/src/upgrade.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rs/orchestrator/src/upgrade.rs b/rs/orchestrator/src/upgrade.rs index 70e594ed1740..1317fb7da090 100644 --- a/rs/orchestrator/src/upgrade.rs +++ b/rs/orchestrator/src/upgrade.rs @@ -431,7 +431,7 @@ impl Upgrade { if let Err(e) = self.stop_children() { // Even though we fail to stop child processes, we should still // replace the registry local store, so we simply issue a warning. 
- warn!(self.logger, "Failed to stop children with error {:?}.", e); + warn!(self.logger, "Failed to stop children with error {:?}", e); } let new_local_store = LocalStoreImpl::new(local_store_location); self.registry_replicator From 8119f94b887afddec24fcd7140f6b8d8e58fda32 Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Wed, 24 Jun 2026 07:19:11 +0000 Subject: [PATCH 36/42] fix: clippy --- rs/tests/driver/src/driver/test_env_api.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rs/tests/driver/src/driver/test_env_api.rs b/rs/tests/driver/src/driver/test_env_api.rs index f999d143a4ce..977068f2bae0 100644 --- a/rs/tests/driver/src/driver/test_env_api.rs +++ b/rs/tests/driver/src/driver/test_env_api.rs @@ -1143,7 +1143,7 @@ impl IcNodeSnapshot { assert!( !metrics_to_check .keys() - .any(|other_name| other_name != name && name.starts_with(*other_name)), + .any(|other_name| *other_name != name && name.starts_with(*other_name)), "The metric `{name}` is not prefix-free with respect to the other metrics to check. \ This is not allowed. Please specify a prefix-free set of metrics to check." ); From 90016fd2a79c9d855f7ff47eefc9f2cbe53c4437 Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Wed, 24 Jun 2026 07:31:55 +0000 Subject: [PATCH 37/42] fix: update current domain name only if both operations succeeded --- rs/orchestrator/src/processes.rs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/rs/orchestrator/src/processes.rs b/rs/orchestrator/src/processes.rs index e9ae21c2b07f..8eec243c0e5d 100644 --- a/rs/orchestrator/src/processes.rs +++ b/rs/orchestrator/src/processes.rs @@ -339,10 +339,12 @@ impl IcBoundaryManager { ) { match self.registry.get_node_domain_name(registry_version) { Ok(Some(domain_name)) => { + let mut success = true; // stop ic-boundary when the domain name changes and start it again. 
if Some(&domain_name) != self.current_domain_name.as_ref() && let Err(err) = self.inner.stop() { + success = false; warn!( self.logger, "Failed to stop {}: {}", @@ -356,15 +358,18 @@ impl IcBoundaryManager { .inner .ensure_running((replica_version, domain_name.clone())) { + success = false; warn!( self.logger, "Failed to start {}: {}", IcBoundaryProcess::NAME, err ); - } else { - // Only update the current domain name if we successfully started ic-boundary, - // so that we can retry on the next call. + } + + if success { + // Only update the current domain name if we performed the operations above + // successfully, so that we can retry on the next call if not. self.current_domain_name = Some(domain_name); } } From dc35b79886cef9365d073d08f2dcd97c1a5247b2 Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Wed, 24 Jun 2026 08:21:29 +0000 Subject: [PATCH 38/42] fix: metrics prefix-freeness assert --- rs/tests/driver/src/driver/test_env_api.rs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/rs/tests/driver/src/driver/test_env_api.rs b/rs/tests/driver/src/driver/test_env_api.rs index 977068f2bae0..dc19ba159345 100644 --- a/rs/tests/driver/src/driver/test_env_api.rs +++ b/rs/tests/driver/src/driver/test_env_api.rs @@ -1138,20 +1138,20 @@ impl IcNodeSnapshot { self.node_id ); for (name, value) in metrics { - // First assert the metrics to check are prefix-free. This allows to specify a - // metric name prefix to check all metrics with that prefix. + // Assert the metrics to check are prefix-free. This allows to specify a metric name + // prefix to check all metrics with that prefix. 
+ let mut metrics_to_check = metrics_to_check.iter(); + let max_value = metrics_to_check + .filter(|(metric_name, _)| name.starts_with(**metric_name)) + .map(|(_, max_value)| *max_value) + .next() + .unwrap_or_default(); assert!( - !metrics_to_check - .keys() - .any(|other_name| *other_name != name && name.starts_with(*other_name)), + metrics_to_check.count() == 0, "The metric `{name}` is not prefix-free with respect to the other metrics to check. \ This is not allowed. Please specify a prefix-free set of metrics to check." ); - let max_value = metrics_to_check - .iter() - .find(|(metric_name, _)| name.starts_with(**metric_name)) - .map(|(_, max_value)| *max_value) - .unwrap_or_default(); + assert!( value[0] <= max_value, "The metric `{name}` on node {} exceeded the maximum allowed value: \ From b9adca0dfd96ba3776f5c849431dc0e7c15f408e Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Wed, 24 Jun 2026 08:37:47 +0000 Subject: [PATCH 39/42] fix --- rs/tests/driver/src/driver/test_env_api.rs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/rs/tests/driver/src/driver/test_env_api.rs b/rs/tests/driver/src/driver/test_env_api.rs index dc19ba159345..07e5024c3bc5 100644 --- a/rs/tests/driver/src/driver/test_env_api.rs +++ b/rs/tests/driver/src/driver/test_env_api.rs @@ -1140,12 +1140,13 @@ impl IcNodeSnapshot { for (name, value) in metrics { // Assert the metrics to check are prefix-free. This allows to specify a metric name // prefix to check all metrics with that prefix. - let mut metrics_to_check = metrics_to_check.iter(); - let max_value = metrics_to_check + let mut metrics_to_check = metrics_to_check + .iter() .filter(|(metric_name, _)| name.starts_with(**metric_name)) - .map(|(_, max_value)| *max_value) - .next() - .unwrap_or_default(); + .map(|(_, max_value)| *max_value); + let max_value = metrics_to_check.next().unwrap_or_default(); + // Assert that the iterator only had one element, i.e. the metrics to check are + // prefix-free. 
assert!( metrics_to_check.count() == 0, "The metric `{name}` is not prefix-free with respect to the other metrics to check. \ From fe8ffaac9fca11776c708fd782510dbc2df8a095 Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Wed, 24 Jun 2026 09:17:34 +0000 Subject: [PATCH 40/42] refactor: return early on error + return errors logged by caller --- rs/orchestrator/src/boundary_node.rs | 11 ++- rs/orchestrator/src/error.rs | 9 ++ rs/orchestrator/src/processes.rs | 124 ++++++++++--------------- rs/orchestrator/src/registry_helper.rs | 4 +- 4 files changed, 68 insertions(+), 80 deletions(-) diff --git a/rs/orchestrator/src/boundary_node.rs b/rs/orchestrator/src/boundary_node.rs index 314fa312ebcc..8846c9f4630d 100644 --- a/rs/orchestrator/src/boundary_node.rs +++ b/rs/orchestrator/src/boundary_node.rs @@ -50,11 +50,20 @@ impl BoundaryNodeManager { // NOTE: We could also shutdown the boundary node here. However, it makes sense to continue // serving requests while the orchestrator is downloading the new image in most cases. } else { - self.process_manager + if let Err(err) = self + .process_manager .ensure_ic_boundary_running_and_restarted_on_domain_change( self.version.clone(), registry_version, + ) + { + warn!( + self.logger, + "Failed to ensure {} is running: {}", + IcBoundaryProcess::NAME, + err ); + } } } // BN should not be active diff --git a/rs/orchestrator/src/error.rs b/rs/orchestrator/src/error.rs index a85282545790..39de413921ec 100644 --- a/rs/orchestrator/src/error.rs +++ b/rs/orchestrator/src/error.rs @@ -78,6 +78,9 @@ pub(crate) enum OrchestratorError { /// An error occurred when trying to get the role (Api boundary node, replica, ...) of the node /// at the given registry version. RoleError(String, RegistryVersion), + + /// The given node is missing a domain name at the given registry version. 
+ DomainNameMissingError(NodeId, RegistryVersion), } impl OrchestratorError { @@ -168,6 +171,12 @@ impl fmt::Display for OrchestratorError { "Failed to get the role of the node at the registry version {registry_version}: {msg}" ) } + OrchestratorError::DomainNameMissingError(node_id, registry_version) => { + write!( + f, + "Node {node_id} does not have an associated domain name at registry version {registry_version}" + ) + } } } } diff --git a/rs/orchestrator/src/processes.rs b/rs/orchestrator/src/processes.rs index 8eec243c0e5d..ce8aa5ad3578 100644 --- a/rs/orchestrator/src/processes.rs +++ b/rs/orchestrator/src/processes.rs @@ -5,7 +5,7 @@ use crate::{ registry_helper::RegistryHelper, }; use ic_config::crypto::CryptoConfig; -use ic_logger::{ReplicaLogger, info, warn}; +use ic_logger::{ReplicaLogger, info}; use ic_protobuf::registry::subnet::v1::SubnetType; use ic_types::{RegistryVersion, ReplicaVersion, SubnetId}; use nix::unistd::Pid; @@ -297,7 +297,6 @@ pub(crate) struct IcBoundaryManager { inner: ProcessManager, registry: Arc, current_domain_name: Option, - logger: ReplicaLogger, } impl IcBoundaryManager { @@ -307,12 +306,11 @@ impl IcBoundaryManager { metrics: Arc, logger: ReplicaLogger, ) -> Self { - let inner = ProcessManager::new(config, metrics, logger.clone()); + let inner = ProcessManager::new(config, metrics, logger); Self { inner, registry, current_domain_name: None, - logger, } } @@ -321,82 +319,47 @@ impl IcBoundaryManager { pub(crate) fn new_for_test( inner: ProcessManager, registry: Arc, - logger: ReplicaLogger, ) -> Self { Self { inner, registry, current_domain_name: None, - logger, } } - // TODO: consider returning an error when things fail instead of logging pub(crate) fn ensure_ic_boundary_running_and_restarted_on_domain_change( &mut self, replica_version: ReplicaVersion, registry_version: RegistryVersion, - ) { - match self.registry.get_node_domain_name(registry_version) { - Ok(Some(domain_name)) => { - let mut success = true; - // stop 
ic-boundary when the domain name changes and start it again. - if Some(&domain_name) != self.current_domain_name.as_ref() - && let Err(err) = self.inner.stop() - { - success = false; - warn!( - self.logger, - "Failed to stop {}: {}", - IcBoundaryProcess::NAME, - err - ); - } - - // make sure ic-boundary is running - if let Err(err) = self - .inner - .ensure_running((replica_version, domain_name.clone())) - { - success = false; - warn!( - self.logger, - "Failed to start {}: {}", - IcBoundaryProcess::NAME, - err - ); - } - - if success { - // Only update the current domain name if we performed the operations above - // successfully, so that we can retry on the next call if not. - self.current_domain_name = Some(domain_name); - } - } - // ic-boundary should not start when the node doesn't have a domain name - Ok(None) => { - warn!( - self.logger, - "There is no domain associated with the node, while this is a requirement for the API boundary node. Shutting {} down.", - IcBoundaryProcess::NAME - ); - if let Err(err) = self.inner.stop() { - warn!( - self.logger, - "Failed to stop {}: {}", - IcBoundaryProcess::NAME, - err - ); - } else { - // Only clear the current domain name if we successfully stopped ic-boundary, - // so that we correctly detect we should first retry to stop it in case we get - // a new domain name in a next call. - self.current_domain_name = None; - } + ) -> OrchestratorResult<()> { + let domain_name = match self.registry.get_node_domain_name(registry_version) { + Ok(domain_name) => domain_name, + Err(err @ OrchestratorError::DomainNameMissingError(_, _)) => { + // ic-boundary should not start when the node doesn't have a domain name + self.inner.stop()?; + + // Only clear the current domain name if we successfully stopped ic-boundary, so + // that we correctly detect we should first retry to stop it in case we get a new + // domain name in a next call. 
+ self.current_domain_name = None; + return Err(err); } - // Failing to read the registry - Err(err) => warn!(self.logger, "Failed to fetch domain name: {}", err), + Err(err) => return Err(err), + }; + + // stop ic-boundary when the domain name changes and start it again. + if Some(&domain_name) != self.current_domain_name.as_ref() { + self.inner.stop()?; } + + // make sure ic-boundary is running + self.inner + .ensure_running((replica_version, domain_name.clone()))?; + + // Only update the current domain name if we performed the operations above successfully, + // so that we can retry on the next call if not. + self.current_domain_name = Some(domain_name); + Ok(()) } pub(crate) fn stop(&mut self) -> OrchestratorResult<()> { @@ -537,6 +500,7 @@ impl MultipleProcessesManager { #[cfg(test)] mod tests { use super::*; + use assert_matches::assert_matches; use ic_logger::no_op_logger; use ic_metrics::MetricsRegistry; use ic_registry_client_fake::FakeRegistryClient; @@ -633,15 +597,15 @@ mod tests { Arc::new(OrchestratorMetrics::new(&MetricsRegistry::new())), no_op_logger(), ); - let manager = IcBoundaryManager::new_for_test(inner, registry, no_op_logger()); + let manager = IcBoundaryManager::new_for_test(inner, registry); (manager, log) } - fn ensure(manager: &mut IcBoundaryManager, registry_version: u64) { + fn ensure(manager: &mut IcBoundaryManager, registry_version: u64) -> OrchestratorResult<()> { manager.ensure_ic_boundary_running_and_restarted_on_domain_change( ReplicaVersion::try_from(REPLICA_VERSION).unwrap(), RegistryVersion::from(registry_version), - ); + ) } #[test] @@ -650,7 +614,10 @@ mod tests { let registry = registry_with_node_domains(&[(1, None)]); let (mut manager, log) = ic_boundary_manager_for_test(registry, dir.path()); - ensure(&mut manager, 1); + assert_matches!( + ensure(&mut manager, 1), + Err(OrchestratorError::DomainNameMissingError(_, _)) + ); let log = log.lock().unwrap(); assert!(!log.running); @@ -665,7 +632,7 @@ mod tests { let registry = 
registry_with_node_domains(&[(1, Some("api1.example.com"))]); let (mut manager, log) = ic_boundary_manager_for_test(registry, dir.path()); - ensure(&mut manager, 1); + ensure(&mut manager, 1).expect("ic-boundary should have started successfully"); let log = log.lock().unwrap(); assert!(log.running); @@ -683,8 +650,8 @@ mod tests { let registry = registry_with_node_domains(&[(1, Some("api1.example.com"))]); let (mut manager, log) = ic_boundary_manager_for_test(registry, dir.path()); - ensure(&mut manager, 1); - ensure(&mut manager, 1); + ensure(&mut manager, 1).expect("ic-boundary should have started successfully"); + ensure(&mut manager, 1).expect("ic-boundary should have started successfully"); let log = log.lock().unwrap(); assert!(log.running); @@ -706,8 +673,8 @@ mod tests { ]); let (mut manager, log) = ic_boundary_manager_for_test(registry, dir.path()); - ensure(&mut manager, 1); - ensure(&mut manager, 2); + ensure(&mut manager, 1).expect("ic-boundary should have started successfully"); + ensure(&mut manager, 2).expect("ic-boundary should have started successfully"); let log = log.lock().unwrap(); assert!(log.running); @@ -727,11 +694,14 @@ mod tests { let (mut manager, log) = ic_boundary_manager_for_test(registry, dir.path()); // Running with a domain ... - ensure(&mut manager, 1); + ensure(&mut manager, 1).expect("ic-boundary should have started successfully"); assert!(log.lock().unwrap().running); // ... then the domain is removed: ic-boundary must be stopped. 
- ensure(&mut manager, 2); + assert_matches!( + ensure(&mut manager, 2), + Err(OrchestratorError::DomainNameMissingError(_, _)) + ); let log = log.lock().unwrap(); assert!(!log.running); diff --git a/rs/orchestrator/src/registry_helper.rs b/rs/orchestrator/src/registry_helper.rs index ee807b597c41..dabc18b6ece8 100644 --- a/rs/orchestrator/src/registry_helper.rs +++ b/rs/orchestrator/src/registry_helper.rs @@ -385,11 +385,11 @@ impl RegistryHelper { pub(crate) fn get_node_domain_name( &self, version: RegistryVersion, - ) -> OrchestratorResult> { + ) -> OrchestratorResult { let result = self .registry_client .get_node_record(self.node_id, version)? .and_then(|node_record| node_record.domain); - Ok(result) + result.ok_or_else(|| OrchestratorError::DomainNameMissingError(self.node_id, version)) } } From db83ccf8a44fa1326d9e14ca90be98724bc1f367 Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Wed, 24 Jun 2026 09:33:50 +0000 Subject: [PATCH 41/42] style: clippy --- rs/orchestrator/src/boundary_node.rs | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/rs/orchestrator/src/boundary_node.rs b/rs/orchestrator/src/boundary_node.rs index 8846c9f4630d..0394d218c517 100644 --- a/rs/orchestrator/src/boundary_node.rs +++ b/rs/orchestrator/src/boundary_node.rs @@ -49,21 +49,19 @@ impl BoundaryNodeManager { ); // NOTE: We could also shutdown the boundary node here. However, it makes sense to continue // serving requests while the orchestrator is downloading the new image in most cases. 
- } else { - if let Err(err) = self - .process_manager - .ensure_ic_boundary_running_and_restarted_on_domain_change( - self.version.clone(), - registry_version, - ) - { - warn!( - self.logger, - "Failed to ensure {} is running: {}", - IcBoundaryProcess::NAME, - err - ); - } + } else if let Err(err) = self + .process_manager + .ensure_ic_boundary_running_and_restarted_on_domain_change( + self.version.clone(), + registry_version, + ) + { + warn!( + self.logger, + "Failed to ensure {} is running: {}", + IcBoundaryProcess::NAME, + err + ); } } // BN should not be active From 748ee21b9cd04a9c24580eacee9cc95ab31a4706 Mon Sep 17 00:00:00 2001 From: Pierugo Pace Date: Wed, 24 Jun 2026 10:59:36 +0000 Subject: [PATCH 42/42] feat: always start/stop all processes despite errors --- rs/orchestrator/src/processes.rs | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/rs/orchestrator/src/processes.rs b/rs/orchestrator/src/processes.rs index ce8aa5ad3578..68b375c05f50 100644 --- a/rs/orchestrator/src/processes.rs +++ b/rs/orchestrator/src/processes.rs @@ -448,6 +448,7 @@ impl MultipleProcessesManager { } /// Start all processes appropriate for this node. + /// If a process fails to start, continue starting the others and return the first error. /// /// Always starts the replica. For cloud-engine subnet nodes it also /// starts ic-gateway. @@ -457,8 +458,11 @@ impl MultipleProcessesManager { subnet_id: SubnetId, registry_version: RegistryVersion, ) -> OrchestratorResult<()> { - self.replica_manager - .ensure_running((replica_version.clone(), subnet_id))?; + let mut result = Ok(()); + result = result.and( + self.replica_manager + .ensure_running((replica_version.clone(), subnet_id)), + ); // Cloud-engine nodes run ic-gateway as a sidecar, but only once the // launch is enabled (see `IC_GATEWAY_LAUNCH_ENABLED`). 
Until then, @@ -470,15 +474,15 @@ impl MultipleProcessesManager { | Some(SubnetType::Application) | Some(SubnetType::System) | Some(SubnetType::VerifiedApplication) => { - self.ic_gateway_manager.stop()?; + result = result.and(self.ic_gateway_manager.stop()); } Some(SubnetType::CloudEngine) => { - self.ic_gateway_manager.ensure_running(replica_version)?; + result = result.and(self.ic_gateway_manager.ensure_running(replica_version)); } } } - Ok(()) + result } /// Stop the replica process. @@ -486,14 +490,16 @@ impl MultipleProcessesManager { self.replica_manager.stop() } - /// Stop every managed process. + /// Stop every managed process in reverse order of startup + /// If a process fails to stop, continue stopping the others and return the first error. pub(crate) fn stop_all(&mut self) -> OrchestratorResult<()> { + let mut result = Ok(()); if self.ic_gateway_launch_enabled { - self.ic_gateway_manager.stop()?; + result = result.and(self.ic_gateway_manager.stop()); } - self.replica_manager.stop()?; + result = result.and(self.replica_manager.stop()); - Ok(()) + result } }