From 98e9742dc7ce3412b8eeda1c26b2ca24e0efa0cf Mon Sep 17 00:00:00 2001 From: Zeeshan Lakhani Date: Wed, 25 Feb 2026 00:35:22 +0000 Subject: [PATCH] [multicast] M2P forwarding, OPTE port subscription, and sled-agent propagation Complete the multicast data path by adding per-sled M2P (multicast-to- physical) mapping, forwarding entry management, and OPTE port subscription for multicast group members. Sled-agent: - Add `multicast_subscribe` / `multicast_unsubscribe` endpoints (API v29) that configure M2P, forwarding, and OPTE port subscription for a VMM - OPTE port_manager gains set/clear operations for M2P and forwarding - Port subscription cleanup on PortTicket release Nexus: - New `sled.rs` (MulticastSledClient) encapsulating all sled-agent multicast interactions: subscribe/unsubscribe, M2P/forwarding propagation and teardown - Groups RPW propagates M2P and forwarding entries to all member sleds after DPD configuration, with convergent retry on failure - Members RPW uses MemberReconcileCtx to thread shared reconciliation state. 
This handles subscribe on join, unsubscribe on leave, and re-subscribe on migration - Dataplane client updated for bifurcated replication groups Tests: - Integration tests for M2P/forwarding/subscribe lifecycle - Instance migration multicast re-convergence --- Cargo.lock | 68 +- Cargo.toml | 4 +- common/src/api/internal/shared/mod.rs | 115 ++ illumos-utils/src/opte/illumos.rs | 15 + illumos-utils/src/opte/mod.rs | 2 +- illumos-utils/src/opte/non_illumos.rs | 132 ++- illumos-utils/src/opte/port_manager.rs | 799 +++++++++++-- .../src/test_util/host_phase_2_test_state.rs | 82 +- .../app/background/tasks/multicast/groups.rs | 228 +++- .../app/background/tasks/multicast/members.rs | 867 ++++++++------ .../src/app/background/tasks/multicast/mod.rs | 24 +- nexus/src/app/multicast/dataplane.rs | 201 ++-- nexus/src/app/multicast/mod.rs | 29 + nexus/src/app/multicast/sled.rs | 560 +++++++++ .../integration_tests/multicast/instances.rs | 152 ++- .../tests/integration_tests/multicast/mod.rs | 41 +- .../multicast/networking_integration.rs | 1003 ++++++++++++++++- .../sled-agent-28.0.0-415efe.json.gitstub | 1 + ...efe.json => sled-agent-29.0.0-0f4904.json} | 370 +++++- openapi/sled-agent/sled-agent-latest.json | 2 +- sled-agent/api/src/lib.rs | 132 ++- sled-agent/src/http_entrypoints.rs | 99 +- sled-agent/src/instance.rs | 143 +-- sled-agent/src/instance_manager.rs | 48 +- sled-agent/src/probe_manager.rs | 1 + sled-agent/src/services.rs | 1 + sled-agent/src/sim/collection.rs | 13 +- sled-agent/src/sim/http_entrypoints.rs | 106 +- sled-agent/src/sim/sled_agent.rs | 62 +- sled-agent/src/sled_agent.rs | 65 +- workspace-hack/Cargo.toml | 4 +- 31 files changed, 4425 insertions(+), 944 deletions(-) create mode 100644 nexus/src/app/multicast/sled.rs create mode 100644 openapi/sled-agent/sled-agent-28.0.0-415efe.json.gitstub rename openapi/sled-agent/{sled-agent-28.0.0-415efe.json => sled-agent-29.0.0-0f4904.json} (96%) diff --git a/Cargo.lock b/Cargo.lock index 5a53cbff2e9..61561042c2d 
100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1681,7 +1681,7 @@ version = "3.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "faf9468729b8cbcea668e36183cb69d317348c2e08e994829fb56ebfdfbaac34" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -2897,7 +2897,7 @@ dependencies = [ [[package]] name = "dlpi" version = "0.2.0" -source = "git+https://github.com/oxidecomputer/dlpi-sys#d9645f8d61187e76384474b1100f6537fb644993" +source = "git+https://github.com/oxidecomputer/dlpi-sys#7cce2d3ab9dcac909642e1d1060f27bb2549cfdc" dependencies = [ "libc", "libdlpi-sys", @@ -3506,7 +3506,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -4965,7 +4965,7 @@ dependencies = [ "libc", "percent-encoding", "pin-project-lite", - "socket2 0.6.2", + "socket2 0.5.10", "system-configuration", "tokio", "tower-layer", @@ -5160,7 +5160,7 @@ dependencies = [ [[package]] name = "illumos-sys-hdrs" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/opte?rev=e547d07b08c3f3d6c821c9eb7a958adcffce6e56#e547d07b08c3f3d6c821c9eb7a958adcffce6e56" +source = "git+https://github.com/oxidecomputer/opte?branch=zl%2Ffilter-mcast-srcs#c570ac2126dbbebbd8e98e73b580c5be6b7e460e" dependencies = [ "bitflags 2.11.0", ] @@ -5629,7 +5629,7 @@ checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46" dependencies = [ "hermit-abi 0.5.2", "libc", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -5706,7 +5706,7 @@ dependencies = [ "portable-atomic", "portable-atomic-util", "serde_core", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -5841,7 +5841,7 @@ dependencies = [ [[package]] name = "kstat-macro" version = "0.1.0" -source = 
"git+https://github.com/oxidecomputer/opte?rev=e547d07b08c3f3d6c821c9eb7a958adcffce6e56#e547d07b08c3f3d6c821c9eb7a958adcffce6e56" +source = "git+https://github.com/oxidecomputer/opte?branch=zl%2Ffilter-mcast-srcs#c570ac2126dbbebbd8e98e73b580c5be6b7e460e" dependencies = [ "quote", "syn 2.0.117", @@ -5915,7 +5915,7 @@ checksum = "bcc35a38544a891a5f7c865aca548a982ccb3b8650a5b06d0fd33a10283c56fc" [[package]] name = "libdlpi-sys" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/dlpi-sys#d9645f8d61187e76384474b1100f6537fb644993" +source = "git+https://github.com/oxidecomputer/dlpi-sys#7cce2d3ab9dcac909642e1d1060f27bb2549cfdc" [[package]] name = "libefi-illumos" @@ -6037,7 +6037,7 @@ checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" [[package]] name = "libnet" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/netadm-sys?branch=main#1e40efd8497973ef3b7d6f0285510424f53f43c5" +source = "git+https://github.com/oxidecomputer/netadm-sys?branch=main#c1d9f7474d6db11c3bec6fb329abdd35985311f8" dependencies = [ "anyhow", "cfg-if", @@ -6053,7 +6053,7 @@ dependencies = [ "socket2 0.6.2", "thiserror 2.0.18", "tracing", - "winnow 0.7.14", + "winnow 0.7.15", ] [[package]] @@ -7851,7 +7851,7 @@ version = "0.50.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -9327,7 +9327,7 @@ dependencies = [ "uuid", "vergen", "vergen-lib", - "winnow 0.7.14", + "winnow 0.7.15", "x509-cert", "zerocopy 0.8.40", "zeroize", @@ -9472,7 +9472,7 @@ dependencies = [ [[package]] name = "opte" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/opte?rev=e547d07b08c3f3d6c821c9eb7a958adcffce6e56#e547d07b08c3f3d6c821c9eb7a958adcffce6e56" +source = "git+https://github.com/oxidecomputer/opte?branch=zl%2Ffilter-mcast-srcs#c570ac2126dbbebbd8e98e73b580c5be6b7e460e" 
dependencies = [ "bitflags 2.11.0", "dyn-clone", @@ -9491,7 +9491,7 @@ dependencies = [ [[package]] name = "opte-api" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/opte?rev=e547d07b08c3f3d6c821c9eb7a958adcffce6e56#e547d07b08c3f3d6c821c9eb7a958adcffce6e56" +source = "git+https://github.com/oxidecomputer/opte?branch=zl%2Ffilter-mcast-srcs#c570ac2126dbbebbd8e98e73b580c5be6b7e460e" dependencies = [ "illumos-sys-hdrs", "ingot", @@ -9504,7 +9504,7 @@ dependencies = [ [[package]] name = "opte-ioctl" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/opte?rev=e547d07b08c3f3d6c821c9eb7a958adcffce6e56#e547d07b08c3f3d6c821c9eb7a958adcffce6e56" +source = "git+https://github.com/oxidecomputer/opte?branch=zl%2Ffilter-mcast-srcs#c570ac2126dbbebbd8e98e73b580c5be6b7e460e" dependencies = [ "libc", "libnet", @@ -9601,7 +9601,7 @@ dependencies = [ [[package]] name = "oxide-vpc" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/opte?rev=e547d07b08c3f3d6c821c9eb7a958adcffce6e56#e547d07b08c3f3d6c821c9eb7a958adcffce6e56" +source = "git+https://github.com/oxidecomputer/opte?branch=zl%2Ffilter-mcast-srcs#c570ac2126dbbebbd8e98e73b580c5be6b7e460e" dependencies = [ "cfg-if", "illumos-sys-hdrs", @@ -10457,7 +10457,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9cd31dcfdbbd7431a807ef4df6edd6473228e94d5c805e8cf671227a21bad068" dependencies = [ "anyhow", - "itertools 0.14.0", + "itertools 0.12.1", "proc-macro2", "quote", "rand 0.8.5", @@ -11329,7 +11329,7 @@ dependencies = [ "quinn-udp", "rustc-hash", "rustls 0.23.37", - "socket2 0.6.2", + "socket2 0.5.10", "thiserror 2.0.18", "tokio", "tracing", @@ -11367,9 +11367,9 @@ dependencies = [ "cfg_aliases 0.2.1", "libc", "once_cell", - "socket2 0.6.2", + "socket2 0.5.10", "tracing", - "windows-sys 0.60.2", + "windows-sys 0.52.0", ] [[package]] @@ -12284,7 +12284,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys 0.11.0", - "windows-sys 0.61.2", + "windows-sys 
0.52.0", ] [[package]] @@ -12387,7 +12387,7 @@ dependencies = [ "security-framework", "security-framework-sys", "webpki-root-certs", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -13825,7 +13825,7 @@ version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c1c97747dbf44bb1ca44a561ece23508e99cb592e862f22222dcf42f51d1e451" dependencies = [ - "heck 0.5.0", + "heck 0.4.1", "proc-macro2", "quote", "syn 2.0.117", @@ -14462,7 +14462,7 @@ dependencies = [ "getrandom 0.4.1", "once_cell", "rustix 1.1.3", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -14482,7 +14482,7 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d8c27177b12a6399ffc08b98f76f7c9a1f4fe9fc967c784c5a071fa8d93cf7e1" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -14995,7 +14995,7 @@ dependencies = [ "toml_datetime 0.7.5+spec-1.1.0", "toml_parser", "toml_writer", - "winnow 0.7.14", + "winnow 0.7.15", ] [[package]] @@ -15010,7 +15010,7 @@ dependencies = [ "toml_datetime 1.0.0+spec-1.1.0", "toml_parser", "toml_writer", - "winnow 0.7.14", + "winnow 0.7.15", ] [[package]] @@ -15064,7 +15064,7 @@ dependencies = [ "serde_spanned 0.6.9", "toml_datetime 0.6.11", "toml_write", - "winnow 0.7.14", + "winnow 0.7.15", ] [[package]] @@ -15076,7 +15076,7 @@ dependencies = [ "indexmap 2.13.0", "toml_datetime 0.7.5+spec-1.1.0", "toml_parser", - "winnow 0.7.14", + "winnow 0.7.15", ] [[package]] @@ -15085,7 +15085,7 @@ version = "1.0.9+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "702d4415e08923e7e1ef96cd5727c0dfed80b4d2fa25db9647fe5eb6f7c5a4c4" dependencies = [ - "winnow 0.7.14", + "winnow 0.7.15", ] [[package]] @@ -16799,7 +16799,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 
0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -17195,9 +17195,9 @@ dependencies = [ [[package]] name = "winnow" -version = "0.7.14" +version = "0.7.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a5364e9d77fcdeeaa6062ced926ee3381faa2ee02d3eb83a5c27a8825540829" +checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945" dependencies = [ "memchr", ] diff --git a/Cargo.toml b/Cargo.toml index 06f84bee59f..a6decc23d23 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -648,7 +648,7 @@ omicron-workspace-hack = "0.1.0" omicron-zone-package = "0.12.2" oxide-client = { path = "clients/oxide-client" } oxide-tokio-rt = "0.1.2" -oxide-vpc = { git = "https://github.com/oxidecomputer/opte", rev = "e547d07b08c3f3d6c821c9eb7a958adcffce6e56", features = [ "api", "std" ] } +oxide-vpc = { git = "https://github.com/oxidecomputer/opte", branch = "zl/filter-mcast-srcs", features = [ "api", "std" ] } oxlog = { path = "dev-tools/oxlog" } oxnet = "0.1.4" once_cell = "1.21.3" @@ -657,7 +657,7 @@ openapiv3 = "2.2.0" # must match samael's crate! openssl = "0.10" openssl-sys = "0.9" -opte-ioctl = { git = "https://github.com/oxidecomputer/opte", rev = "e547d07b08c3f3d6c821c9eb7a958adcffce6e56" } +opte-ioctl = { git = "https://github.com/oxidecomputer/opte", branch = "zl/filter-mcast-srcs" } oso = "0.27" owo-colors = "4.2.2" oximeter = { path = "oximeter/oximeter" } diff --git a/common/src/api/internal/shared/mod.rs b/common/src/api/internal/shared/mod.rs index ac6cf5e0bda..771ad615bf9 100644 --- a/common/src/api/internal/shared/mod.rs +++ b/common/src/api/internal/shared/mod.rs @@ -164,6 +164,121 @@ pub struct VirtualNetworkInterfaceHost { pub vni: external::Vni, } +/// Mapping from an overlay multicast group to an underlay multicast +/// address. +/// +/// The underlay address must be within `UNDERLAY_MULTICAST_SUBNET` +/// (ff04::/64, a subset of admin-local scope per [RFC 7346]). 
This +/// invariant is enforced by mapping in Nexus, not validated at this layer. +/// +/// [RFC 7346]: https://www.rfc-editor.org/rfc/rfc7346 +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] +pub struct Mcast2PhysMapping { + /// Overlay multicast group address. + pub group: IpAddr, + /// Underlay IPv6 multicast address (ff04::/64). + pub underlay: Ipv6Addr, +} + +/// Clear a mapping from an overlay multicast group to an underlay +/// multicast address. +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] +pub struct ClearMcast2Phys { + /// Overlay multicast group address. + pub group: IpAddr, + /// Underlay IPv6 multicast address. See [`Mcast2PhysMapping::underlay`]. + pub underlay: Ipv6Addr, +} + +/// Forwarding entry for an underlay multicast address, specifying +/// which next hops should receive replicated packets. +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] +pub struct McastForwardingEntry { + /// Underlay IPv6 multicast address. See [`Mcast2PhysMapping::underlay`]. + pub underlay: Ipv6Addr, + /// Next hops with replication and source filter configuration. + pub next_hops: Vec, +} + +/// Clear all forwarding entries for an underlay multicast address. +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] +pub struct ClearMcastForwarding { + /// Underlay IPv6 multicast address. See [`Mcast2PhysMapping::underlay`]. + pub underlay: Ipv6Addr, +} + +/// A forwarding next hop with replication mode and aggregated +/// source filter. +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq)] +pub struct McastForwardingNextHop { + /// Unicast IPv6 address of the destination sled. + pub next_hop: Ipv6Addr, + /// Replication mode for this next hop. + pub replication: McastReplication, + /// Aggregated source filter for this destination. + pub filter: McastSourceFilter, +} + +/// Replication mode for multicast forwarding. 
+#[derive( + Clone, Copy, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq, +)] +#[serde(rename_all = "snake_case")] +pub enum McastReplication { + /// Replicate to front panel ports (egress to external networks). + External, + /// Replicate to sled underlay ports. + Underlay, + /// Replicate to both external and underlay ports. + Both, +} + +/// Source filter for multicast forwarding. +#[derive( + Clone, Debug, Default, Deserialize, Serialize, JsonSchema, PartialEq, +)] +pub struct McastSourceFilter { + /// Filter mode. + pub mode: McastFilterMode, + /// Source addresses to include or exclude. + pub sources: Vec, +} + +/// Filter mode for multicast source filtering. +#[derive( + Clone, + Copy, + Debug, + Default, + Deserialize, + Serialize, + JsonSchema, + PartialEq, + Eq, +)] +#[serde(rename_all = "snake_case")] +pub enum McastFilterMode { + /// Accept only packets from listed sources (SSM). + Include, + /// Accept packets from all sources except those listed. + /// With an empty sources list this is any-source multicast (ASM). + #[default] + Exclude, +} + +/// Declarative multicast group subscription for an OPTE port. +/// +/// Represents a single group membership with optional source filtering. +/// Empty `sources` means any-source multicast (ASM) and non-empty means +/// source-specific multicast (SSM). +#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq)] +pub struct MulticastGroupCfg { + /// The multicast group IP address (IPv4 or IPv6). + pub group_ip: IpAddr, + /// Source addresses for source-filtered multicast. 
+ pub sources: Vec, +} + /// DHCP configuration for a port /// /// Not present here: Hostname (DHCPv4 option 12; used in DHCPv6 option 39); we diff --git a/illumos-utils/src/opte/illumos.rs b/illumos-utils/src/opte/illumos.rs index c6d457c8460..b315c0536bc 100644 --- a/illumos-utils/src/opte/illumos.rs +++ b/illumos-utils/src/opte/illumos.rs @@ -70,6 +70,21 @@ pub enum Error { "Tried to update attached subnets on non-existent port ({0}, {1:?})" )] AttachedSubnetUpdateMissingPort(uuid::Uuid, NetworkInterfaceKind), + + #[error("Failed to set multicast-to-physical mapping: {0}")] + SetMcastM2p(String), + + #[error("Failed to clear multicast-to-physical mapping: {0}")] + ClearMcastM2p(String), + + #[error("Failed to set multicast forwarding: {0}")] + SetMcastFwd(String), + + #[error("Failed to clear multicast forwarding: {0}")] + ClearMcastFwd(String), + + #[error("Failed to list multicast forwarding: {0}")] + ListMcastFwd(String), } /// Delete all xde devices on the system. diff --git a/illumos-utils/src/opte/mod.rs b/illumos-utils/src/opte/mod.rs index 697b720ffb6..c0e290f8b69 100644 --- a/illumos-utils/src/opte/mod.rs +++ b/illumos-utils/src/opte/mod.rs @@ -19,6 +19,7 @@ mod port_manager; pub use firewall_rules::opte_firewall_rules; use macaddr::MacAddr6; use omicron_common::api::internal::shared; +pub use omicron_common::api::internal::shared::MulticastGroupCfg; use omicron_common::api::internal::shared::PrivateIpConfig; pub use oxide_vpc::api::BoundaryServices; pub use oxide_vpc::api::DhcpCfg; @@ -33,7 +34,6 @@ use oxnet::IpNet; use oxnet::Ipv4Net; use oxnet::Ipv6Net; pub use port::Port; -pub use port_manager::MulticastGroupCfg; pub use port_manager::PortCreateParams; pub use port_manager::PortManager; pub use port_manager::PortTicket; diff --git a/illumos-utils/src/opte/non_illumos.rs b/illumos-utils/src/opte/non_illumos.rs index ded56ac8945..ae471c6b3bf 100644 --- a/illumos-utils/src/opte/non_illumos.rs +++ b/illumos-utils/src/opte/non_illumos.rs @@ -2,26 +2,40 
@@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -//! Mock / dummy versions of the OPTE module, for non-illumos platforms +//! Mock / dummy versions of the OPTE module, for non-illumos platforms. +//! +//! Most methods are either `unimplemented!()` or silent no-ops. +//! Multicast subscribe/unsubscribe is an exception, as it maintains real +//! in-memory state because port manager tests assert on subscription contents. use crate::addrobj::AddrObject; use omicron_common::api::internal::shared::NetworkInterfaceKind; use oxide_vpc::api::AddRouterEntryReq; +use oxide_vpc::api::ClearMcast2PhysReq; +use oxide_vpc::api::ClearMcastForwardingReq; use oxide_vpc::api::ClearVirt2PhysReq; use oxide_vpc::api::DelRouterEntryReq; use oxide_vpc::api::DetachSubnetResp; use oxide_vpc::api::Direction; +use oxide_vpc::api::DumpMcast2PhysResp; +use oxide_vpc::api::DumpMcastForwardingResp; use oxide_vpc::api::DumpVirt2PhysResp; use oxide_vpc::api::IpCfg; use oxide_vpc::api::IpCidr; use oxide_vpc::api::ListPortsResp; +use oxide_vpc::api::McastSubscribeReq; +use oxide_vpc::api::McastUnsubscribeReq; use oxide_vpc::api::NoResp; use oxide_vpc::api::PortInfo; +use oxide_vpc::api::RemoveCidrResp; use oxide_vpc::api::RouterClass; use oxide_vpc::api::RouterTarget; use oxide_vpc::api::SetExternalIpsReq; use oxide_vpc::api::SetFwRulesReq; +use oxide_vpc::api::SetMcast2PhysReq; +use oxide_vpc::api::SetMcastForwardingReq; use oxide_vpc::api::SetVirt2PhysReq; +use oxide_vpc::api::SourceFilter; use oxide_vpc::api::VpcCfg; use slog::Logger; use std::collections::HashMap; @@ -76,6 +90,21 @@ pub enum Error { "Tried to update attached subnets on non-existent port ({0}, {1:?})" )] AttachedSubnetUpdateMissingPort(uuid::Uuid, NetworkInterfaceKind), + + #[error("Failed to set multicast-to-physical mapping: {0}")] + SetMcastM2p(String), + + #[error("Failed to clear multicast-to-physical mapping: {0}")] + ClearMcastM2p(String), 
+ + #[error("Failed to set multicast forwarding: {0}")] + SetMcastFwd(String), + + #[error("Failed to clear multicast forwarding: {0}")] + ClearMcastFwd(String), + + #[error("Failed to list multicast forwarding: {0}")] + ListMcastFwd(String), } pub fn initialize_xde_driver( @@ -172,6 +201,8 @@ pub(crate) struct PortData { pub port: PortInfo, /// The routes for this port. This simulates the router layer. pub routes: Vec, + /// Multicast group subscriptions: group IP → source filter. + pub mcast_subscriptions: HashMap, } #[derive(Debug)] @@ -237,7 +268,11 @@ impl Handle { return Err(OpteError::DuplicatePort(entry.key().to_string())); } Entry::Vacant(entry) => { - entry.insert(PortData { port, routes: Vec::new() }); + entry.insert(PortData { + port, + routes: Vec::new(), + mcast_subscriptions: HashMap::new(), + }); } } Ok(NO_RESPONSE) @@ -277,7 +312,59 @@ impl Handle { _: IpCidr, _: Direction, ) -> Result { - unimplemented!("Not yet used in tests") + Ok(NO_RESPONSE) + } + + /// Remove a CIDR allow rule from a port. + pub fn remove_cidr( + &self, + _: &str, + cidr: IpCidr, + _: Direction, + ) -> Result { + Ok(RemoveCidrResp::Ok(cidr)) + } + + /// Subscribe a port to a multicast group. + pub fn mcast_subscribe( + &self, + req: &McastSubscribeReq, + ) -> Result { + let mut inner = opte_state().lock().unwrap(); + let Some(port_data) = inner.ports.get_mut(&req.port_name) else { + return Err(OpteError::NoPort(req.port_name.clone())); + }; + let group_ip: IpAddr = match req.group { + oxide_vpc::api::IpAddr::Ip4(v4) => { + std::net::Ipv4Addr::from(v4).into() + } + oxide_vpc::api::IpAddr::Ip6(v6) => { + std::net::Ipv6Addr::from(v6).into() + } + }; + port_data.mcast_subscriptions.insert(group_ip, req.filter.clone()); + Ok(NO_RESPONSE) + } + + /// Unsubscribe a port from a multicast group. 
+ pub fn mcast_unsubscribe( + &self, + req: &McastUnsubscribeReq, + ) -> Result { + let mut inner = opte_state().lock().unwrap(); + let Some(port_data) = inner.ports.get_mut(&req.port_name) else { + return Err(OpteError::NoPort(req.port_name.clone())); + }; + let group_ip: IpAddr = match req.group { + oxide_vpc::api::IpAddr::Ip4(v4) => { + std::net::Ipv4Addr::from(v4).into() + } + oxide_vpc::api::IpAddr::Ip6(v6) => { + std::net::Ipv6Addr::from(v6).into() + } + }; + port_data.mcast_subscriptions.remove(&group_ip); + Ok(NO_RESPONSE) } /// Delete a router entry from a port. @@ -323,6 +410,45 @@ impl Handle { unimplemented!("Not yet used in tests") } + /// Set a multicast-to-physical mapping. + pub fn set_m2p(&self, _: &SetMcast2PhysReq) -> Result { + Ok(NO_RESPONSE) + } + + /// Clear a multicast-to-physical mapping. + pub fn clear_m2p( + &self, + _: &ClearMcast2PhysReq, + ) -> Result { + Ok(NO_RESPONSE) + } + + /// Set multicast forwarding for a port. + pub fn set_mcast_fwd( + &self, + _: &SetMcastForwardingReq, + ) -> Result { + Ok(NO_RESPONSE) + } + + /// Clear multicast forwarding for a port. + pub fn clear_mcast_fwd( + &self, + _: &ClearMcastForwardingReq, + ) -> Result { + Ok(NO_RESPONSE) + } + + /// Dump all multicast-to-physical mappings. + pub fn dump_m2p(&self) -> Result { + Ok(DumpMcast2PhysResp { ip4: Vec::new(), ip6: Vec::new() }) + } + + /// Dump all multicast forwarding entries. + pub fn dump_mcast_fwd(&self) -> Result { + Ok(DumpMcastForwardingResp { entries: Vec::new() }) + } + /// List ports on the current system. 
#[allow(dead_code)] pub(crate) fn list_ports(&self) -> Result { diff --git a/illumos-utils/src/opte/port_manager.rs b/illumos-utils/src/opte/port_manager.rs index 52c26f0a21b..8b3a2337003 100644 --- a/illumos-utils/src/opte/port_manager.rs +++ b/illumos-utils/src/opte/port_manager.rs @@ -20,11 +20,20 @@ use macaddr::MacAddr6; use omicron_common::address::IPV4_MULTICAST_RANGE; use omicron_common::address::IPV6_MULTICAST_RANGE; use omicron_common::api::external; +use omicron_common::api::internal::shared::ClearMcast2Phys; +use omicron_common::api::internal::shared::ClearMcastForwarding; use omicron_common::api::internal::shared::ExternalIpConfig; use omicron_common::api::internal::shared::ExternalIpGatewayMap; use omicron_common::api::internal::shared::ExternalIpv4Config; use omicron_common::api::internal::shared::ExternalIpv6Config; use omicron_common::api::internal::shared::InternetGatewayRouterTarget; +use omicron_common::api::internal::shared::Mcast2PhysMapping; +use omicron_common::api::internal::shared::McastFilterMode; +use omicron_common::api::internal::shared::McastForwardingEntry; +use omicron_common::api::internal::shared::McastForwardingNextHop; +use omicron_common::api::internal::shared::McastReplication; +use omicron_common::api::internal::shared::McastSourceFilter; +use omicron_common::api::internal::shared::MulticastGroupCfg; use omicron_common::api::internal::shared::NetworkInterface; use omicron_common::api::internal::shared::NetworkInterfaceKind; use omicron_common::api::internal::shared::PrivateIpConfig; @@ -41,10 +50,14 @@ use omicron_common::api::internal::shared::RouterVersion; use omicron_common::api::internal::shared::VirtualNetworkInterfaceHost; use oxide_vpc::api::AddRouterEntryReq; use oxide_vpc::api::AttachedSubnetConfig; +use oxide_vpc::api::ClearMcast2PhysReq; +use oxide_vpc::api::ClearMcastForwardingReq; use oxide_vpc::api::DelRouterEntryReq; use oxide_vpc::api::DetachSubnetResp; use oxide_vpc::api::DhcpCfg; +use 
oxide_vpc::api::Direction; use oxide_vpc::api::ExternalIpCfg; +use oxide_vpc::api::FilterMode; use oxide_vpc::api::IpCfg; use oxide_vpc::api::IpCidr; use oxide_vpc::api::Ipv4Cfg; @@ -52,10 +65,16 @@ use oxide_vpc::api::Ipv4Cidr; use oxide_vpc::api::Ipv6Cfg; use oxide_vpc::api::Ipv6Cidr; use oxide_vpc::api::MacAddr; +use oxide_vpc::api::McastSubscribeReq; +use oxide_vpc::api::McastUnsubscribeReq; +use oxide_vpc::api::MulticastUnderlay; use oxide_vpc::api::RouterClass; use oxide_vpc::api::SNat4Cfg; use oxide_vpc::api::SNat6Cfg; use oxide_vpc::api::SetExternalIpsReq; +use oxide_vpc::api::SetMcast2PhysReq; +use oxide_vpc::api::SetMcastForwardingReq; +use oxide_vpc::api::SourceFilter; use oxide_vpc::api::TransitIpConfig; use oxide_vpc::api::VpcCfg; use oxnet::IpNet; @@ -89,22 +108,24 @@ struct RouteSet { active_ports: usize, } -/// Configuration for multicast groups on an OPTE port. +/// Lock ordering for `PortManagerInner` fields: /// -/// TODO: This type should be moved to [oxide_vpc::api] when OPTE dependencies -/// are updated, following the same pattern as other VPC configuration types -/// like [ExternalIpCfg], [IpCfg], etc. +/// Several operations hold multiple locks simultaneously to prevent +/// races. The two nesting pairs are: /// -/// TODO: Eventually remove. -#[derive(Debug, Clone, PartialEq)] -pub struct MulticastGroupCfg { - /// The multicast group IP address (IPv4 or IPv6). - pub group_ip: IpAddr, - /// Source addresses for source-filtered multicast (optional for ASM, - /// required for SSM). - pub sources: Vec, -} - +/// - `vpc_routes_ensure`: `routes` → `ports` (both held to prevent +/// concurrent Nexus instances from interleaving deltas). +/// - `multicast_groups_ensure`: `ports` → `mcast_subscriptions`. +/// - `create_port`, `external_ips_ensure`: `ports` → `eip_gateways`. +/// - `firewall_rules_ensure`: `ports` only. 
+/// +/// No function nests `mcast_subscriptions` → `routes` or vice versa, +/// so there is no cycle despite `routes` → `ports` and +/// `ports` → `mcast_subscriptions` having opposite `ports` positions. +/// +/// `release_inner` acquires each lock sequentially (dropping the +/// previous before acquiring the next), so it does not conflict with +/// the nested orderings above. #[derive(Debug)] struct PortManagerInner { log: Logger, @@ -127,6 +148,63 @@ struct PortManagerInner { /// /// IGW IDs are specific to the VPC of each NIC. eip_gateways: Mutex>>>, + + /// Per-port multicast subscription state tracked for diffing. + /// Maps (NIC ID, NIC kind) to the set of groups currently + /// subscribed in OPTE, with their source filters. + // TODO: use `VnicUuid` instead of bare `Uuid` here and + // throughout port_manager. + mcast_subscriptions: + Mutex>, +} + +/// Per-port multicast state tracked by the port manager. +#[derive(Debug, Default)] +struct McastPortState { + /// Active subscriptions, mapping group IP → source filter. + subscriptions: HashMap, + /// Whether multicast CIDR allow rules have been installed. + has_cidr_allows: bool, +} + +/// The set of (CIDR, direction) pairs for multicast allow rules. +/// Covers [`IPV4_MULTICAST_RANGE`] (224.0.0.0/4) and +/// [`IPV6_MULTICAST_RANGE`] (ff00::/8) in both directions. +fn mcast_cidr_allow_set() -> [(IpCidr, Direction); 4] { + let v4 = IPV4_MULTICAST_RANGE; + let v6 = IPV6_MULTICAST_RANGE; + let ipv4 = IpCidr::Ip4(Ipv4Cidr::new( + oxide_vpc::api::Ipv4Addr::from(v4.addr()), + oxide_vpc::api::Ipv4PrefixLen::new(v4.width()).unwrap(), + )); + let ipv6 = IpCidr::Ip6(Ipv6Cidr::new( + oxide_vpc::api::Ipv6Addr::from(v6.addr()), + oxide_vpc::api::Ipv6PrefixLen::new(v6.width()).unwrap(), + )); + [ + (ipv4, Direction::In), + (ipv4, Direction::Out), + (ipv6, Direction::In), + (ipv6, Direction::Out), + ] +} + +/// Convert a `MulticastGroupCfg` into OPTE's `SourceFilter`. 
+/// +/// Empty sources maps to ASM (EXCLUDE with no entries, accepting all +/// sources). Non-empty sources maps to SSM (INCLUDE with the listed +/// sources). +fn multicast_cfg_to_source_filter(cfg: &MulticastGroupCfg) -> SourceFilter { + if cfg.sources.is_empty() { + SourceFilter::default() + } else { + SourceFilter::Include( + cfg.sources + .iter() + .map(|s| oxide_vpc::api::IpAddr::from(*s)) + .collect(), + ) + } } impl PortManagerInner { @@ -139,14 +217,15 @@ impl PortManagerInner { } } -#[derive(Debug)] /// Parameters needed to create and configure an OPTE port. +#[derive(Debug)] pub struct PortCreateParams<'a> { pub nic: &'a NetworkInterface, pub external_ips: &'a Option, pub firewall_rules: &'a [ResolvedVpcFirewallRule], pub dhcp_config: DhcpCfg, pub attached_subnets: Vec, + pub multicast_groups: &'a [MulticastGroupCfg], } impl<'a> TryFrom<&PortCreateParams<'a>> for IpCfg { @@ -411,6 +490,7 @@ impl PortManager { ports: Mutex::new(BTreeMap::new()), routes: Mutex::new(Default::default()), eip_gateways: Mutex::new(Default::default()), + mcast_subscriptions: Mutex::new(Default::default()), }); Self { inner } @@ -432,6 +512,7 @@ impl PortManager { firewall_rules, dhcp_config, attached_subnets: _, + multicast_groups, } = params; let is_service = matches!(nic.kind, NetworkInterfaceKind::Service { .. }); @@ -606,6 +687,12 @@ impl PortManager { } drop(route_map); + // Configure multicast group subscriptions if any were + // provided at instance start. + if !multicast_groups.is_empty() { + self.multicast_groups_ensure(nic.id, nic.kind, multicast_groups)?; + } + info!( self.inner.log, "Created OPTE port"; @@ -823,70 +910,167 @@ impl PortManager { Ok(()) } - /// Validate multicast group memberships for an OPTE port. - /// - /// This method validates multicast group configurations but does not yet - /// configure OPTE port-level multicast group membership. 
The actual - /// multicast forwarding is currently handled by the reconciler + DPD - /// at the dataplane switch level. - /// - /// TODO: Once OPTE kernel module supports multicast group APIs, this - /// method should be updated to configure OPTE port-level multicast - /// group membership. Note: multicast groups are fleet-scoped and can span - /// across VPCs. + /// Ensure multicast group subscriptions for an OPTE port match the + /// requested set. This diffs current vs new state and issues + /// subscribe/unsubscribe ioctls as needed. Also manages + /// multicast CIDR allow rules on the port. pub fn multicast_groups_ensure( &self, nic_id: Uuid, nic_kind: NetworkInterfaceKind, multicast_groups: &[MulticastGroupCfg], ) -> Result<(), Error> { + // Validate and build the new subscription set before acquiring locks. + let mut new_subs: HashMap = HashMap::new(); + for group in multicast_groups { + if !group.group_ip.is_multicast() { + return Err(Error::InvalidPortIpConfig(format!( + "not a multicast address: {}", + group.group_ip, + ))); + } + new_subs + .insert(group.group_ip, multicast_cfg_to_source_filter(group)); + } + + let hdl = Handle::new()?; + + // Hold both locks to prevent a concurrent port release from + // racing with subscription updates, consistent with + // firewall_rules_ensure and vpc_routes_ensure. let ports = self.inner.ports.lock().unwrap(); let port = ports.get(&(nic_id, nic_kind)).ok_or_else(|| { Error::MulticastUpdateMissingPort(nic_id, nic_kind) })?; + let port_name = port.name().to_string(); + + let mut mcast_subs = self.inner.mcast_subscriptions.lock().unwrap(); + let state = mcast_subs.entry((nic_id, nic_kind)).or_default(); + + // Unsubscribe groups that are no longer requested. 
+ let to_remove: Vec = state + .subscriptions + .keys() + .filter(|g| !new_subs.contains_key(g)) + .copied() + .collect(); + + let removed = to_remove.len(); + for group_ip in &to_remove { + debug!( + self.inner.log, + "unsubscribing from multicast group"; + "port" => &port_name, + "group" => %group_ip, + ); - debug!( - self.inner.log, - "Validating multicast group configuration for OPTE port"; - "port_name" => port.name(), - "nic_id" => ?nic_id, - "groups" => ?multicast_groups, - ); + hdl.mcast_unsubscribe(&McastUnsubscribeReq { + port_name: port_name.clone(), + group: (*group_ip).into(), + })?; - // Validate multicast group configurations - for group in multicast_groups { - if !group.group_ip.is_multicast() { - error!( + state.subscriptions.remove(group_ip); + } + + // Subscribe to new groups or update changed filters. + let mut added = 0usize; + for (group_ip, filter) in &new_subs { + let needs_subscribe = match state.subscriptions.get(group_ip) { + None => true, + Some(current) => current != filter, + }; + + if needs_subscribe { + added += 1; + debug!( self.inner.log, - "Invalid multicast IP address"; - "group_ip" => %group.group_ip, - "port_name" => port.name(), + "subscribing to multicast group"; + "port" => &port_name, + "group" => %group_ip, + "filter" => ?filter, ); - return Err(Error::InvalidPortIpConfig(String::from( - "invalid multicast IP address", - ))); + + hdl.mcast_subscribe(&McastSubscribeReq { + port_name: port_name.clone(), + group: (*group_ip).into(), + filter: filter.clone(), + })?; + + state.subscriptions.insert(*group_ip, filter.clone()); } } - // TODO: Configure firewall rules to allow multicast traffic. - // Add exceptions in source/dest MAC/L3 addr checking for multicast - // addresses matching known groups, only doing cidr-checking on the - // multicasst destination side. + // Install multicast CIDR allow rules when we go from none + // to some subscriptions and remove when we go back to none. 
+ let cidr_before = state.has_cidr_allows; + let has_subs = !state.subscriptions.is_empty(); + if has_subs && !state.has_cidr_allows { + self.install_mcast_cidr_allows(&hdl, &port_name)?; + state.has_cidr_allows = true; + } else if !has_subs && state.has_cidr_allows { + self.remove_mcast_cidr_allows(&hdl, &port_name)?; + state.has_cidr_allows = false; + } + let cidr_changed = cidr_before != state.has_cidr_allows; - info!( - self.inner.log, - "OPTE port configured for multicast traffic"; - "port_name" => port.name(), - "ipv4_range" => %IPV4_MULTICAST_RANGE, - "ipv6_range" => %IPV6_MULTICAST_RANGE, - "multicast_groups" => multicast_groups.len(), - ); + if added > 0 || removed > 0 || cidr_changed { + info!( + self.inner.log, + "multicast subscriptions updated"; + "port" => &port_name, + "added" => added, + "removed" => removed, + "active_groups" => state.subscriptions.len(), + "cidr_allows" => state.has_cidr_allows, + "cidr_changed" => cidr_changed, + ); + } else { + debug!( + self.inner.log, + "multicast subscriptions reconciled, no change"; + "port" => &port_name, + "active_groups" => state.subscriptions.len(), + ); + } - // TODO: Configure OPTE port for specific multicast group membership - // once OPTE kernel module APIs are available. This is distinct from - // zone vNIC underlay configuration (see instance.rs - // `join_multicast_group_inner`). + Ok(()) + } + /// Install CIDR allow rules for multicast traffic on a port. + fn install_mcast_cidr_allows( + &self, + hdl: &Handle, + port_name: &str, + ) -> Result<(), Error> { + for (cidr, dir) in mcast_cidr_allow_set() { + debug!( + self.inner.log, + "installing multicast CIDR allow rule"; + "port" => port_name, + "cidr" => %cidr, + "direction" => ?dir, + ); + hdl.allow_cidr(port_name, cidr, dir)?; + } + Ok(()) + } + + /// Remove CIDR allow rules for multicast traffic from a port. 
+ fn remove_mcast_cidr_allows( + &self, + hdl: &Handle, + port_name: &str, + ) -> Result<(), Error> { + for (cidr, dir) in mcast_cidr_allow_set() { + debug!( + self.inner.log, + "removing multicast CIDR allow rule"; + "port" => port_name, + "cidr" => %cidr, + "direction" => ?dir, + ); + hdl.remove_cidr(port_name, cidr, dir)?; + } Ok(()) } @@ -1012,6 +1196,200 @@ impl PortManager { Ok(()) } + /// Install a multicast overlay-to-underlay (M2P) mapping in OPTE. + pub fn set_mcast_m2p(&self, req: &Mcast2PhysMapping) -> Result<(), Error> { + info!( + self.inner.log, + "Setting multicast overlay-to-underlay mapping"; + "group" => %req.group, + "underlay" => %req.underlay, + ); + let underlay = MulticastUnderlay::new(req.underlay.into()) + .map_err(Error::SetMcastM2p)?; + let hdl = Handle::new()?; + hdl.set_m2p(&SetMcast2PhysReq { group: req.group.into(), underlay })?; + Ok(()) + } + + /// Remove a multicast overlay-to-underlay (M2P) mapping from OPTE. + pub fn clear_mcast_m2p(&self, req: &ClearMcast2Phys) -> Result<(), Error> { + info!( + self.inner.log, + "Clearing multicast overlay-to-underlay mapping"; + "group" => %req.group, + "underlay" => %req.underlay, + ); + let underlay = MulticastUnderlay::new(req.underlay.into()) + .map_err(Error::ClearMcastM2p)?; + let hdl = Handle::new()?; + hdl.clear_m2p(&ClearMcast2PhysReq { + group: req.group.into(), + underlay, + })?; + Ok(()) + } + + /// Set multicast forwarding next hops for an underlay group address. + pub fn set_mcast_fwd( + &self, + req: &McastForwardingEntry, + ) -> Result<(), Error> { + // Safe to unwrap: 77 is well within the 24-bit VNI range. 
+ let mcast_vni = + Vni::new(oxide_vpc::api::DEFAULT_MULTICAST_VNI).unwrap(); + + info!( + self.inner.log, + "Setting multicast forwarding"; + "underlay" => %req.underlay, + "next_hops" => req.next_hops.len(), + ); + let underlay = MulticastUnderlay::new(req.underlay.into()) + .map_err(Error::SetMcastFwd)?; + let next_hops = req + .next_hops + .iter() + .map(|nexthop| oxide_vpc::api::McastForwardingNextHop { + next_hop: oxide_vpc::api::NextHopV6 { + addr: nexthop.next_hop.into(), + vni: mcast_vni, + }, + replication: match nexthop.replication { + McastReplication::External => { + oxide_vpc::api::Replication::External + } + McastReplication::Underlay => { + oxide_vpc::api::Replication::Underlay + } + McastReplication::Both => oxide_vpc::api::Replication::Both, + }, + source_filter: match nexthop.filter.mode { + McastFilterMode::Include => SourceFilter::Include( + nexthop + .filter + .sources + .iter() + .copied() + .map(Into::into) + .collect(), + ), + McastFilterMode::Exclude => SourceFilter::Exclude( + nexthop + .filter + .sources + .iter() + .copied() + .map(Into::into) + .collect(), + ), + }, + }) + .collect(); + let hdl = Handle::new()?; + hdl.set_mcast_fwd(&SetMcastForwardingReq { underlay, next_hops })?; + Ok(()) + } + + /// Remove all multicast forwarding entries for an underlay group address. + pub fn clear_mcast_fwd( + &self, + req: &ClearMcastForwarding, + ) -> Result<(), Error> { + info!( + self.inner.log, + "Clearing multicast forwarding"; + "underlay" => %req.underlay, + ); + let underlay = MulticastUnderlay::new(req.underlay.into()) + .map_err(Error::ClearMcastFwd)?; + let hdl = Handle::new()?; + hdl.clear_mcast_fwd(&ClearMcastForwardingReq { underlay })?; + Ok(()) + } + + /// Dump all multicast overlay-to-underlay (M2P) mappings from OPTE. 
+ pub fn list_mcast_m2p(&self) -> Result, Error> { + let hdl = Handle::new()?; + let resp = hdl.dump_m2p()?; + let mappings = resp + .ip4 + .into_iter() + .map(|(group, underlay)| Mcast2PhysMapping { + group: IpAddr::V4(group.into()), + underlay: Ipv6Addr::from(underlay.addr()), + }) + .chain(resp.ip6.into_iter().map(|(group, underlay)| { + Mcast2PhysMapping { + group: IpAddr::V6(group.into()), + underlay: Ipv6Addr::from(underlay.addr()), + } + })) + .collect(); + Ok(mappings) + } + + /// Dump all multicast forwarding entries from OPTE. + pub fn list_mcast_fwd(&self) -> Result, Error> { + let hdl = Handle::new()?; + let resp = hdl.dump_mcast_fwd()?; + resp.entries + .into_iter() + .map(|entry| { + let next_hops = entry + .next_hops + .into_iter() + .map(|nexthop| { + let replication = match nexthop.replication { + oxide_vpc::api::Replication::External => { + McastReplication::External + } + oxide_vpc::api::Replication::Underlay => { + McastReplication::Underlay + } + oxide_vpc::api::Replication::Both => { + McastReplication::Both + } + oxide_vpc::api::Replication::Reserved => { + return Err(Error::ListMcastFwd( + "OPTE returned Reserved replication \ + mode, which has no defined semantics" + .to_string(), + )); + } + }; + + Ok(McastForwardingNextHop { + next_hop: nexthop.next_hop.addr.into(), + replication, + filter: McastSourceFilter { + mode: match nexthop.source_filter.mode() { + FilterMode::Include => { + McastFilterMode::Include + } + FilterMode::Exclude => { + McastFilterMode::Exclude + } + }, + sources: nexthop + .source_filter + .sources() + .iter() + .copied() + .map(Into::into) + .collect(), + }, + }) + }) + .collect::, _>>()?; + + Ok(McastForwardingEntry { + underlay: Ipv6Addr::from(entry.underlay.addr()), + next_hops, + }) + }) + .collect() + } + pub fn attached_subnets_ensure( &self, nic_id: Uuid, @@ -1245,6 +1623,15 @@ impl PortTicket { remove_key(&mut routes, key); } drop(routes); + + // Cleanup multicast subscription tracking for this port. 
+ // Kernel state is cleaned up when the xde device is deleted. + self.manager + .mcast_subscriptions + .lock() + .unwrap() + .remove(&(self.id, self.kind)); + debug!( self.manager.log, "Removed OPTE port from manager"; @@ -1280,12 +1667,14 @@ impl Drop for PortTicket { mod tests { use super::PortCreateParams; use super::PortManager; + use crate::opte::Error; use crate::opte::Handle; use macaddr::MacAddr6; use omicron_common::api::external::{MacAddr, Vni}; use omicron_common::api::internal::shared::ExternalIpConfig; use omicron_common::api::internal::shared::ExternalIpConfigBuilder; use omicron_common::api::internal::shared::InternetGatewayRouterTarget; + use omicron_common::api::internal::shared::MulticastGroupCfg; use omicron_common::api::internal::shared::NetworkInterface; use omicron_common::api::internal::shared::NetworkInterfaceKind; use omicron_common::api::internal::shared::PrivateIpConfig; @@ -1299,17 +1688,23 @@ mod tests { use omicron_common::api::internal::shared::SourceNatConfigV6; use omicron_test_utils::dev::test_setup_log; use oxide_vpc::api::DhcpCfg; + use oxide_vpc::api::FilterMode; use oxide_vpc::api::IpCfg; use oxide_vpc::api::Ipv4Cidr; use oxide_vpc::api::Ipv6Cidr; + use oxide_vpc::api::SourceFilter; use oxnet::IpNet; use oxnet::Ipv4Net; use oxnet::Ipv6Net; use std::collections::HashSet; + use std::net::IpAddr; use std::net::Ipv4Addr; use std::net::Ipv6Addr; use uuid::Uuid; + // Maximum ephemeral port number for source NAT (14-bit range). + const MAX_PORT: u16 = (1 << 14) - 1; + // Regression for https://github.com/oxidecomputer/omicron/issues/7541. #[test] fn multiple_ports_does_not_destroy_default_route() { @@ -1331,7 +1726,7 @@ mod tests { const SERVICES_VPC_VNI: Vni = Vni::SERVICES_VNI; let handle = Handle::new().unwrap(); - handle.set_xde_underlay("foo0", "foo1").unwrap(); + handle.set_xde_underlay("underlay0", "underlay1").unwrap(); // First, create a port for a service. 
// @@ -1360,7 +1755,7 @@ mod tests { .unwrap() .into(), ); - const MAX_PORT: u16 = (1 << 14) - 1; + let (port0, _ticket0) = manager .create_port(PortCreateParams { nic: &NetworkInterface { @@ -1385,6 +1780,7 @@ mod tests { dns6_servers: Vec::new(), }, attached_subnets: vec![], + multicast_groups: &[], }) .unwrap(); @@ -1564,6 +1960,7 @@ mod tests { dns6_servers: Vec::new(), }, attached_subnets: vec![], + multicast_groups: &[], }) .unwrap(); @@ -1735,6 +2132,7 @@ mod tests { dns6_servers: vec![], }, attached_subnets: vec![], + multicast_groups: &[], }; let IpCfg::Ipv4(oxide_vpc::api::Ipv4Cfg { vpc_subnet, @@ -1808,6 +2206,7 @@ mod tests { dns6_servers: vec![], }, attached_subnets: vec![], + multicast_groups: &[], }; let IpCfg::Ipv6(oxide_vpc::api::Ipv6Cfg { vpc_subnet, @@ -1815,12 +2214,14 @@ mod tests { gateway_ip, external_ips: oxide_vpc::api::ExternalIpCfg { snat, ephemeral_ip, floating_ips }, - attached_subnets: _, - transit_ips: _, + attached_subnets, + transit_ips, }) = IpCfg::try_from(&prs).unwrap() else { - panic!("Expected IPv4 config") + panic!("Expected IPv6 config") }; + assert!(attached_subnets.is_empty()); + assert!(transit_ips.is_empty()); assert_eq!(private_ip, priv_ip.into()); assert_eq!( @@ -1894,6 +2295,7 @@ mod tests { dns6_servers: vec![], }, attached_subnets: vec![], + multicast_groups: &[], }; let IpCfg::DualStack { ipv4, ipv6 } = IpCfg::try_from(&prs).unwrap() else { @@ -1984,6 +2386,7 @@ mod tests { dns6_servers: vec![], }, attached_subnets: vec![], + multicast_groups: &[], }; let _ = IpCfg::try_from(&prs).expect_err( "Should fail to convert with public IPv6 and private IPv4", @@ -2030,9 +2433,277 @@ mod tests { dns6_servers: vec![], }, attached_subnets: vec![], + multicast_groups: &[], }; let _ = IpCfg::try_from(&prs).expect_err( "Should fail to convert with public IPv4 and private IPv6", ); } + + #[test] + fn multicast_groups_ensure_diffing() { + let logctx = test_setup_log("multicast_groups_ensure_diffing"); + let manager = 
PortManager::new(logctx.log.clone(), Ipv6Addr::LOCALHOST); + + let handle = Handle::new().unwrap(); + handle.set_xde_underlay("underlay0", "underlay1").unwrap(); + + let nic_id = Uuid::new_v4(); + let nic_kind = NetworkInterfaceKind::Service { id: Uuid::new_v4() }; + + let private_subnet = + Ipv4Net::new(Ipv4Addr::new(172, 20, 0, 0), 24).unwrap(); + let private_ip = Ipv4Addr::new(172, 20, 0, 4); + let ip_config = + PrivateIpConfig::new_ipv4(private_ip, private_subnet).unwrap(); + let public_ip = Ipv4Addr::new(10, 0, 0, 4); + + let external_ips = Some( + ExternalIpConfigBuilder::new() + .with_source_nat( + SourceNatConfigV4::new(public_ip, 0, MAX_PORT).unwrap(), + ) + .build() + .unwrap() + .into(), + ); + + // Bindings keep the port registered in the manager for this scope. + let (_port, _ticket) = manager + .create_port(PortCreateParams { + nic: &NetworkInterface { + id: nic_id, + kind: nic_kind, + name: "opte0".parse().unwrap(), + ip_config, + mac: MacAddr(MacAddr6::new( + 0xa8, 0x40, 0x25, 0x00, 0x00, 0x01, + )), + vni: Vni::SERVICES_VNI, + primary: true, + slot: 0, + }, + external_ips: &external_ips, + firewall_rules: &[], + dhcp_config: DhcpCfg { + hostname: None, + host_domain: None, + domain_search_list: Vec::new(), + dns4_servers: Vec::new(), + dns6_servers: Vec::new(), + }, + attached_subnets: vec![], + multicast_groups: &[], + }) + .unwrap(); + + let group1: IpAddr = "239.1.1.1".parse().unwrap(); + let group2: IpAddr = "239.1.1.2".parse().unwrap(); + let source_a: IpAddr = "10.0.0.1".parse().unwrap(); + + // Subscribe to two groups: one ASM, one SSM. + manager + .multicast_groups_ensure( + nic_id, + nic_kind, + &[ + MulticastGroupCfg { group_ip: group1, sources: vec![] }, + MulticastGroupCfg { + group_ip: group2, + sources: vec![source_a], + }, + ], + ) + .unwrap(); + + // Verify port manager tracking. 
+ { + let subs = manager.inner.mcast_subscriptions.lock().unwrap(); + let state = subs.get(&(nic_id, nic_kind)).unwrap(); + assert_eq!(state.subscriptions.len(), 2); + assert_eq!( + *state.subscriptions.get(&group1).unwrap(), + SourceFilter::default(), + ); + assert_eq!( + state.subscriptions.get(&group2).unwrap().mode(), + FilterMode::Include, + ); + assert!(state.has_cidr_allows); + } + + // Verify mock OPTE state matches. + { + let opte = handle.state().lock().unwrap(); + let port = opte.ports.get("opte0").unwrap(); + assert_eq!(port.mcast_subscriptions.len(), 2); + assert!(port.mcast_subscriptions.contains_key(&group1)); + assert!(port.mcast_subscriptions.contains_key(&group2)); + } + + // Remove group2, keep group1. + manager + .multicast_groups_ensure( + nic_id, + nic_kind, + &[MulticastGroupCfg { group_ip: group1, sources: vec![] }], + ) + .unwrap(); + + { + let subs = manager.inner.mcast_subscriptions.lock().unwrap(); + let state = subs.get(&(nic_id, nic_kind)).unwrap(); + assert_eq!(state.subscriptions.len(), 1); + assert!(state.subscriptions.contains_key(&group1)); + assert!(!state.subscriptions.contains_key(&group2)); + assert!(state.has_cidr_allows); + } + + { + let opte = handle.state().lock().unwrap(); + let port = opte.ports.get("opte0").unwrap(); + assert_eq!(port.mcast_subscriptions.len(), 1); + assert!(!port.mcast_subscriptions.contains_key(&group2)); + } + + // Remove all groups. 
+ manager.multicast_groups_ensure(nic_id, nic_kind, &[]).unwrap(); + + { + let subs = manager.inner.mcast_subscriptions.lock().unwrap(); + let state = subs.get(&(nic_id, nic_kind)).unwrap(); + assert!(state.subscriptions.is_empty()); + assert!(!state.has_cidr_allows); + } + + { + let opte = handle.state().lock().unwrap(); + let port = opte.ports.get("opte0").unwrap(); + assert!(port.mcast_subscriptions.is_empty()); + } + + logctx.cleanup_successful(); + } + + #[test] + fn multicast_port_deletion_cleanup() { + let logctx = test_setup_log("multicast_port_deletion_cleanup"); + let manager = PortManager::new(logctx.log.clone(), Ipv6Addr::LOCALHOST); + + let handle = Handle::new().unwrap(); + handle.set_xde_underlay("underlay0", "underlay1").unwrap(); + + let nic_id = Uuid::new_v4(); + let nic_kind = NetworkInterfaceKind::Service { id: Uuid::new_v4() }; + + let private_subnet = + Ipv4Net::new(Ipv4Addr::new(172, 20, 0, 0), 24).unwrap(); + let private_ip = Ipv4Addr::new(172, 20, 0, 4); + let ip_config = + PrivateIpConfig::new_ipv4(private_ip, private_subnet).unwrap(); + let public_ip = Ipv4Addr::new(10, 0, 0, 4); + + let external_ips = Some( + ExternalIpConfigBuilder::new() + .with_source_nat( + SourceNatConfigV4::new(public_ip, 0, MAX_PORT).unwrap(), + ) + .build() + .unwrap() + .into(), + ); + + let (_port, ticket) = manager + .create_port(PortCreateParams { + nic: &NetworkInterface { + id: nic_id, + kind: nic_kind, + name: "opte0".parse().unwrap(), + ip_config, + mac: MacAddr(MacAddr6::new( + 0xa8, 0x40, 0x25, 0x00, 0x00, 0x01, + )), + vni: Vni::SERVICES_VNI, + primary: true, + slot: 0, + }, + external_ips: &external_ips, + firewall_rules: &[], + dhcp_config: DhcpCfg { + hostname: None, + host_domain: None, + domain_search_list: Vec::new(), + dns4_servers: Vec::new(), + dns6_servers: Vec::new(), + }, + attached_subnets: vec![], + multicast_groups: &[], + }) + .unwrap(); + + let group1: IpAddr = "239.2.2.1".parse().unwrap(); + + // Subscribe to a multicast group. 
+ manager + .multicast_groups_ensure( + nic_id, + nic_kind, + &[MulticastGroupCfg { group_ip: group1, sources: vec![] }], + ) + .unwrap(); + + // Verify subscription tracking exists. + { + let subs = manager.inner.mcast_subscriptions.lock().unwrap(); + assert!( + subs.contains_key(&(nic_id, nic_kind)), + "subscription tracking should exist before release" + ); + let state = subs.get(&(nic_id, nic_kind)).unwrap(); + assert_eq!(state.subscriptions.len(), 1); + } + + // Release the port ticket, which should clean up subscription + // tracking. + ticket.release(); + + // Verify subscription tracking is removed. + { + let subs = manager.inner.mcast_subscriptions.lock().unwrap(); + assert!( + !subs.contains_key(&(nic_id, nic_kind)), + "subscription tracking should be removed after release" + ); + } + + logctx.cleanup_successful(); + } + + #[test] + fn multicast_ensure_missing_port_error() { + let logctx = test_setup_log("multicast_ensure_missing_port_error"); + let manager = PortManager::new(logctx.log.clone(), Ipv6Addr::LOCALHOST); + + let nic_id = Uuid::new_v4(); + let nic_kind = NetworkInterfaceKind::Instance { id: Uuid::new_v4() }; + let group: IpAddr = "239.3.3.1".parse().unwrap(); + + let res = manager.multicast_groups_ensure( + nic_id, + nic_kind, + &[MulticastGroupCfg { group_ip: group, sources: vec![] }], + ); + + match res { + Err(Error::MulticastUpdateMissingPort(id, kind)) => { + assert_eq!(id, nic_id); + assert_eq!(kind, nic_kind); + } + other => { + panic!("expected MulticastUpdateMissingPort, got {other:?}") + } + } + + logctx.cleanup_successful(); + } } diff --git a/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs b/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs index 8f448eb690a..65b49c43926 100644 --- a/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs +++ b/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs @@ -204,7 +204,8 @@ mod api_impl { use omicron_common::api::internal::shared::SledIdentifiers; use 
omicron_common::api::internal::shared::VirtualNetworkInterfaceHost; use omicron_common::api::internal::shared::{ - ResolvedVpcRouteSet, ResolvedVpcRouteState, + ClearMcast2Phys, ClearMcastForwarding, Mcast2PhysMapping, + McastForwardingEntry, ResolvedVpcRouteSet, ResolvedVpcRouteState, }; use sled_agent_types::artifact::ArtifactConfig; use sled_agent_types::artifact::ArtifactCopyFromDepotBody; @@ -225,7 +226,7 @@ mod api_impl { use sled_agent_types::firewall_rules::VpcFirewallRulesEnsureBody; use sled_agent_types::instance::InstanceEnsureBody; use sled_agent_types::instance::InstanceExternalIpBody; - use sled_agent_types::instance::InstanceMulticastBody; + use sled_agent_types::instance::InstanceMulticastMembership; use sled_agent_types::instance::VmmIssueDiskSnapshotRequestBody; use sled_agent_types::instance::VmmIssueDiskSnapshotRequestPathParam; use sled_agent_types::instance::VmmIssueDiskSnapshotRequestResponse; @@ -626,45 +627,17 @@ mod api_impl { async fn vmm_join_multicast_group( _rqctx: RequestContext, _path_params: Path, - body: TypedBody, + _body: TypedBody, ) -> Result { - let body_args = body.into_inner(); - match body_args { - InstanceMulticastBody::Join(_) => { - // MGS test utility - just return success for test compatibility - Ok(HttpResponseUpdatedNoContent()) - } - InstanceMulticastBody::Leave(_) => { - // This endpoint is for joining - reject leave operations - Err(HttpError::for_bad_request( - None, - "Join endpoint cannot process Leave operations" - .to_string(), - )) - } - } + unimplemented!() } async fn vmm_leave_multicast_group( _rqctx: RequestContext, _path_params: Path, - body: TypedBody, + _body: TypedBody, ) -> Result { - let body_args = body.into_inner(); - match body_args { - InstanceMulticastBody::Leave(_) => { - // MGS test utility - just return success for test compatibility - Ok(HttpResponseUpdatedNoContent()) - } - InstanceMulticastBody::Join(_) => { - // This endpoint is for leaving - reject join operations - 
Err(HttpError::for_bad_request( - None, - "Leave endpoint cannot process Join operations" - .to_string(), - )) - } - } + unimplemented!() } async fn disk_put( @@ -755,6 +728,47 @@ mod api_impl { unimplemented!() } + async fn set_mcast_m2p( + _rqctx: RequestContext, + _body: TypedBody, + ) -> Result { + unimplemented!() + } + + async fn clear_mcast_m2p( + _rqctx: RequestContext, + _body: TypedBody, + ) -> Result { + unimplemented!() + } + + async fn set_mcast_fwd( + _rqctx: RequestContext, + _body: TypedBody, + ) -> Result { + unimplemented!() + } + + async fn clear_mcast_fwd( + _rqctx: RequestContext, + _body: TypedBody, + ) -> Result { + unimplemented!() + } + + async fn list_mcast_m2p( + _rqctx: RequestContext, + ) -> Result>, HttpError> { + unimplemented!() + } + + async fn list_mcast_fwd( + _rqctx: RequestContext, + ) -> Result>, HttpError> + { + unimplemented!() + } + async fn uplink_ensure( _rqctx: RequestContext, _body: TypedBody, diff --git a/nexus/src/app/background/tasks/multicast/groups.rs b/nexus/src/app/background/tasks/multicast/groups.rs index 933acc91242..4232a22216e 100644 --- a/nexus/src/app/background/tasks/multicast/groups.rs +++ b/nexus/src/app/background/tasks/multicast/groups.rs @@ -20,6 +20,9 @@ //! - **"Creating" state**: Initiate DPD "ensure" to apply configuration //! - **"Active" state**: Detect DPD drift and sync directly //! - **"Deleting" state**: Switch cleanup and database removal +//! - **M2P/forwarding propagation**: Convergent per-sled propagation of +//! M2P mappings and forwarding entries via sled-agent after member +//! state changes //! - **Extensible processing**: Support for different group types //! //! 
# Group State Transition Matrix @@ -93,6 +96,7 @@ use super::{ use crate::app::multicast::dataplane::{ GroupUpdateParams, MulticastDataplaneClient, }; +use crate::app::multicast::sled::MulticastSledClient; use crate::app::saga::create_saga_dag; use crate::app::sagas; @@ -100,7 +104,7 @@ use crate::app::sagas; /// /// This grace period avoids racing with in-progress member attachment operations /// that occur immediately after group creation. -const ORPHAN_GROUP_MIN_AGE: chrono::Duration = chrono::Duration::seconds(10); +const ORPHAN_GROUP_MIN_AGE: chrono::TimeDelta = chrono::TimeDelta::seconds(10); /// Check if DPD tag matches the database group's tag. /// @@ -130,35 +134,48 @@ fn dpd_state_matches_sources( let dpd_sources = dpd_group.sources.clone(); let group_ip = group.multicast_ip.ip(); - // Expected DPD state based on source filter logic (RFC 4607) - let expected_sources = if is_ssm_address(group_ip) { - Some(&source_filter.specific_sources) + if is_ssm_address(group_ip) { + // SSM: always expect specific sources + match dpd_sources { + None => false, + Some(dpd_srcs) => { + let mut dpd_ips: Vec<_> = dpd_srcs + .into_iter() + .filter_map(|src| match src { + dpd_client::types::IpSrc::Exact(ip) => Some(ip), + _ => None, + }) + .collect(); + dpd_ips.sort(); + + let mut expected: Vec<_> = + source_filter.specific_sources.iter().copied().collect(); + expected.sort(); + + dpd_ips == expected + } + } } else if source_filter.has_any_source_member { - None + dpd_sources.is_none() } else { - Some(&source_filter.specific_sources) - }; - - match (dpd_sources, expected_sources) { - (None, None) => true, - (Some(_), None) => false, // DPD has sources but shouldn't - (None, Some(_)) => false, // DPD missing sources - (Some(dpd_srcs), Some(expected)) => { - // Extract exact IPs from DPD sources - let mut dpd_ips: Vec<_> = dpd_srcs - .into_iter() - .filter_map(|src| match src { - dpd_client::types::IpSrc::Exact(ip) => Some(ip), - _ => None, - }) - .collect(); - dpd_ips.sort(); 
- - let mut expected_sorted: Vec<_> = - expected.iter().copied().collect(); - expected_sorted.sort(); - - dpd_ips == expected_sorted + match dpd_sources { + None => source_filter.specific_sources.is_empty(), + Some(dpd_srcs) => { + let mut dpd_ips: Vec<_> = dpd_srcs + .into_iter() + .filter_map(|src| match src { + dpd_client::types::IpSrc::Exact(ip) => Some(ip), + _ => None, + }) + .collect(); + dpd_ips.sort(); + + let mut expected: Vec<_> = + source_filter.specific_sources.iter().copied().collect(); + expected.sort(); + + dpd_ips == expected + } } } } @@ -189,6 +206,7 @@ trait GroupStateProcessor { opctx: &OpContext, group: &MulticastGroup, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result; /// Process a group in "Active" state (check DPD sync status). @@ -198,6 +216,7 @@ trait GroupStateProcessor { opctx: &OpContext, group: &MulticastGroup, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result; } @@ -222,9 +241,15 @@ impl GroupStateProcessor for ExternalGroupProcessor { opctx: &OpContext, group: &MulticastGroup, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result { reconciler - .handle_deleting_external_group(opctx, group, dataplane_client) + .handle_deleting_external_group( + opctx, + group, + dataplane_client, + sled_client, + ) .await } @@ -235,9 +260,15 @@ impl GroupStateProcessor for ExternalGroupProcessor { opctx: &OpContext, group: &MulticastGroup, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result { reconciler - .handle_active_external_group(opctx, group, dataplane_client) + .handle_active_external_group( + opctx, + group, + dataplane_client, + sled_client, + ) .await } } @@ -345,6 +376,7 @@ impl MulticastGroupReconciler { opctx: &OpContext, state: MulticastGroupState, dataplane_client: Option<&MulticastDataplaneClient>, + sled_client: Option<&MulticastSledClient>, ) -> Result { 
trace!(opctx.log, "searching for multicast groups"; "state" => %state); @@ -372,7 +404,12 @@ impl MulticastGroupReconciler { let results = stream::iter(groups) .map(|group| async move { let result = self - .process_group_state(opctx, &group, dataplane_client) + .process_group_state( + opctx, + &group, + dataplane_client, + sled_client, + ) .await; (group, result) }) @@ -413,7 +450,7 @@ impl MulticastGroupReconciler { processed += 1; } - debug!( + trace!( opctx.log, "processed multicast group"; "state" => %state, @@ -455,6 +492,7 @@ impl MulticastGroupReconciler { opctx, MulticastGroupState::Creating, None, + None, ) .await } @@ -464,11 +502,13 @@ impl MulticastGroupReconciler { &self, opctx: &OpContext, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result { self.reconcile_groups_by_state( opctx, MulticastGroupState::Deleting, Some(dataplane_client), + Some(sled_client), ) .await } @@ -478,11 +518,13 @@ impl MulticastGroupReconciler { &self, opctx: &OpContext, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result { self.reconcile_groups_by_state( opctx, MulticastGroupState::Active, Some(dataplane_client), + Some(sled_client), ) .await } @@ -494,6 +536,7 @@ impl MulticastGroupReconciler { opctx: &OpContext, group: &MulticastGroup, dataplane_client: Option<&MulticastDataplaneClient>, + sled_client: Option<&MulticastSledClient>, ) -> Result { // Future: Match on group type to select different processors if // we add more nuanced group types @@ -506,15 +549,31 @@ impl MulticastGroupReconciler { MulticastGroupState::Deleting => { let dataplane_client = dataplane_client .context("dataplane client required for deleting state")?; + let sled_client = sled_client + .context("sled client required for deleting state")?; processor - .process_deleting(self, opctx, group, dataplane_client) + .process_deleting( + self, + opctx, + group, + dataplane_client, + sled_client, + ) .await } 
MulticastGroupState::Active => { let dataplane_client = dataplane_client .context("dataplane client required for active state")?; + let sled_client = sled_client + .context("sled client required for active state")?; processor - .process_active(self, opctx, group, dataplane_client) + .process_active( + self, + opctx, + group, + dataplane_client, + sled_client, + ) .await } MulticastGroupState::Deleted => { @@ -632,6 +691,7 @@ impl MulticastGroupReconciler { opctx: &OpContext, group: &MulticastGroup, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result { debug!( opctx.log, @@ -644,8 +704,13 @@ impl MulticastGroupReconciler { "dpd_cleanup_required" => true ); - self.process_deleting_group_inner(opctx, group, dataplane_client) - .await?; + self.process_deleting_group_inner( + opctx, + group, + dataplane_client, + sled_client, + ) + .await?; Ok(StateTransition::StateChanged) } @@ -658,6 +723,7 @@ impl MulticastGroupReconciler { opctx: &OpContext, group: &MulticastGroup, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result { let underlay_group_id = group .underlay_group_id @@ -759,6 +825,22 @@ impl MulticastGroupReconciler { "group_id" => %group.id(), "multicast_ip" => %group.multicast_ip ); + + // Propagate M2P/forwarding to member sleds after DPD + // sync to ensure OPTE state is also consistent. + if let Err(e) = sled_client + .propagate_m2p_and_forwarding(opctx, group) + .await + { + warn!( + opctx.log, + "failed to propagate M2P/forwarding after \ + drift correction (will retry)"; + "group_id" => %group.id(), + "error" => %e + ); + } + Ok(StateTransition::StateChanged) } Err(e) => { @@ -773,6 +855,19 @@ impl MulticastGroupReconciler { } } } else { + // Even when DPD is in sync, propagate M2P/forwarding to + // member sleds to correct any sled-level drift. 
+ if let Err(e) = + sled_client.propagate_m2p_and_forwarding(opctx, group).await + { + warn!( + opctx.log, + "failed to propagate M2P/forwarding (will retry)"; + "group_id" => %group.id(), + "error" => %e + ); + } + Ok(StateTransition::NoChange) } } @@ -784,7 +879,7 @@ impl MulticastGroupReconciler { opctx: &OpContext, group: &MulticastGroup, ) -> Result { - debug!( + trace!( opctx.log, "processing creating multicast group"; "group" => ?group @@ -801,7 +896,7 @@ impl MulticastGroupReconciler { format!("failed to fetch linked underlay group {underlay_id}") })?; - debug!( + trace!( opctx.log, "found linked underlay group"; "group" => ?group, @@ -810,7 +905,7 @@ impl MulticastGroupReconciler { underlay } None => { - debug!( + trace!( opctx.log, "creating new underlay group"; "group" => ?group @@ -872,6 +967,7 @@ impl MulticastGroupReconciler { opctx: &OpContext, group: &MulticastGroup, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result<(), anyhow::Error> { let tag = Self::get_multicast_tag(group) .context("multicast group missing tag")?; @@ -887,6 +983,15 @@ impl MulticastGroupReconciler { "cleanup_includes" => "[external_group, underlay_group, forwarding_rules, member_ports]" ); + // Clear M2P/forwarding from all sleds before DPD cleanup. + // This must succeed before deleting DB records, otherwise + // stale OPTE state would persist on failed sleds with no + // source of truth to drive a later cleanup pass. + sled_client + .clear_m2p_and_forwarding(opctx, group) + .await + .context("failed to clear M2P/forwarding from sleds")?; + // Use dataplane client from reconciliation pass to cleanup switch(es) // state by tag dataplane_client @@ -1047,9 +1152,8 @@ mod tests { } #[test] - fn test_dpd_state_matches_sources_asm_address() { - // ASM address with all members specifying sources: expect those - // sources in DPD. 
+ fn test_dpd_state_matches_sources_asm_with_specific_sources() { + // ASM address with specific sources only (no any-source members) let source_filter = SourceFilterState { specific_sources: BTreeSet::from(["10.0.0.1" .parse::() @@ -1057,23 +1161,29 @@ mod tests { has_any_source_member: false, }; - let group = create_group("224.1.1.1"); // ASM address (not 232.x.x.x) + let group = create_group("224.1.1.1"); // ASM address - // DPD has matching sources (correct) + // DPD has matching specific sources let dpd_group = create_dpd_group(Some(vec![dpd_client::types::IpSrc::Exact( "10.0.0.1".parse().unwrap(), )])); assert!(dpd_state_matches_sources(&dpd_group, &source_filter, &group)); - // DPD has None (mismatch: ASM with all-specific should have sources) + // DPD has None (mismatch: should have specific sources) let dpd_group = create_dpd_group(None); assert!(!dpd_state_matches_sources(&dpd_group, &source_filter, &group)); + + // DPD has IpSrc::Any (mismatch: should have specific sources) + let dpd_group = + create_dpd_group(Some(vec![dpd_client::types::IpSrc::Any])); + assert!(!dpd_state_matches_sources(&dpd_group, &source_filter, &group)); } #[test] fn test_dpd_state_matches_sources_asm_with_any_source_member() { - // ASM address with has_any_source_member=true - expects None from DPD + // ASM address with has_any_source_member=true: we send None to DPD, + // and DPD canonicalizes any-source representations to None. 
let source_filter = SourceFilterState { specific_sources: BTreeSet::new(), has_any_source_member: true, @@ -1081,11 +1191,33 @@ mod tests { let group = create_group("224.1.1.1"); // ASM address - // DPD has None (correct for ASM with any-source members) + // DPD has None (correct: any-source canonicalizes to None) + let dpd_group = create_dpd_group(None); + assert!(dpd_state_matches_sources(&dpd_group, &source_filter, &group)); + + // DPD has specific sources (mismatch) + let dpd_group = + create_dpd_group(Some(vec![dpd_client::types::IpSrc::Exact( + "10.0.0.1".parse().unwrap(), + )])); + assert!(!dpd_state_matches_sources(&dpd_group, &source_filter, &group)); + } + + #[test] + fn test_dpd_state_matches_sources_asm_no_sources() { + // ASM with no source filters at all expects None + let source_filter = SourceFilterState { + specific_sources: BTreeSet::new(), + has_any_source_member: false, + }; + + let group = create_group("224.1.1.1"); // ASM address + + // DPD has None (correct: no sources configured) let dpd_group = create_dpd_group(None); assert!(dpd_state_matches_sources(&dpd_group, &source_filter, &group)); - // DPD has sources (mismatch: should be none) + // DPD has sources (mismatch) let dpd_group = create_dpd_group(Some(vec![dpd_client::types::IpSrc::Exact( "10.0.0.1".parse().unwrap(), diff --git a/nexus/src/app/background/tasks/multicast/members.rs b/nexus/src/app/background/tasks/multicast/members.rs index 1b7f81c6ab3..0b8d82d9af1 100644 --- a/nexus/src/app/background/tasks/multicast/members.rs +++ b/nexus/src/app/background/tasks/multicast/members.rs @@ -42,6 +42,11 @@ //! - **State transitions**: "Joining" → "Joined" → "Left" with reactivation //! - **Dataplane updates**: Applying and removing configuration via DPD //! client(s) on switches +//! - **M2P/forwarding propagation**: After join, leave, or migration, M2P +//! mappings and forwarding entries are propagated to all sleds via +//! 
sled-agent inline (not deferred to the next reconciliation pass) +//! - **OPTE subscriptions**: Per-VMM multicast group filters managed via +//! sled-agent on the hosting sled //! - **Sled migration**: Detecting moves and updating dataplane configuration //! (no transition to "Left") //! - **Cleanup**: Removing orphaned switch state for deleted members @@ -124,10 +129,31 @@ use omicron_uuid_kinds::{ use super::{MulticastGroupReconciler, StateTransition, SwitchBackplanePort}; use crate::app::multicast::dataplane::MulticastDataplaneClient; +use crate::app::multicast::sled::MulticastSledClient; + +/// Pre-fetched instance state for multicast reconciliation. +#[derive(Clone, Copy, Debug, Default)] +struct InstanceMulticastState { + /// Whether the instance is in a state that can receive multicast traffic. + valid: bool, + /// Current sled hosting the VMM, if any. + sled_id: Option, + /// Current propolis VMM identifier, if any. + propolis_id: Option, +} + +/// Context shared across member reconciliation operations. +struct MemberReconcileCtx<'a> { + opctx: &'a OpContext, + group: &'a MulticastGroup, + member: &'a MulticastGroupMember, + instance_states: &'a InstanceStateMap, + dataplane_client: &'a MulticastDataplaneClient, + sled_client: &'a MulticastSledClient, +} -/// Pre-fetched instance state data for batch processing. -/// Maps instance_id -> (is_valid_for_multicast, current_sled_id). -type InstanceStateMap = HashMap)>; +/// Maps instance_id to pre-fetched multicast-relevant state. +type InstanceStateMap = HashMap; /// Backplane port mapping from DPD-client. /// Maps switch port ID to backplane link configuration. 
@@ -168,33 +194,21 @@ trait MemberStateProcessor { async fn process_joining( &self, reconciler: &MulticastGroupReconciler, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_states: &InstanceStateMap, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result; /// Process a member in "Joined" state. async fn process_joined( &self, reconciler: &MulticastGroupReconciler, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_states: &InstanceStateMap, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result; /// Process a member in "Left" state. async fn process_left( &self, reconciler: &MulticastGroupReconciler, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_states: &InstanceStateMap, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result; } @@ -205,61 +219,25 @@ impl MemberStateProcessor for InstanceMemberProcessor { async fn process_joining( &self, reconciler: &MulticastGroupReconciler, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_states: &InstanceStateMap, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result { - reconciler - .handle_instance_joining( - opctx, - group, - member, - instance_states, - dataplane_client, - ) - .await + reconciler.handle_instance_joining(ctx).await } async fn process_joined( &self, reconciler: &MulticastGroupReconciler, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_states: &InstanceStateMap, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result { - reconciler - .handle_instance_joined( - opctx, - group, - member, - instance_states, - dataplane_client, - ) - .await + reconciler.handle_instance_joined(ctx).await } async fn process_left( &self, reconciler: 
&MulticastGroupReconciler, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_states: &InstanceStateMap, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result { - reconciler - .handle_instance_left( - opctx, - group, - member, - instance_states, - dataplane_client, - ) - .await + reconciler.handle_instance_left(ctx).await } } @@ -276,6 +254,7 @@ impl MulticastGroupReconciler { &self, opctx: &OpContext, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result { trace!(opctx.log, "reconciling member state changes"); @@ -286,7 +265,12 @@ impl MulticastGroupReconciler { for group in groups { match self - .process_group_member_states(opctx, &group, dataplane_client) + .process_group_member_states( + opctx, + &group, + dataplane_client, + sled_client, + ) .await { Ok(count) => { @@ -326,6 +310,7 @@ impl MulticastGroupReconciler { opctx: &OpContext, group: &MulticastGroup, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result { let mut processed = 0; @@ -348,6 +333,7 @@ impl MulticastGroupReconciler { &member, &instance_states, dataplane_client, + sled_client, ) .await; (member, res) @@ -364,7 +350,7 @@ impl MulticastGroupReconciler { StateTransition::StateChanged | StateTransition::NoChange => { processed += 1; - debug!( + trace!( opctx.log, "processed member state change"; "member" => ?member, @@ -374,7 +360,7 @@ impl MulticastGroupReconciler { } StateTransition::NeedsCleanup => { processed += 1; - debug!( + trace!( opctx.log, "member marked for cleanup"; "member" => ?member, @@ -382,7 +368,7 @@ impl MulticastGroupReconciler { ); } StateTransition::EntityGone => { - debug!( + trace!( opctx.log, "member deleted during processing"; "member" => ?member, @@ -407,7 +393,7 @@ impl MulticastGroupReconciler { /// Main dispatch function for processing member state changes. 
/// - /// Routes to appropriate node based on member type. + /// Routes to the appropriate handler based on member state. async fn process_member_state( &self, opctx: &OpContext, @@ -415,6 +401,7 @@ impl MulticastGroupReconciler { member: &MulticastGroupMember, instance_states: &InstanceStateMap, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result { // Check if the parent group has been deleted or is being deleted. // If so, delete the member so cleanup can proceed. @@ -444,43 +431,24 @@ impl MulticastGroupReconciler { // For now, all members are instance-based, but this is where we'd // dispatch to different processors for different member types let processor = InstanceMemberProcessor; + let ctx = MemberReconcileCtx { + opctx, + group, + member, + instance_states, + dataplane_client, + sled_client, + }; match member.state { MulticastGroupMemberState::Joining => { - processor - .process_joining( - self, - opctx, - group, - member, - instance_states, - dataplane_client, - ) - .await + processor.process_joining(self, &ctx).await } MulticastGroupMemberState::Joined => { - processor - .process_joined( - self, - opctx, - group, - member, - instance_states, - dataplane_client, - ) - .await + processor.process_joined(self, &ctx).await } MulticastGroupMemberState::Left => { - processor - .process_left( - self, - opctx, - group, - member, - instance_states, - dataplane_client, - ) - .await + processor.process_left(self, &ctx).await } } } @@ -495,7 +463,7 @@ impl MulticastGroupReconciler { ) -> Result { // Skip if member is already deleted if member.time_deleted.is_some() { - debug!( + trace!( opctx.log, "member already deleted, no action needed"; "member_id" => %member.id, @@ -532,35 +500,25 @@ impl MulticastGroupReconciler { /// when ready. Uses CAS operations for concurrent-safe state updates. 
async fn handle_instance_joining( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_states: &InstanceStateMap, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result { - // Extract pre-fetched instance state - let (instance_valid, current_sled_id) = - self.get_instance_state_from_cache(instance_states, member); + let instance_state = + self.get_instance_state_from_cache(ctx.instance_states, ctx.member); - // Execute reconciliation CAS operation let reconcile_res = self .execute_joining_reconciliation( - opctx, - group, - member, - instance_valid, - current_sled_id, + ctx.opctx, + ctx.group, + ctx.member, + instance_state.valid, + instance_state.sled_id, ) .await?; - // Process reconciliation result self.process_joining_reconcile_result( - opctx, - group, - member, - instance_valid, + ctx, + instance_state, reconcile_res, - dataplane_client, ) .await } @@ -570,8 +528,8 @@ impl MulticastGroupReconciler { &self, instance_states: &InstanceStateMap, member: &MulticastGroupMember, - ) -> (bool, Option) { - instance_states.get(&member.parent_id).copied().unwrap_or((false, None)) + ) -> InstanceMulticastState { + instance_states.get(&member.parent_id).copied().unwrap_or_default() } /// Execute the reconciliation CAS operation for a member in "Joining" state. @@ -600,39 +558,29 @@ impl MulticastGroupReconciler { /// Process the result of a "Joining" state reconciliation operation. 
async fn process_joining_reconcile_result( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_valid: bool, + ctx: &MemberReconcileCtx<'_>, + instance_state: InstanceMulticastState, reconcile_result: ReconcileJoiningResult, - dataplane_client: &MulticastDataplaneClient, ) -> Result { match reconcile_result.action { ReconcileAction::TransitionedToLeft => { - self.handle_transitioned_to_left(opctx, group, member).await + self.handle_transitioned_to_left( + ctx.opctx, ctx.group, ctx.member, + ) + .await } ReconcileAction::UpdatedSledId { old, new } => { self.handle_sled_id_updated( - opctx, - group, - member, - instance_valid, + ctx, + instance_state, SledIdUpdate { old, new }, - dataplane_client, ) .await } ReconcileAction::NotFound | ReconcileAction::NoChange => { - self.handle_no_change_or_not_found( - opctx, - group, - member, - instance_valid, - dataplane_client, - ) - .await + self.handle_no_change_or_not_found(ctx, instance_state).await } } } @@ -660,63 +608,43 @@ impl MulticastGroupReconciler { /// Handle the case where a member's sled_id was updated. 
async fn handle_sled_id_updated( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_valid: bool, + ctx: &MemberReconcileCtx<'_>, + instance_state: InstanceMulticastState, sled_id_update: SledIdUpdate, - dataplane_client: &MulticastDataplaneClient, ) -> Result { - debug!( - opctx.log, + trace!( + ctx.opctx.log, "updated member sled_id, checking if ready to join"; - "member_id" => %member.id, + "member_id" => %ctx.member.id, "old_sled_id" => ?sled_id_update.old, "new_sled_id" => ?sled_id_update.new, - "group_state" => ?group.state, - "instance_valid" => instance_valid + "group_state" => ?ctx.group.state, + "instance_valid" => instance_state.valid ); - self.try_complete_join_if_ready( - opctx, - group, - member, - instance_valid, - dataplane_client, - ) - .await + self.try_complete_join_if_ready(ctx, instance_state).await } /// Handle the case where no changes were made or member was not found. async fn handle_no_change_or_not_found( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_valid: bool, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, + instance_state: InstanceMulticastState, ) -> Result { // Check if member is already in Joined state - if member.state == MulticastGroupMemberState::Joined { - debug!( - opctx.log, + if ctx.member.state == MulticastGroupMemberState::Joined { + trace!( + ctx.opctx.log, "member already in 'Joined' state, no action needed"; - "member_id" => %member.id, - "group_id" => %group.id(), - "group_name" => group.name().as_str() + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id(), + "group_name" => ctx.group.name().as_str() ); return Ok(StateTransition::NoChange); } // Try to complete the join if conditions are met - self.try_complete_join_if_ready( - opctx, - group, - member, - instance_valid, - dataplane_client, - ) - .await + self.try_complete_join_if_ready(ctx, instance_state).await } fn 
is_ready_to_join( @@ -729,30 +657,31 @@ impl MulticastGroupReconciler { async fn try_complete_join_if_ready( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_valid: bool, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, + instance_state: InstanceMulticastState, ) -> Result { - if self.is_ready_to_join(group, instance_valid) { - self.complete_instance_member_join( - opctx, - group, - member, - dataplane_client, - ) - .await?; - Ok(StateTransition::StateChanged) + if self.is_ready_to_join(ctx.group, instance_state.valid) { + let joined = self + .complete_instance_member_join( + ctx, + None, + instance_state.propolis_id, + ) + .await?; + if joined { + Ok(StateTransition::StateChanged) + } else { + Ok(StateTransition::NoChange) + } } else { - debug!( - opctx.log, + trace!( + ctx.opctx.log, "member not ready to join: waiting for next run"; - "member_id" => %member.id, - "group_id" => %group.id(), - "group_name" => group.name().as_str(), - "instance_valid" => instance_valid, - "group_state" => ?group.state + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id(), + "group_name" => ctx.group.name().as_str(), + "instance_valid" => instance_state.valid, + "group_state" => ?ctx.group.state ); Ok(StateTransition::NoChange) } @@ -761,62 +690,61 @@ impl MulticastGroupReconciler { /// Instance-specific handler for members in "Joined" state. 
async fn handle_instance_joined( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_states: &InstanceStateMap, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result { - // Get pre-fetched instance state and sled_id - let (instance_valid, current_sled_id) = instance_states - .get(&member.parent_id) + let instance_state = ctx + .instance_states + .get(&ctx.member.parent_id) .copied() - .unwrap_or((false, None)); + .unwrap_or_default(); - match (instance_valid, current_sled_id) { - // Invalid instance -> remove from dataplane and transition to "Left" + match (instance_state.valid, instance_state.sled_id) { (false, _) => { self.handle_invalid_instance( - opctx, - group, - member, - dataplane_client, + ctx.opctx, + ctx.group, + ctx.member, + ctx.dataplane_client, + ctx.sled_client, ) .await } - // Valid instance with sled, but sled changed (migration) - (true, Some(sled_id)) if member.sled_id != Some(sled_id.into()) => { + (true, Some(sled_id)) + if ctx.member.sled_id != Some(sled_id.into()) => + { self.handle_sled_migration( - opctx, - group, - member, + ctx, sled_id, - dataplane_client, + instance_state.propolis_id, ) .await } - // Valid instance with sled, sled unchanged -> verify configuration (true, Some(_)) => { - self.verify_members(opctx, group, member, dataplane_client) - .await?; + self.verify_members( + ctx.opctx, + ctx.group, + ctx.member, + ctx.dataplane_client, + ctx.sled_client, + ) + .await?; trace!( - opctx.log, + ctx.opctx.log, "member configuration verified, no changes needed"; - "member_id" => %member.id, - "group_id" => %group.id() + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id() ); Ok(StateTransition::NoChange) } - // Valid instance but no sled_id (shouldn't typically happen in "Joined" state) (true, None) => { self.handle_joined_without_sled( - opctx, - group, - member, - dataplane_client, + ctx.opctx, + ctx.group, + ctx.member, + 
ctx.dataplane_client, ) .await } @@ -830,13 +758,14 @@ impl MulticastGroupReconciler { group: &MulticastGroup, member: &MulticastGroupMember, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result { // Remove from dataplane first if let Err(e) = self .remove_member_from_dataplane(opctx, member, dataplane_client) .await { - debug!( + warn!( opctx.log, "failed to remove member from dataplane, will retry"; "member_id" => %member.id, @@ -845,6 +774,24 @@ impl MulticastGroupReconciler { return Err(e); } + // Unsubscribe the VMM from the multicast group before the CAS + // clears the sled ID. Best-effort since the VMM may already be torn + // down. + if let Some(sled_id) = member.sled_id { + if let Err(e) = sled_client + .unsubscribe_vmm(opctx, group, member, sled_id.into(), None) + .await + { + warn!( + opctx.log, + "failed to unsubscribe VMM during instance invalidation"; + "member_id" => %member.id, + "sled_id" => %sled_id, + "error" => %e + ); + } + } + // Update database state (atomically set "Left" and clear `sled_id`) let updated = self .datastore @@ -870,6 +817,21 @@ impl MulticastGroupReconciler { return Ok(StateTransition::NoChange); } + // Propagate updated M2P/forwarding to all sleds so the + // dataplane reflects the member's departure. Best-effort since + // group reconciliation will converge if this fails. 
+ if let Err(e) = + sled_client.propagate_m2p_and_forwarding(opctx, group).await + { + warn!( + opctx.log, + "failed to propagate M2P/forwarding after member leave"; + "member_id" => %member.id, + "group_id" => %group.id(), + "error" => %e + ); + } + info!( opctx.log, "multicast member lifecycle transition: 'Joined' → 'Left' (instance invalid)"; @@ -877,7 +839,6 @@ impl MulticastGroupReconciler { "instance_id" => %member.parent_id, "group_id" => %group.id(), "group_multicast_ip" => %group.multicast_ip, - "dpd_operation" => "remove_member_from_underlay_group", "reason" => "instance_no_longer_valid_for_multicast_traffic" ); Ok(StateTransition::StateChanged) @@ -886,46 +847,58 @@ impl MulticastGroupReconciler { /// Handle sled migration for a "Joined" member. async fn handle_sled_migration( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, + ctx: &MemberReconcileCtx<'_>, new_sled_id: SledUuid, - dataplane_client: &MulticastDataplaneClient, + cached_propolis_id: Option, ) -> Result { info!( - opctx.log, + ctx.opctx.log, "detected sled migration for 'Joined' member: re-applying configuration"; - "member_id" => %member.id, - "instance_id" => %member.parent_id, - "group_id" => %group.id(), - "group_name" => group.name().as_str(), - "group_multicast_ip" => %group.multicast_ip, - "old_sled_id" => ?member.sled_id, + "member_id" => %ctx.member.id, + "instance_id" => %ctx.member.parent_id, + "group_id" => %ctx.group.id(), + "group_name" => ctx.group.name().as_str(), + "group_multicast_ip" => %ctx.group.multicast_ip, + "old_sled_id" => ?ctx.member.sled_id, "new_sled_id" => %new_sled_id ); // Remove from old sled's dataplane first if let Err(e) = self - .remove_member_from_dataplane(opctx, member, dataplane_client) + .remove_member_from_dataplane( + ctx.opctx, + ctx.member, + ctx.dataplane_client, + ) .await { - debug!( - opctx.log, + warn!( + ctx.opctx.log, "failed to remove member from old sled, will retry"; - "member_id" => %member.id, - 
"old_sled_id" => ?member.sled_id, + "member_id" => %ctx.member.id, + "old_sled_id" => ?ctx.member.sled_id, "error" => ?e ); return Err(e); } - // Update sled_id in database using CAS + // Source-sled OPTE cleanup (M2P, forwarding, port subscription) + // is handled by VMM teardown: remove_propolis_zone -> + // release_opte_ports -> PortTicket::release_inner, which + // clears multicast subscriptions along with V2P and firewall + // rules. + // + // This is consistent with all other OPTE state. Nexus + // never explicitly calls sled-agent for source-sled cleanup + // after migration. + + // Update `sled_id` in database using CAS let updated = self .datastore .multicast_group_member_update_sled_id_if_current( - opctx, - InstanceUuid::from_untyped_uuid(member.parent_id), - member.sled_id, + ctx.opctx, + InstanceUuid::from_untyped_uuid(ctx.member.parent_id), + ctx.member.sled_id, Some(new_sled_id.into()), ) .await @@ -935,49 +908,53 @@ impl MulticastGroupReconciler { if !updated { debug!( - opctx.log, + ctx.opctx.log, "skipping sled_id update after migration due to concurrent change"; - "member_id" => %member.id, - "group_id" => %group.id(), - "old_sled_id" => ?member.sled_id, + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id(), + "old_sled_id" => ?ctx.member.sled_id, "new_sled_id" => %new_sled_id ); return Ok(StateTransition::NoChange); } - // Re-apply configuration on new sled - // If this fails (e.g., sled not yet in inventory), transition to "Joining" for retry + // Re-apply configuration on new sled. Pass `new_sled_id` explicitly + // because the in-memory member struct still has the old sled_id. 
match self .complete_instance_member_join( - opctx, - group, - member, - dataplane_client, + ctx, + Some(new_sled_id), + cached_propolis_id, ) .await { - Ok(()) => { + Ok(joined) => { info!( - opctx.log, + ctx.opctx.log, "member configuration re-applied after sled migration"; - "member_id" => %member.id, - "instance_id" => %member.parent_id, - "group_id" => %group.id(), - "group_name" => group.name().as_str(), - "group_multicast_ip" => %group.multicast_ip, + "member_id" => %ctx.member.id, + "instance_id" => %ctx.member.parent_id, + "group_id" => %ctx.group.id(), + "group_name" => ctx.group.name().as_str(), + "group_multicast_ip" => %ctx.group.multicast_ip, "new_sled_id" => %new_sled_id, - "dpd_operation" => "re_add_member_to_underlay_multicast_group" + "dpd_operation" => "re_add_member_to_underlay_multicast_group", + "joined" => joined ); - Ok(StateTransition::StateChanged) + if joined { + Ok(StateTransition::StateChanged) + } else { + Ok(StateTransition::NoChange) + } } Err(e) => { // Failed to join on new sled. We transition to "Joining" and // retry next cycle/run. 
warn!( - opctx.log, + ctx.opctx.log, "failed to complete join on new sled after migration: transitioning to 'Joining' for retry"; - "member_id" => %member.id, - "group_id" => %group.id(), + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id(), "new_sled_id" => %new_sled_id, "error" => %e ); @@ -1005,9 +982,9 @@ impl MulticastGroupReconciler { let updated = self .datastore .multicast_group_member_set_state_if_current( - opctx, - MulticastGroupUuid::from_untyped_uuid(group.id()), - InstanceUuid::from_untyped_uuid(member.parent_id), + ctx.opctx, + MulticastGroupUuid::from_untyped_uuid(ctx.group.id()), + InstanceUuid::from_untyped_uuid(ctx.member.parent_id), MulticastGroupMemberState::Joined, MulticastGroupMemberState::Joining, ) @@ -1018,10 +995,10 @@ impl MulticastGroupReconciler { if updated { info!( - opctx.log, + ctx.opctx.log, "member transitioned to 'Joining': will retry on next reconciliation run"; - "member_id" => %member.id, - "group_id" => %group.id(), + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id(), "new_sled_id" => %new_sled_id ); Ok(StateTransition::StateChanged) @@ -1103,22 +1080,27 @@ impl MulticastGroupReconciler { /// Instance-specific handler for members in "Left" state. async fn handle_instance_left( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - instance_states: &InstanceStateMap, - dataplane_client: &MulticastDataplaneClient, + ctx: &MemberReconcileCtx<'_>, ) -> Result { - // Get pre-fetched instance state and sled_id - let (instance_valid, current_sled_id) = instance_states - .get(&member.parent_id) + let InstanceMulticastState { + valid: instance_valid, + sled_id: current_sled_id, + .. 
+ } = ctx + .instance_states + .get(&ctx.member.parent_id) .copied() - .unwrap_or((false, None)); - - // Handle permanent deletion first - if member.time_deleted.is_some() { - self.cleanup_deleted_member(opctx, group, member, dataplane_client) - .await?; + .unwrap_or_default(); + + if ctx.member.time_deleted.is_some() { + self.cleanup_deleted_member( + ctx.opctx, + ctx.group, + ctx.member, + ctx.dataplane_client, + ctx.sled_client, + ) + .await?; return Ok(StateTransition::NeedsCleanup); } @@ -1129,27 +1111,57 @@ impl MulticastGroupReconciler { // - sled_id is None (uses fallback path) // - member was already removed from DPD if let Err(e) = self - .remove_member_from_dataplane(opctx, member, dataplane_client) + .remove_member_from_dataplane( + ctx.opctx, + ctx.member, + ctx.dataplane_client, + ) .await { - debug!( - opctx.log, + warn!( + ctx.opctx.log, "failed to clean up DPD state for 'Left' member (will retry)"; - "member_id" => %member.id, + "member_id" => %ctx.member.id, "error" => ?e ); - // Continue to reactivation even on cleanup failure because - // the add operation may succeed if the port was already removed } - // Handle reactivation: instance valid and group active -> transition to "Joining" - if instance_valid && group.state == MulticastGroupState::Active { + // Unsubscribe the VMM's OPTE port from this multicast group. + // Best-effort since if the VMM is already gone, there's nothing to + // unsubscribe (the OPTE port was destroyed with the VMM). 
+ if let Some(sled_id) = ctx.member.sled_id { + if let Err(e) = ctx + .sled_client + .unsubscribe_vmm( + ctx.opctx, + ctx.group, + ctx.member, + sled_id.into(), + None, + ) + .await + { + warn!( + ctx.opctx.log, + "failed to unsubscribe VMM from multicast group"; + "member_id" => %ctx.member.id, + "sled_id" => %sled_id, + "error" => %e + ); + } + } + + if instance_valid && ctx.group.state == MulticastGroupState::Active { return self - .reactivate_left_member(opctx, group, member, current_sled_id) + .reactivate_left_member( + ctx.opctx, + ctx.group, + ctx.member, + current_sled_id, + ) .await; } - // Stay in "Left" state Ok(StateTransition::NoChange) } @@ -1250,10 +1262,10 @@ impl MulticastGroupReconciler { // Build the state map from the fetched data state_map.extend(members.iter().map(|member| { - let (is_valid, sled_id) = if let Some((instance, vmm_opt)) = + let state = if let Some((instance, vmm_opt)) = instance_vmm_data.get(&member.parent_id) { - let is_valid = matches!( + let valid = matches!( instance.nexus_state.state(), InstanceState::Creating | InstanceState::Starting @@ -1267,13 +1279,16 @@ impl MulticastGroupReconciler { SledUuid::from_untyped_uuid(vmm.sled_id.into_untyped_uuid()) }); - (is_valid, sled_id) + let propolis_id = vmm_opt + .as_ref() + .map(|vmm| PropolisUuid::from_untyped_uuid(vmm.id)); + + InstanceMulticastState { valid, sled_id, propolis_id } } else { - // Instance not found (mark as invalid) - (false, None) + InstanceMulticastState::default() }; - (member.parent_id, (is_valid, sled_id)) + (member.parent_id, state) })); debug!( @@ -1381,76 +1396,144 @@ impl MulticastGroupReconciler { } } - /// Complete a member join operation ("Joining" -> "Joined") for an instance. + /// Complete a member join by configuring the dataplane and subscribing + /// the VMM. + /// + /// When `sled_id_override` is provided (e.g., during migration), it + /// is used instead of the potentially stale `member.sled_id`. 
+ /// + /// # Returns + /// + /// `Ok(true)` when the join completed successfully. `Ok(false)` when no + /// sled was available and the operation was a no-op. async fn complete_instance_member_join( &self, - opctx: &OpContext, - group: &MulticastGroup, - member: &MulticastGroupMember, - dataplane_client: &MulticastDataplaneClient, - ) -> Result<(), anyhow::Error> { + ctx: &MemberReconcileCtx<'_>, + sled_id_override: Option, + cached_propolis_id: Option, + ) -> Result { debug!( - opctx.log, + ctx.opctx.log, "completing member join"; - "member" => ?member, - "group" => ?group + "member" => ?ctx.member, + "group" => ?ctx.group ); - // Get sled_id from member record, or look it up and update if missing - let sled_id = match member.sled_id { - Some(id) => id, - None => { - match self - .lookup_and_update_member_sled_id(opctx, member) - .await? - { - Some(id) => id, - None => return Ok(()), // No sled available, cannot join - } - } + // Use the override if provided, then the member's cached sled_id, + // then look it up from the instance as a last resort. + let sled_id: SledUuid = if let Some(id) = + sled_id_override.or(ctx.member.sled_id.map(Into::into)) + { + id + } else if let Some(id) = + self.lookup_and_update_member_sled_id(ctx.opctx, ctx.member).await? + { + id.into() + } else { + return Ok(false); }; self.add_member_to_dataplane( - opctx, - group, - member, - sled_id.into(), - dataplane_client, + ctx.opctx, + ctx.group, + ctx.member, + sled_id, + ctx.dataplane_client, ) .await?; - // Transition to "Joined" state (only if still in "Joining") - let updated = self - .datastore - .multicast_group_member_set_state_if_current( - opctx, - MulticastGroupUuid::from_untyped_uuid(group.id()), - InstanceUuid::from_untyped_uuid(member.parent_id), - MulticastGroupMemberState::Joining, - MulticastGroupMemberState::Joined, + // If the member is already in a "Joined" state (migration path), skip + // the state transition but still propagate and subscribe. 
During + // migration the caller updates the sled ID without changing state, + // so we must not gate propagation on this CAS. + if ctx.member.state != MulticastGroupMemberState::Joined { + let updated = self + .datastore + .multicast_group_member_set_state_if_current( + ctx.opctx, + MulticastGroupUuid::from_untyped_uuid(ctx.group.id()), + InstanceUuid::from_untyped_uuid(ctx.member.parent_id), + MulticastGroupMemberState::Joining, + MulticastGroupMemberState::Joined, + ) + .await + .context( + "failed to conditionally transition member to 'Joined' state", + )?; + + if !updated { + debug!( + ctx.opctx.log, + "skipping Joining→Joined transition due to concurrent update"; + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id() + ); + // Concurrent update moved the member away from the "Joining" + // state, so skip propagation and subscribe. + return Ok(false); + } + } + + // Propagate M2P mappings and forwarding entries to all sleds. + // The member is now "Joined" in the database, so propagate includes + // this sled in forwarding next-hops. If propagation or subscribe + // fails below, the member remains "Joined" with incomplete sled + // state. The reconciler's next pass converges via + // `handle_instance_joined` -> `verify_members`. + // + // Propagation failures are best-effort since the reconciler will + // re-converge all sleds on the next cycle. Subscribe failures + // below are treated as hard errors because the VMM cannot + // receive traffic without an OPTE port subscription. + if let Err(e) = ctx + .sled_client + .propagate_m2p_and_forwarding(ctx.opctx, ctx.group) + .await + { + warn!( + ctx.opctx.log, + "failed to propagate M2P/forwarding after member join"; + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id(), + "error" => %e + ); + } + + // Subscribe the VMM's OPTE port last. Propagation above is + // best-effort, and any sleds that failed will be converged by the + // reconciler on the next cycle. 
+ if let Err(e) = ctx + .sled_client + .subscribe_vmm( + ctx.opctx, + ctx.group, + ctx.member, + sled_id, + cached_propolis_id, ) .await - .context( - "failed to conditionally transition member to 'Joined' state", - )?; - if !updated { - debug!( - opctx.log, - "skipping Joining→Joined transition due to concurrent update"; - "member_id" => %member.id, - "group_id" => %group.id() + { + warn!( + ctx.opctx.log, + "failed to subscribe VMM to multicast group via sled-agent \ + (will retry next cycle)"; + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id(), + "sled_id" => %sled_id, + "error" => %e ); + return Err(e); } info!( - opctx.log, + ctx.opctx.log, "member join completed"; - "member_id" => %member.id, - "group_id" => %group.id(), + "member_id" => %ctx.member.id, + "group_id" => %ctx.group.id(), "sled_id" => %sled_id ); - Ok(()) + Ok(true) } /// Apply member dataplane configuration (via DPD-client). @@ -1870,6 +1953,10 @@ impl MulticastGroupReconciler { /// - Removing the member from any unexpected/stale rear ports /// - Adding the member to expected ports /// + /// If the sled cannot be resolved (e.g., decommissioned), the member + /// is transitioned to "Left" and M2P/forwarding is propagated inline + /// to remove stale entries. + /// /// This handles cases like `sp_slot` changes where the sled's physical /// location changed but the `sled_id` stayed the same. async fn verify_members( @@ -1878,6 +1965,7 @@ impl MulticastGroupReconciler { group: &MulticastGroup, member: &MulticastGroupMember, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result<(), anyhow::Error> { debug!( opctx.log, @@ -1940,6 +2028,23 @@ impl MulticastGroupReconciler { ) .await; + // Unsubscribe the VMM before the CAS clears sled_id, + // otherwise the OPTE subscription is stranded with no + // way to identify the sled on later passes. Best-effort + // since the VMM may already be torn down. 
+ if let Err(e) = sled_client + .unsubscribe_vmm(opctx, group, member, sled_id.into(), None) + .await + { + warn!( + opctx.log, + "failed to unsubscribe VMM during port resolution failure"; + "member_id" => %member.id, + "sled_id" => %sled_id, + "error" => %e + ); + } + let updated = self .datastore .multicast_group_member_to_left_if_current( @@ -1952,6 +2057,21 @@ impl MulticastGroupReconciler { .context("failed to transition member to 'Left' after port resolution failure")?; if updated { + // Propagate updated M2P/forwarding to remove + // stale entries for this now-Left member. + if let Err(e) = sled_client + .propagate_m2p_and_forwarding(opctx, group) + .await + { + warn!( + opctx.log, + "failed to propagate M2P/forwarding after \ + member left due to unresolvable sled"; + "member_id" => %member.id, + "group_id" => %group.id(), + "error" => %e + ); + } info!( opctx.log, "member transitioned to 'Left': sled no longer resolvable"; @@ -2105,6 +2225,23 @@ impl MulticastGroupReconciler { } } + // Ensure the VMM subscription is in place for the current propolis_id. + // This is idempotent and covers cases where the propolis_id changed + // (e.g., after live migration) but the sled_id stayed the same. + if let Err(e) = sled_client + .subscribe_vmm(opctx, group, member, sled_id.into(), None) + .await + { + warn!( + opctx.log, + "failed to verify VMM subscription during member verification"; + "member_id" => %member.id, + "sled_id" => %sled_id, + "error" => %e + ); + return Err(e); + } + info!( opctx.log, "member verification completed"; @@ -2607,13 +2744,33 @@ impl MulticastGroupReconciler { } /// Cleanup a member that is marked for deletion (time_deleted set). + /// + /// This includes unsubscribing a member from its VMM, removing + /// it from the dataplane, and hard-deleting the DB row. 
async fn cleanup_deleted_member( &self, opctx: &OpContext, group: &MulticastGroup, member: &MulticastGroupMember, dataplane_client: &MulticastDataplaneClient, + sled_client: &MulticastSledClient, ) -> Result<(), anyhow::Error> { + // Unsubscribe from sled-agent (best-effort, VMM may be gone). + if let Some(sled_id) = member.sled_id { + if let Err(e) = sled_client + .unsubscribe_vmm(opctx, group, member, sled_id.into(), None) + .await + { + debug!( + opctx.log, + "failed to unsubscribe VMM during member cleanup"; + "member_id" => %member.id, + "sled_id" => %sled_id, + "error" => %e + ); + } + } + // Use the consolidated cleanup helper with strict error handling self.cleanup_member_from_dataplane( opctx, diff --git a/nexus/src/app/background/tasks/multicast/mod.rs b/nexus/src/app/background/tasks/multicast/mod.rs index 8f592a41087..dc21e9ec0e8 100644 --- a/nexus/src/app/background/tasks/multicast/mod.rs +++ b/nexus/src/app/background/tasks/multicast/mod.rs @@ -84,7 +84,7 @@ //! - Unlike linear probing (`h + i`), scattered outputs avoid clustering //! - **8-bit salt**: 256 unique underlay addresses per external IP //! - **Resolution**: Exhaustion requires 256 other groups to occupy exactly -//! those 256 scattered addresses—effectively impossible in 2^64 space +//! those 256 scattered addresses, effectively impossible in 2^64 space //! //! ### Forwarding Architecture (Incoming multicast traffic to guests) //! 
@@ -151,6 +151,7 @@ use sled_hardware_types::BaseboardId; use crate::app::background::BackgroundTask; use crate::app::multicast::dataplane::MulticastDataplaneClient; +use crate::app::multicast::sled::MulticastSledClient; use crate::app::saga::StartSaga; pub(crate) mod groups; @@ -362,7 +363,7 @@ impl MulticastGroupReconciler { /// │ 6 │ 0xa ⊕ 6 │ 0xc │ /// │ 7 │ 0xa ⊕ 7 │ 0xd │ /// └──────┴─────────┴────────┘ -/// Outputs: [a, b, 8, 9, e, f, c, d] — scattered, not sequential +/// Outputs: [a, b, 8, 9, e, f, c, d] (scattered, not sequential) /// ``` /// /// On collision (i.e., underlay IP already in use), we increment salt and retry. @@ -533,6 +534,10 @@ impl MulticastGroupReconciler { } }; + // Create sled-agent client for OPTE subscriptions and + // M2P/forwarding propagation. + let sled_client = MulticastSledClient::new(self.datastore.clone()); + // Process creating groups match self.reconcile_creating_groups(opctx).await { Ok(count) => status.groups_created += count, @@ -543,7 +548,10 @@ impl MulticastGroupReconciler { } // Process member state changes - match self.reconcile_member_states(opctx, &dataplane_client).await { + match self + .reconcile_member_states(opctx, &dataplane_client, &sled_client) + .await + { Ok(count) => status.members_processed += count, Err(e) => { let msg = format!("failed to reconcile member states: {e:#}"); @@ -574,7 +582,10 @@ impl MulticastGroupReconciler { } // Reconcile active groups (verify state, update dataplane as needed) - match self.reconcile_active_groups(opctx, &dataplane_client).await { + match self + .reconcile_active_groups(opctx, &dataplane_client, &sled_client) + .await + { Ok(count) => status.groups_verified += count, Err(e) => { let msg = format!("failed to reconcile active groups: {e:#}"); @@ -583,7 +594,10 @@ impl MulticastGroupReconciler { } // Process deleting groups (DPD cleanup + hard-delete from DB) - match self.reconcile_deleting_groups(opctx, &dataplane_client).await { + match self + 
.reconcile_deleting_groups(opctx, &dataplane_client, &sled_client) + .await + { Ok(count) => status.groups_deleted += count, Err(e) => { let msg = format!("failed to reconcile deleting groups: {e:#}"); diff --git a/nexus/src/app/multicast/dataplane.rs b/nexus/src/app/multicast/dataplane.rs index 991a4ef1082..ba69cfdeb35 100644 --- a/nexus/src/app/multicast/dataplane.rs +++ b/nexus/src/app/multicast/dataplane.rs @@ -114,7 +114,8 @@ trait IntoUnderlayMulticast { impl IntoUnderlayMulticast for IpAddr { fn into_underlay_multicast(self) -> Result { match self { - IpAddr::V6(ipv6) => Ok(UnderlayMulticastIpv6(ipv6)), + IpAddr::V6(ipv6) => UnderlayMulticastIpv6::try_from(ipv6) + .map_err(|e| Error::invalid_request(e.to_string())), IpAddr::V4(_) => Err(Error::invalid_request( "underlay multicast groups must use IPv6 addresses", )), @@ -154,6 +155,13 @@ pub(crate) struct GroupUpdateParams<'a> { pub source_filter: &'a SourceFilterState, } +/// Resolved DPD parameters derived from an external/underlay group pair. +struct ResolvedGroupParams { + vlan_id: Option, + underlay_ip: UnderlayMulticastIpv6, + nat_target: NatTarget, +} + impl MulticastDataplaneClient { /// Create a new client - builds fresh DPD clients for current switch /// topology. @@ -180,14 +188,67 @@ impl MulticastDataplaneClient { fn select_one_switch( &self, ) -> MulticastDataplaneResult<(&SwitchSlot, &dpd_client::Client)> { - let mut switches: Vec<_> = self.dpd_clients.iter().collect(); - switches.sort_by_key(|(loc, _)| *loc); - switches - .into_iter() - .next() + self.dpd_clients + .iter() + .min_by_key(|(loc, _)| *loc) .ok_or_else(|| Error::internal_error("no DPD clients available")) } + /// Resolve VLAN ID, underlay address, and NAT target from group pair. 
+ fn resolve_group_params( + external_group: &ExternalMulticastGroup, + underlay_group: &UnderlayMulticastGroup, + ) -> MulticastDataplaneResult { + let vlan_id = external_group + .mvlan + .map(|v| VlanID::new(v as u16)) + .transpose() + .map_err(|e| { + Error::internal_error(&format!("invalid VLAN ID: {e:#}")) + })? + .map(u16::from); + let underlay_ip = + underlay_group.multicast_ip.ip().into_underlay_multicast()?; + let underlay_ipv6 = match underlay_group.multicast_ip.ip() { + IpAddr::V6(ipv6) => ipv6, + IpAddr::V4(_) => { + return Err(Error::internal_error( + "underlay multicast groups must use IPv6 addresses", + )); + } + }; + let nat_target = NatTarget { + internal_ip: underlay_ipv6, + inner_mac: MacAddr { a: underlay_ipv6.derive_multicast_mac() }, + vni: Vni::from(u32::from(external_group.vni.0)), + }; + Ok(ResolvedGroupParams { vlan_id, underlay_ip, nat_target }) + } + + /// Compute DPD source filter from aggregated member source state. + /// + /// For SSM addresses, always returns specific sources. For ASM addresses, + /// returns `None` (any source) if any member omitted sources, otherwise + /// returns the union of all member sources. + fn compute_sources_for_dpd( + external_group_ip: IpAddr, + source_filter: &SourceFilterState, + ) -> Option> { + if is_ssm_address(external_group_ip) + || !source_filter.has_any_source_member + { + Some( + source_filter + .specific_sources + .iter() + .map(|ip| dpd_client::types::IpSrc::Exact(*ip)) + .collect(), + ) + } else { + None + } + } + async fn dpd_ensure_underlay_created( &self, client: &dpd_client::Client, @@ -398,58 +459,14 @@ impl MulticastDataplaneClient { Error::internal_error("multicast group missing tag") })?; - // Convert MVLAN to u16 for DPD, validating through VlanID - let vlan_id = external_group - .mvlan - .map(|v| VlanID::new(v as u16)) - .transpose() - .map_err(|e| { - Error::internal_error(&format!("invalid VLAN ID: {e:#}")) - })? 
- .map(u16::from); - let underlay_ip_admin = - underlay_group.multicast_ip.ip().into_underlay_multicast()?; - let underlay_ipv6 = match underlay_group.multicast_ip.ip() { - IpAddr::V6(ipv6) => ipv6, - IpAddr::V4(_) => { - return Err(Error::internal_error( - "underlay multicast groups must use IPv6 addresses", - )); - } - }; - - let nat_target = NatTarget { - internal_ip: underlay_ipv6, - inner_mac: MacAddr { a: underlay_ipv6.derive_multicast_mac() }, - vni: Vni::from(u32::from(external_group.vni.0)), - }; - + let ResolvedGroupParams { + vlan_id, + underlay_ip: underlay_ip_admin, + nat_target, + } = Self::resolve_group_params(external_group, underlay_group)?; let external_group_ip = external_group.multicast_ip.ip(); - - // Source filtering per RFC 4607: - // - SSM (232/8, ff3x::/32): always use specific sources. API - // validation prevents SSM joins without sources. - // - ASM: use specific sources when all members specify sources, - // otherwise None to allow any source at the switch level. - let sources_dpd = if is_ssm_address(external_group_ip) { - Some( - source_filter - .specific_sources - .iter() - .map(|ip| IpSrc::Exact(*ip)) - .collect::>(), - ) - } else if source_filter.has_any_source_member { - None - } else { - Some( - source_filter - .specific_sources - .iter() - .map(|ip| IpSrc::Exact(*ip)) - .collect::>(), - ) - }; + let sources_dpd = + Self::compute_sources_for_dpd(external_group_ip, source_filter); let create_operations = dpd_clients.into_iter().map(|(switch_slot, client)| { @@ -555,66 +572,20 @@ impl MulticastDataplaneClient { let dpd_clients = &self.dpd_clients; - // Pre-compute shared data once - // Convert MVLAN to u16 for DPD, validating through VlanID - let vlan_id = params - .external_group - .mvlan - .map(|v| VlanID::new(v as u16)) - .transpose() - .map_err(|e| { - Error::internal_error(&format!("invalid VLAN ID: {e:#}")) - })? 
- .map(u16::from); - let underlay_ip_admin = params - .underlay_group - .multicast_ip - .ip() - .into_underlay_multicast()?; - let underlay_ipv6 = match params.underlay_group.multicast_ip.ip() { - IpAddr::V6(ipv6) => ipv6, - IpAddr::V4(_) => { - return Err(Error::internal_error( - "underlay multicast groups must use IPv6 addresses", - )); - } - }; - - let nat_target = NatTarget { - internal_ip: underlay_ipv6, - inner_mac: MacAddr { a: underlay_ipv6.derive_multicast_mac() }, - vni: Vni::from(u32::from(params.external_group.vni.0)), - }; - + let ResolvedGroupParams { + vlan_id, + underlay_ip: underlay_ip_admin, + nat_target, + } = Self::resolve_group_params( + params.external_group, + params.underlay_group, + )?; let new_name_str = params.new_name.to_string(); let external_group_ip = params.external_group.multicast_ip.ip(); - - // Source filtering per RFC 4607: - // - SSM (232/8, ff3x::/32): always use specific sources. API - // validation prevents SSM joins without sources. - // - ASM: use specific sources when all members specify sources, - // otherwise None to allow any source at the switch level. 
- let sources_dpd = if is_ssm_address(external_group_ip) { - Some( - params - .source_filter - .specific_sources - .iter() - .map(|ip| IpSrc::Exact(*ip)) - .collect::>(), - ) - } else if params.source_filter.has_any_source_member { - None - } else { - Some( - params - .source_filter - .specific_sources - .iter() - .map(|ip| IpSrc::Exact(*ip)) - .collect::>(), - ) - }; + let sources_dpd = Self::compute_sources_for_dpd( + external_group_ip, + params.source_filter, + ); let update_operations = dpd_clients.into_iter().map(|(switch_slot, client)| { diff --git a/nexus/src/app/multicast/mod.rs b/nexus/src/app/multicast/mod.rs index c5f6a066839..cac27bcc173 100644 --- a/nexus/src/app/multicast/mod.rs +++ b/nexus/src/app/multicast/mod.rs @@ -70,6 +70,7 @@ use omicron_common::api::external::{ use omicron_uuid_kinds::{GenericUuid, InstanceUuid, MulticastGroupUuid}; pub(crate) mod dataplane; +pub(crate) mod sled; /// Validate that SSM addresses have source IPs. /// @@ -872,4 +873,32 @@ mod tests { 0xff1e, 0, 0, 0, 0, 0, 0, 1 )))); } + + #[test] + fn test_generate_group_name_from_ip() { + let v4 = IpAddr::V4(Ipv4Addr::new(224, 1, 2, 3)); + assert_eq!( + generate_group_name_from_ip(v4).unwrap().as_str(), + "mcast-224-1-2-3" + ); + + let v4_zeros = IpAddr::V4(Ipv4Addr::new(224, 0, 0, 1)); + assert_eq!( + generate_group_name_from_ip(v4_zeros).unwrap().as_str(), + "mcast-224-0-0-1" + ); + + let v6: IpAddr = IpAddr::V6(Ipv6Addr::new(0xff0e, 0, 0, 0, 0, 0, 0, 1)); + assert_eq!( + generate_group_name_from_ip(v6).unwrap().as_str(), + "mcast-ff0e-0-0-0-0-0-0-1" + ); + + let v6_ssm: IpAddr = + IpAddr::V6(Ipv6Addr::new(0xff3e, 0, 0, 0, 0, 0, 0, 0xabcd)); + assert_eq!( + generate_group_name_from_ip(v6_ssm).unwrap().as_str(), + "mcast-ff3e-0-0-0-0-0-0-abcd" + ); + } } diff --git a/nexus/src/app/multicast/sled.rs b/nexus/src/app/multicast/sled.rs new file mode 100644 index 00000000000..039dfc74abe --- /dev/null +++ b/nexus/src/app/multicast/sled.rs @@ -0,0 +1,560 @@ +// This Source Code Form 
is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Sled-agent multicast operations for OPTE subscriptions, M2P mappings, +//! and forwarding entries. +//! +//! Parallel to [`dataplane`] which handles DPD switch operations, this +//! module manages sled-local multicast state via sled-agent: +//! +//! - **OPTE subscriptions**: Per-VMM multicast group filters on the +//! hosting sled +//! - **M2P mappings**: Overlay multicast IP to underlay IPv6 address +//! translation, installed on all sleds +//! - **Forwarding entries**: Underlay multicast address to next-hop sled +//! replication lists, installed on all sleds +//! +//! [`dataplane`]: super::dataplane + +use std::collections::BTreeSet; +use std::net::{IpAddr, Ipv6Addr}; +use std::sync::Arc; + +use anyhow::Context; +use slog::{debug, info, warn}; + +use nexus_db_model::{ + MulticastGroup, MulticastGroupMember, MulticastGroupMemberState, +}; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::DataStore; +use nexus_types::deployment::SledFilter; +use nexus_types::identity::{Asset, Resource}; +use omicron_common::api::external::DataPageParams; +use omicron_uuid_kinds::{ + GenericUuid, InstanceUuid, MulticastGroupUuid, PropolisUuid, SledUuid, +}; +use sled_agent_client::types::{ + ClearMcast2Phys, ClearMcastForwarding, Mcast2PhysMapping, McastFilterMode, + McastForwardingEntry, McastForwardingNextHop, McastReplication, + McastSourceFilter, +}; + +/// Client for sled-agent multicast operations. +/// +/// Unlike [`MulticastDataplaneClient`] which pre-builds per-switch clients, +/// sled clients are constructed on demand since the target sled set varies +/// per group. 
+/// +/// [`MulticastDataplaneClient`]: super::dataplane::MulticastDataplaneClient +pub(crate) struct MulticastSledClient { + datastore: Arc, +} + +impl MulticastSledClient { + pub(crate) fn new(datastore: Arc) -> Self { + Self { datastore } + } + + /// Create a sled-agent client for the given sled. + /// + /// Looks up the sled's address in the database and constructs an HTTP + /// client. Follows the same pattern as V2P mapping propagation. + async fn sled_client( + &self, + opctx: &OpContext, + sled_id: SledUuid, + ) -> Result + { + nexus_networking::sled_client( + &self.datastore, + opctx, + sled_id, + &opctx.log, + ) + .await + } + + /// Look up the current `propolis_id` for an instance. + async fn lookup_propolis_id( + &self, + opctx: &OpContext, + instance_id: InstanceUuid, + ) -> Result, anyhow::Error> { + let instance_state = self + .datastore + .instance_get_state(opctx, &instance_id) + .await + .context("failed to look up instance state")? + .context("instance not found")?; + + Ok(instance_state.propolis_id.map(PropolisUuid::from_untyped_uuid)) + } + + /// Build the membership descriptor sent to sled-agent for + /// subscribe/unsubscribe calls. + fn membership_for( + group: &MulticastGroup, + member: &MulticastGroupMember, + ) -> sled_agent_client::types::InstanceMulticastMembership { + sled_agent_client::types::InstanceMulticastMembership { + group_ip: group.multicast_ip.ip(), + sources: member.source_ips.iter().map(|s| s.ip()).collect(), + } + } + + /// Subscribe a VMM to a multicast group via sled-agent. + /// + /// Looks up the instance's current `propolis_id` and calls the sled-agent + /// endpoint to configure OPTE port-level multicast filters. The member's + /// per-instance source IPs are passed for SSM filtering. 
+ pub(crate) async fn subscribe_vmm( + &self, + opctx: &OpContext, + group: &MulticastGroup, + member: &MulticastGroupMember, + sled_id: SledUuid, + cached_propolis_id: Option, + ) -> Result<(), anyhow::Error> { + let instance_id = InstanceUuid::from_untyped_uuid(member.parent_id); + let propolis_id = match cached_propolis_id { + Some(id) => id, + None => self + .lookup_propolis_id(opctx, instance_id) + .await? + .context("instance has no propolis_id, cannot subscribe")?, + }; + + let client = self + .sled_client(opctx, sled_id) + .await + .context("failed to create sled-agent client")?; + + let membership = Self::membership_for(group, member); + + client + .vmm_join_multicast_group(&propolis_id, &membership) + .await + .context("sled-agent vmm_join_multicast_group call failed")?; + + debug!( + opctx.log, + "subscribed VMM to multicast group via sled-agent"; + "member_id" => %member.id, + "propolis_id" => %propolis_id, + "sled_id" => %sled_id, + "group_ip" => %group.multicast_ip + ); + + Ok(()) + } + + /// Unsubscribe a VMM from a multicast group via sled-agent. + /// + /// Best-effort since if the VMM or sled is already gone, the unsubscribe + /// is effectively a no-op since the OPTE port was destroyed. + pub(crate) async fn unsubscribe_vmm( + &self, + opctx: &OpContext, + group: &MulticastGroup, + member: &MulticastGroupMember, + sled_id: SledUuid, + cached_propolis_id: Option, + ) -> Result<(), anyhow::Error> { + let instance_id = InstanceUuid::from_untyped_uuid(member.parent_id); + + // If the instance has no propolis_id (already stopped/destroyed), + // the OPTE port is gone and there's nothing to unsubscribe. + let propolis_id = match cached_propolis_id { + Some(id) => id, + None => match self.lookup_propolis_id(opctx, instance_id).await? 
{ + Some(id) => id, + None => { + debug!( + opctx.log, + "no propolis_id for instance, skipping unsubscribe"; + "member_id" => %member.id, + "instance_id" => %instance_id + ); + return Ok(()); + } + }, + }; + + let client = self + .sled_client(opctx, sled_id) + .await + .context("failed to create sled-agent client")?; + + let membership = Self::membership_for(group, member); + + client + .vmm_leave_multicast_group(&propolis_id, &membership) + .await + .context("sled-agent vmm_leave_multicast_group call failed")?; + + debug!( + opctx.log, + "unsubscribed VMM from multicast group via sled-agent"; + "member_id" => %member.id, + "propolis_id" => %propolis_id, + "sled_id" => %sled_id, + "group_ip" => %group.multicast_ip + ); + + Ok(()) + } + + /// Propagate M2P mappings and forwarding entries to all VPC-routing sleds. + /// + /// Performs convergent per-sled propagation: each sled's current state + /// is queried and diffed against desired state. New entries are added + /// and stale state is removed (member leaves, instance stops). When no + /// joined members remain, every sled has stale state and it is cleared. + /// + /// # Scope + /// + /// M2P mappings and forwarding entries are pushed to all VPC-routing + /// sleds, not just member sleds. Any instance on any sled may send to + /// a multicast group address. Hence, without the M2P mapping, OPTE's + /// overlay layer silently drops the packet. Forwarding entries are needed + /// on sender sleds so OPTE can replicate to member sleds. Subscriptions + /// (per-port group membership) remain member-sled-only. 
+ pub(crate) async fn propagate_m2p_and_forwarding( + &self, + opctx: &OpContext, + group: &MulticastGroup, + ) -> Result<(), anyhow::Error> { + let underlay_group_id = group + .underlay_group_id + .context("group missing underlay_group_id")?; + + let underlay_group = self + .datastore + .underlay_multicast_group_fetch(opctx, underlay_group_id) + .await + .context("failed to fetch underlay group")?; + + let underlay_ip = match underlay_group.multicast_ip.ip() { + IpAddr::V6(v6) => v6, + other => anyhow::bail!( + "underlay multicast address for group {} is {other}, expected IPv6", + group.id() + ), + }; + + let group_ip = group.multicast_ip.ip(); + + // Compute desired state from DB, determining which sleds should have + // M2P and forwarding entries for this group. + let group_id = MulticastGroupUuid::from_untyped_uuid(group.id()); + let members = self + .datastore + .multicast_group_members_list( + opctx, + group_id, + &DataPageParams::max_page(), + ) + .await + .context("failed to list group members")?; + + let member_sled_ids: BTreeSet = members + .iter() + .filter(|m| m.state == MulticastGroupMemberState::Joined) + .filter_map(|m| m.sled_id.map(SledUuid::from)) + .collect(); + + // Build desired M2P entry. + let desired_m2p = + Mcast2PhysMapping { group: group_ip, underlay: underlay_ip }; + + // Look up member sled underlay IPs for forwarding next-hop + // computation. These are the sleds that host "Joined" members + // and should appear as next hops in every sled's forwarding + // entry. 
+ let mut member_sled_ips: Vec<(SledUuid, Ipv6Addr)> = Vec::new(); + let mut failed_lookups: usize = 0; + for sled_id in &member_sled_ids { + let lookup = match nexus_networking::sled_lookup( + &self.datastore, + opctx, + *sled_id, + ) { + Ok(found) => found, + Err(e) => { + warn!( + opctx.log, + "failed to resolve sled for M2P/forwarding"; + "sled_id" => %sled_id, + "error" => %e + ); + failed_lookups += 1; + continue; + } + }; + + match lookup.fetch().await { + Ok((.., sled)) => { + member_sled_ips.push((*sled_id, sled.ip())); + } + Err(e) => { + warn!( + opctx.log, + "failed to resolve sled for M2P/forwarding"; + "sled_id" => %sled_id, + "error" => %e + ); + failed_lookups += 1; + } + } + } + + // Abort before mutating sled state if any member lookups failed. + // Pushing the partial member set would prune forwarding entries + // for the unresolved sleds, turning a transient lookup failure + // into packet loss for still-joined members. + if failed_lookups > 0 { + anyhow::bail!( + "aborting convergence: {failed_lookups} member sled \ + lookup(s) failed out of {} joined members", + member_sled_ids.len() + ); + } + + // The group is active if any members are "Joined". M2P and + // forwarding are pushed to all sleds when active, cleared + // from all sleds when inactive. + let group_is_active = !member_sled_ids.is_empty(); + + // Query all VPC-routing sleds for current state and converge. 
+ let all_sleds = self + .datastore + .sled_list_all_batched(opctx, SledFilter::VpcRouting) + .await + .context("failed to enumerate sleds")?; + + let convergence_params = GroupConvergenceParams { + group_ip, + underlay_ip, + group_is_active, + desired_m2p: &desired_m2p, + member_sled_ips: &member_sled_ips, + }; + + let mut failed_sleds: usize = 0; + + for sled in &all_sleds { + let sled_id: SledUuid = sled.id(); + let client = match self.sled_client(opctx, sled_id).await { + Ok(c) => c, + Err(e) => { + warn!( + opctx.log, + "failed to create sled-agent client for \ + M2P/forwarding convergence"; + "sled_id" => %sled_id, + "error" => %e + ); + failed_sleds += 1; + continue; + } + }; + + if let Err(e) = converge_sled_m2p_and_forwarding( + &client, + sled_id, + &convergence_params, + ) + .await + { + warn!( + opctx.log, + "failed to converge M2P/forwarding on sled"; + "sled_id" => %sled_id, + "group_ip" => %group_ip, + "error" => %e + ); + failed_sleds += 1; + } + } + + info!( + opctx.log, + "converged M2P and forwarding state"; + "group_id" => %group.id(), + "group_ip" => %group_ip, + "underlay_ip" => %underlay_ip, + "member_sleds" => member_sled_ids.len(), + "total_sleds_checked" => all_sleds.len(), + "failed_sleds" => failed_sleds + ); + + if failed_sleds > 0 { + anyhow::bail!( + "failed to converge M2P/forwarding: \ + {failed_sleds} sled convergence failures \ + (out of {} sleds)", + all_sleds.len() + ); + } + + Ok(()) + } + + /// Clear M2P mappings and forwarding entries from all sleds for + /// this group. + /// + /// Delegates to the convergent [`propagate_m2p_and_forwarding`] which + /// will detect that no joined members remain and clear stale state + /// from all sleds. 
+ /// + /// [`propagate_m2p_and_forwarding`]: Self::propagate_m2p_and_forwarding + pub(crate) async fn clear_m2p_and_forwarding( + &self, + opctx: &OpContext, + group: &MulticastGroup, + ) -> Result<(), anyhow::Error> { + self.propagate_m2p_and_forwarding(opctx, group).await + } +} + +/// Resolved group state used to converge M2P and forwarding on each sled. +struct GroupConvergenceParams<'a> { + group_ip: IpAddr, + underlay_ip: Ipv6Addr, + group_is_active: bool, + desired_m2p: &'a Mcast2PhysMapping, + member_sled_ips: &'a [(SledUuid, Ipv6Addr)], +} + +/// Per-sled convergence of M2P and forwarding state. +/// +/// # Errors +/// +/// Returns an error when any sled-agent RPC fails (list, set, or clear). +/// The caller increments `failed_sleds` and continues to the next sled. +async fn converge_sled_m2p_and_forwarding( + client: &sled_agent_client::Client, + sled_id: SledUuid, + params: &GroupConvergenceParams<'_>, +) -> Result<(), anyhow::Error> { + converge_m2p(client, params).await?; + converge_forwarding(client, sled_id, params).await?; + Ok(()) +} + +/// Converge a single sled's M2P mapping for one group. +/// +/// Sets the mapping when the group is active and missing, clears it +/// when the group is inactive and present. Already-correct state +/// is left alone. +async fn converge_m2p( + client: &sled_agent_client::Client, + params: &GroupConvergenceParams<'_>, +) -> Result<(), anyhow::Error> { + let found = client + .list_mcast_m2p() + .await + .context("failed to list M2P mappings on sled")? + .into_inner(); + + let has_m2p = found.iter().any(|m| { + m.group == params.group_ip && m.underlay == params.underlay_ip + }); + + match (params.group_is_active, has_m2p) { + // Active group missing M2P: install it. + (true, false) => { + client + .set_mcast_m2p(params.desired_m2p) + .await + .context("failed to add M2P mapping to sled")?; + } + // Inactive group has stale M2P: remove it. 
+ (false, true) => { + let clear = ClearMcast2Phys { + group: params.group_ip, + underlay: params.underlay_ip, + }; + client + .clear_mcast_m2p(&clear) + .await + .context("failed to clear stale M2P from sled")?; + } + // Already converged. + _ => {} + } + + Ok(()) +} + +/// Converge a single sled's forwarding entries for one group. +/// +/// When the group is active, computes desired next hops (all member +/// sleds except this one) and updates only if the current state +/// differs. When inactive, clears any stale entries. +async fn converge_forwarding( + client: &sled_agent_client::Client, + sled_id: SledUuid, + params: &GroupConvergenceParams<'_>, +) -> Result<(), anyhow::Error> { + let found = client + .list_mcast_fwd() + .await + .context("failed to list forwarding on sled")? + .into_inner(); + + let current_entry = found.iter().find(|f| f.underlay == params.underlay_ip); + + if !params.group_is_active { + if current_entry.is_some() { + let clear = ClearMcastForwarding { underlay: params.underlay_ip }; + client + .clear_mcast_fwd(&clear) + .await + .context("failed to clear stale forwarding from sled")?; + } + return Ok(()); + } + + let desired_next_hops: Vec = params + .member_sled_ips + .iter() + .filter(|(id, _)| *id != sled_id) + .map(|(_, ip)| McastForwardingNextHop { + next_hop: *ip, + replication: McastReplication::Underlay, + filter: McastSourceFilter { + mode: McastFilterMode::Exclude, + sources: Vec::new(), + }, + }) + .collect(); + + let needs_update = + current_entry.map(|f| &f.next_hops) != Some(&desired_next_hops); + + if needs_update { + // OPTE's set_mcast_fwd handler is additive: it inserts next + // hops but never removes stale ones. Clear first so the + // subsequent set produces an exact replacement. 
+ if current_entry.is_some() { + let clear = ClearMcastForwarding { underlay: params.underlay_ip }; + client + .clear_mcast_fwd(&clear) + .await + .context("failed to clear forwarding before update")?; + } + let desired_fwd = McastForwardingEntry { + underlay: params.underlay_ip, + next_hops: desired_next_hops, + }; + client + .set_mcast_fwd(&desired_fwd) + .await + .context("failed to set forwarding on sled")?; + } + + Ok(()) +} diff --git a/nexus/tests/integration_tests/multicast/instances.rs b/nexus/tests/integration_tests/multicast/instances.rs index 245e284248e..521d85d0405 100644 --- a/nexus/tests/integration_tests/multicast/instances.rs +++ b/nexus/tests/integration_tests/multicast/instances.rs @@ -377,7 +377,7 @@ async fn test_multicast_group_attach_conflicts( } #[nexus_test] -async fn test_multicast_group_attach_limits( +async fn test_multicast_group_attach_multiple( cptestctx: &ControlPlaneTestContext, ) { let client = &cptestctx.external_client; @@ -390,14 +390,8 @@ async fn test_multicast_group_attach_limits( ) .await; - // Group names for implicit groups (implicitly created when first member joins) - let group_names = [ - "limit-test-group-0", - "limit-test-group-1", - "limit-test-group-2", - "limit-test-group-3", - "limit-test-group-4", - ]; + let group_names = + ["limit-test-group-0", "limit-test-group-1", "limit-test-group-2"]; // Create instance first (groups will be implicitly created when attached) let instance = instance_for_multicast_groups( @@ -409,8 +403,8 @@ async fn test_multicast_group_attach_limits( ) .await; - // Attach instance to 3 groups (implicitly creates each group) - let multicast_group_names = &group_names[0..3]; + // Attach instance to multiple groups (implicitly creates each group) + let multicast_group_names = &group_names; for group_name in multicast_group_names { multicast_group_attach( cptestctx, @@ -585,29 +579,23 @@ async fn test_multicast_concurrent_operations( // Wait for final state to be consistent (should still have 
2 members) wait_for_member_count(client, "concurrent-test-group", 2).await; - // Concurrent operations during reconciler processing - - // Start a member addition and immediately follow with another operation - // This tests handling of operations that arrive while reconciler is processing - let rapid_ops_future = async { - multicast_group_attach( - cptestctx, - PROJECT_NAME, - "concurrent-instance-3", - "concurrent-test-group", - ) - .await; - // Don't wait for reconciler; immediately do another operation - multicast_group_detach( - client, - PROJECT_NAME, - "concurrent-instance-4", - "concurrent-test-group", - ) - .await; - }; - - rapid_ops_future.await; + // Back-to-back operations without waiting for reconciler between them. + // Tests that the reconciler handles state changes that arrive while it + // is still processing a previous batch. + multicast_group_attach( + cptestctx, + PROJECT_NAME, + "concurrent-instance-3", + "concurrent-test-group", + ) + .await; + multicast_group_detach( + client, + PROJECT_NAME, + "concurrent-instance-4", + "concurrent-test-group", + ) + .await; // Wait for system to reach consistent final state (should have 2 members) wait_for_member_count(client, "concurrent-test-group", 2).await; @@ -896,6 +884,94 @@ async fn test_multicast_migration_scenarios( .await .expect("Group should exist in DPD after migration"); + // Verify sled-agent state after migration: the target sled should + // have the VMM subscription and M2P mapping. The source sled should + // not have any subscription for the old propolis. 
+ { + let datastore = nexus.datastore(); + let opctx = OpContext::for_tests( + cptestctx.logctx.log.clone(), + datastore.clone(), + ); + + let external_group = datastore + .multicast_group_lookup_by_ip(&opctx, multicast_ip) + .await + .expect("Should look up multicast group by IP"); + + let underlay_group_id = external_group + .underlay_group_id + .expect("Active group should have underlay_group_id"); + + let underlay_group = datastore + .underlay_multicast_group_fetch(&opctx, underlay_group_id) + .await + .expect("Should fetch underlay group"); + + let underlay_ipv6 = match underlay_group.multicast_ip.ip() { + IpAddr::V6(v6) => v6, + other => { + panic!("Expected IPv6 underlay address, got {other}") + } + }; + + // Target sled should have the VMM subscription after the + // reconciler pushes it via verify_members. Poll because the + // reconciler may still be propagating state to the sled-agent. + let post_info = nexus + .active_instance_info(&instance1_id, None) + .await + .unwrap() + .unwrap(); + + let target_agent = cptestctx + .sled_agents + .iter() + .find(|sa| sa.sled_agent_id() == target_sled) + .unwrap() + .sled_agent(); + + wait_for_condition_with_reconciler( + &cptestctx.lockstep_client, + || async { + let groups = target_agent.multicast_groups.lock().unwrap(); + let has_sub = + groups.get(&post_info.propolis_id).map_or(false, |g| { + g.iter().any(|m| m.group_ip == multicast_ip) + }); + if has_sub { Ok(()) } else { Err(CondCheckError::NotYet::<()>) } + }, + &POLL_INTERVAL, + &POLL_TIMEOUT, + ) + .await + .expect("Target sled should have VMM subscription after migration"); + + // Target sled should have M2P mapping. 
+ wait_for_condition_with_reconciler( + &cptestctx.lockstep_client, + || async { + let m2p = target_agent.m2p_mappings.lock().unwrap(); + if m2p.contains(&(multicast_ip, underlay_ipv6)) { + Ok(()) + } else { + Err(CondCheckError::NotYet::<()>) + } + }, + &POLL_INTERVAL, + &POLL_TIMEOUT, + ) + .await + .expect("Target sled should have M2P mapping after migration"); + + // TODO: assert the source sled no longer holds a multicast + // subscription for the old propolis_id. On real hardware, + // VMM teardown (release_opte_ports -> PortTicket::release_inner) + // clears it. The sim does not model per-propolis cleanup on + // unregister for any of the networking maps (external_ips, + // attached_subnets, multicast_groups). + } + // Case: Concurrent migrations let group2_name = "concurrent-migration-group"; @@ -911,7 +987,9 @@ async fn test_multicast_migration_scenarios( group2_name, ) .await; + wait_for_group_active(client, group2_name).await; + multicast_group_attach( cptestctx, project_name, @@ -1787,14 +1865,13 @@ async fn test_multicast_ipv6_lifecycle(cptestctx: &ControlPlaneTestContext) { instance_wait_for_state(client, instance_id, InstanceState::Running).await; wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; - let member_joined = wait_for_member_state( + wait_for_member_state( cptestctx, group_name, instance.identity.id, nexus_db_model::MulticastGroupMemberState::Joined, ) .await; - assert_eq!(member_joined.state, "Joined"); // Stop the instance - member should transition to "Left" let stop_url = @@ -1813,14 +1890,13 @@ async fn test_multicast_ipv6_lifecycle(cptestctx: &ControlPlaneTestContext) { instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; - let member_left = wait_for_member_state( + wait_for_member_state( cptestctx, group_name, instance.identity.id, nexus_db_model::MulticastGroupMemberState::Left, ) .await; - assert_eq!(member_left.state, "Left"); // 
Delete the instance - this should delete the group since it's the only member cleanup_instances(cptestctx, client, project_name, &["ipv6-instance"]) diff --git a/nexus/tests/integration_tests/multicast/mod.rs b/nexus/tests/integration_tests/multicast/mod.rs index cc3c947008c..3416ae4b1db 100644 --- a/nexus/tests/integration_tests/multicast/mod.rs +++ b/nexus/tests/integration_tests/multicast/mod.rs @@ -69,6 +69,7 @@ mod pool_selection; // Timeout constants for test operations const POLL_INTERVAL: Duration = Duration::from_millis(50); +const POLL_TIMEOUT: Duration = Duration::from_secs(30); const MULTICAST_OPERATION_TIMEOUT: Duration = Duration::from_secs(120); /// Generic helper for PUT upsert requests that return 201 Created. @@ -307,8 +308,8 @@ where /// This function verifies that inventory has SP data for EVERY in-service sled, /// not just that inventory completed. /// -/// This is required for multicast member operations which map `sled_id` → `sp_slot` -/// → switch ports via inventory. +/// This is required for multicast member operations which map `sled_id` to +/// `sp_slot` to switch ports via inventory. 
pub(crate) async fn ensure_inventory_ready( cptestctx: &ControlPlaneTestContext, ) { @@ -358,9 +359,8 @@ pub(crate) async fn ensure_inventory_ready( let mut missing_sleds = Vec::new(); for sled in &sleds { let has_sp = inventory.sps.iter().any(|(bb, _)| { - (bb.serial_number == sled.serial_number() - && bb.part_number == sled.part_number()) - || bb.serial_number == sled.serial_number() + bb.serial_number == sled.serial_number() + && bb.part_number == sled.part_number() }); if !has_sp { @@ -385,8 +385,8 @@ pub(crate) async fn ensure_inventory_ready( Err(CondCheckError::::NotYet) } }, - &Duration::from_millis(500), // Check every 500ms - &Duration::from_secs(120), // Wait up to 120s + &Duration::from_millis(500), + &MULTICAST_OPERATION_TIMEOUT, ) .await { @@ -448,8 +448,8 @@ pub(crate) async fn ensure_dpd_ready(cptestctx: &ControlPlaneTestContext) { } } }, - &Duration::from_millis(200), // Check every 200ms - &Duration::from_secs(30), // Wait up to 30 seconds for switches + &Duration::from_millis(200), + &POLL_TIMEOUT, ) .await { @@ -1067,19 +1067,16 @@ pub(crate) async fn wait_for_group_deleted( lockstep_client, || async { let group_url = mcast_group_url(group_name); - match NexusRequest::object_get(client, &group_url) - .authn_as(AuthnMode::PrivilegedUser) - .execute() - .await - { - Ok(response) => { - if response.status == StatusCode::NOT_FOUND { - Ok(()) - } else { - Err(CondCheckError::<()>::NotYet) - } - } - Err(_) => Ok(()), // Assume 404 or similar error means deleted + let response = NexusRequest::new( + RequestBuilder::new(client, Method::GET, &group_url) + .expect_status(Some(StatusCode::NOT_FOUND)), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await; + match response { + Ok(_) => Ok(()), + Err(_) => Err(CondCheckError::<()>::NotYet), } }, &POLL_INTERVAL, diff --git a/nexus/tests/integration_tests/multicast/networking_integration.rs b/nexus/tests/integration_tests/multicast/networking_integration.rs index 3b28892ef82..6103633e9ef 100644 --- 
a/nexus/tests/integration_tests/multicast/networking_integration.rs +++ b/nexus/tests/integration_tests/multicast/networking_integration.rs @@ -8,10 +8,14 @@ //! //! - External IPs: Instances with ephemeral/floating IPs can join multicast groups //! - Floating IP attach/detach: Multicast membership unaffected by IP changes +//! - Sled-agent M2P/forwarding propagation on member join and group deletion +//! - Per-VMM multicast subscriptions via sled-agent -use std::time::Duration; +use std::net::IpAddr; use http::{Method, StatusCode}; +use nexus_db_lookup::LookupPath; +use nexus_db_queries::context::OpContext; use nexus_test_utils::http_testing::{AuthnMode, NexusRequest, RequestBuilder}; use nexus_test_utils::resource_helpers::create_floating_ip; use nexus_test_utils::resource_helpers::{ @@ -30,6 +34,7 @@ use omicron_common::api::external::{ ByteCount, IdentityMetadataCreateParams, Instance, InstanceCpuCount, NameOrId, }; +use omicron_nexus::TestInterfaces; use omicron_test_utils::dev::poll::{CondCheckError, wait_for_condition}; use omicron_uuid_kinds::{GenericUuid, InstanceUuid}; @@ -637,13 +642,13 @@ async fn test_multicast_with_floating_ip_basic( Err(CondCheckError::::NotYet) } }, - &Duration::from_millis(200), - &Duration::from_secs(30), + &POLL_INTERVAL, + &POLL_TIMEOUT, ) .await .unwrap_or_else(|e| { panic!( - "instance did not show floating IP {} as attached within 30s: {e:?}", + "instance did not show floating IP {} as attached within {POLL_TIMEOUT:?}: {e:?}", floating_ip.ip ) }); @@ -694,13 +699,13 @@ async fn test_multicast_with_floating_ip_basic( Err(CondCheckError::::NotYet) } }, - &Duration::from_millis(200), - &Duration::from_secs(30), + &POLL_INTERVAL, + &POLL_TIMEOUT, ) .await .unwrap_or_else(|e| { panic!( - "instance still showed floating IP {} as attached after 30s: {e:?}", + "instance still showed floating IP {} as attached after {POLL_TIMEOUT:?}: {e:?}", floating_ip.ip ) }); @@ -713,3 +718,987 @@ async fn test_multicast_with_floating_ip_basic( 
cleanup_instances(cptestctx, client, project_name, &[instance_name]).await; wait_for_group_deleted(cptestctx, group_name).await; } + +/// Verify that when an instance joins a multicast group, the reconciler +/// pushes M2P mappings, forwarding entries, and per-VMM subscriptions +/// to the sim sled-agent. Also verify cleanup on instance deletion. +#[nexus_test] +async fn test_multicast_sled_agent_m2p_and_subscriptions( + cptestctx: &nexus_test_utils::ControlPlaneTestContext< + omicron_nexus::Server, + >, +) { + let client = &cptestctx.external_client; + let project_name = "sled-agent-mcast-project"; + let group_name = "sled-agent-mcast-group"; + let instance_name = "sled-agent-mcast-instance"; + + ops::join3( + create_project(client, project_name), + create_default_ip_pools(client), + create_multicast_ip_pool_with_range( + client, + "sled-agent-mcast-pool", + (224, 150, 0, 1), + (224, 150, 0, 255), + ), + ) + .await; + + ensure_multicast_test_ready(cptestctx).await; + + // Create and start an instance. 
+ let instance_params = InstanceCreate { + identity: IdentityMetadataCreateParams { + name: instance_name.parse().unwrap(), + description: "Instance for sled-agent multicast test".to_string(), + }, + ncpus: InstanceCpuCount::try_from(1).unwrap(), + memory: ByteCount::from_gibibytes_u32(1), + hostname: instance_name.parse().unwrap(), + user_data: vec![], + ssh_public_keys: None, + network_interfaces: InstanceNetworkInterfaceAttachment::DefaultIpv4, + external_ips: vec![], + multicast_groups: vec![], + disks: vec![], + boot_disk: None, + cpu_platform: None, + start: true, + auto_restart_policy: Default::default(), + anti_affinity_groups: Vec::new(), + }; + + let instance_url = format!("/v1/instances?project={project_name}"); + let instance: Instance = + object_create(client, &instance_url, &instance_params).await; + let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); + + instance_wait_for_running_with_simulation(cptestctx, instance_id).await; + wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; + + // Attach instance to a multicast group. + multicast_group_attach(cptestctx, project_name, instance_name, group_name) + .await; + wait_for_group_active(client, group_name).await; + + // Wait for the member to reach "Joined" state (reconciler processes it). + wait_for_member_state( + cptestctx, + group_name, + instance.identity.id, + nexus_db_model::MulticastGroupMemberState::Joined, + ) + .await; + + // Look up the underlay multicast IPv6 address for verification. 
+ let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.clone(), datastore.clone()); + + let group_view = get_multicast_group(client, group_name).await; + let multicast_ip = group_view.multicast_ip; + + let external_group = datastore + .multicast_group_lookup_by_ip(&opctx, multicast_ip) + .await + .expect("Should look up multicast group by IP"); + + let underlay_group_id = external_group + .underlay_group_id + .expect("Active group should have underlay_group_id"); + + let underlay_group = datastore + .underlay_multicast_group_fetch(&opctx, underlay_group_id) + .await + .expect("Should fetch underlay multicast group"); + + let underlay_ipv6 = match underlay_group.multicast_ip.ip() { + IpAddr::V6(v6) => v6, + other => panic!("Expected IPv6 underlay address, got {other}"), + }; + + // Verify M2P mapping on the sim sled-agent. + let sled_agent = cptestctx.first_sled_agent(); + { + let m2p = sled_agent.m2p_mappings.lock().unwrap(); + assert!( + m2p.contains(&(multicast_ip, underlay_ipv6)), + "Sled-agent should have M2P mapping ({multicast_ip}, \ + {underlay_ipv6}), got: {m2p:?}" + ); + } + + // Verify forwarding entries on the sim sled-agent. + // With a single sled, the forwarding entry exists but has no next hops + // (no other sleds to forward to). + { + let fwd = sled_agent.mcast_fwd.lock().unwrap(); + assert!( + fwd.contains_key(&underlay_ipv6), + "Sled-agent should have forwarding entry for {underlay_ipv6}, \ + got: {fwd:?}" + ); + let next_hops = &fwd[&underlay_ipv6]; + assert!( + next_hops.is_empty(), + "Single-sled setup should have empty next_hops, got: {next_hops:?}" + ); + } + + // Verify per-VMM multicast subscription on the sim sled-agent. 
+ { + let info = nexus + .active_instance_info(&instance_id, None) + .await + .unwrap() + .expect("Running instance should have active info"); + + let groups = sled_agent.multicast_groups.lock().unwrap(); + let vmm_groups = groups + .get(&info.propolis_id) + .expect("Sled-agent should have multicast groups for propolis"); + + assert!( + vmm_groups.iter().any(|m| m.group_ip == multicast_ip), + "VMM should be subscribed to multicast group {multicast_ip}, \ + got: {vmm_groups:?}" + ); + } + + // Stop the instance. The member transitions "Joined" -> "Left". + let stop_url = + format!("/v1/instances/{instance_name}/stop?project={project_name}"); + NexusRequest::new( + RequestBuilder::new(client, Method::POST, &stop_url) + .body(None as Option<&serde_json::Value>) + .expect_status(Some(StatusCode::ACCEPTED)), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("Should stop instance"); + + wait_for_instance_stopped(cptestctx, client, instance_id, instance_name) + .await; + + wait_for_member_state( + cptestctx, + group_name, + instance.identity.id, + nexus_db_model::MulticastGroupMemberState::Left, + ) + .await; + + // Per-VMM subscription cleanup after stop is not asserted here. + // In production, destroying the VMM tears down the OPTE port, which + // implicitly removes multicast subscriptions. The reconciler's + // unsubscribe path correctly skips when the propolis_id is gone + // (matching production semantics where the port no longer exists). + // + // V2P follows the same pattern: sled-agent cleanup is keyed by + // network identity, not VMM identity. + + // M2P and forwarding should be cleared since there are no "Joined" + // members remaining. 
+ wait_for_condition_with_reconciler( + &cptestctx.lockstep_client, + || async { + let m2p = sled_agent.m2p_mappings.lock().unwrap(); + if !m2p.contains(&(multicast_ip, underlay_ipv6)) { + Ok(()) + } else { + Err(CondCheckError::<()>::NotYet) + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .expect("M2P should be cleared when no Joined members remain"); + + // Forwarding should also be cleared when no "Joined" members remain. + wait_for_condition_with_reconciler( + &cptestctx.lockstep_client, + || async { + let fwd = sled_agent.mcast_fwd.lock().unwrap(); + if !fwd.contains_key(&underlay_ipv6) { + Ok(()) + } else { + Err(CondCheckError::<()>::NotYet) + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .expect("Forwarding should be cleared when no Joined members remain"); + + // Delete the instance, which should trigger group deletion. + cleanup_instances(cptestctx, client, project_name, &[instance_name]).await; + wait_for_group_deleted(cptestctx, group_name).await; + + // Verify M2P and forwarding are cleared. + { + let m2p = sled_agent.m2p_mappings.lock().unwrap(); + assert!( + !m2p.contains(&(multicast_ip, underlay_ipv6)), + "M2P mapping should be cleared after group deletion, got: {m2p:?}" + ); + } + { + let fwd = sled_agent.mcast_fwd.lock().unwrap(); + assert!( + !fwd.contains_key(&underlay_ipv6), + "Forwarding entry should be cleared after group deletion, \ + got: {fwd:?}" + ); + } +} + +/// Verify M2P and forwarding entries propagate to all sleds, not just the +/// hosting sled. Analogous to `test_instance_v2p_mappings` which verifies +/// V2P mappings on all sleds. +/// +/// Also verifies cleanup: after instance deletion, M2P and forwarding +/// entries are removed from every sled. 
+#[nexus_test(extra_sled_agents = 1)] +async fn test_multicast_multi_sled_m2p_propagation( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let nexus = &cptestctx.server.server_context().nexus; + let project_name = "multi-sled-mcast-project"; + let group_name = "multi-sled-mcast-group"; + let instance_name = "multi-sled-mcast-instance"; + + ops::join3( + create_project(client, project_name), + create_default_ip_pools(client), + create_multicast_ip_pool_with_range( + client, + "multi-sled-mcast-pool", + (224, 160, 0, 1), + (224, 160, 0, 255), + ), + ) + .await; + + ensure_multicast_test_ready(cptestctx).await; + + // Collect all sled agents (2 total: 1 default + 1 extra). + // We use extra_sled_agents = 1 (not 2) because the gateway sim only + // provides SP data for the two well-known sled UUIDs. A 3rd sled with + // a random UUID would have no SP entry, causing inventory readiness + // to time out. Two sleds is sufficient to verify cross-sled propagation. + let all_sled_agents: Vec<_> = + cptestctx.sled_agents.iter().map(|sa| sa.sled_agent()).collect(); + assert_eq!(all_sled_agents.len(), 2, "expected 2 sled agents"); + + // Create and start an instance. + let instance = instance_for_multicast_groups( + cptestctx, + project_name, + instance_name, + true, + &[], + ) + .await; + let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); + + instance_wait_for_running_with_simulation(cptestctx, instance_id).await; + wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; + + // Attach to a multicast group. + multicast_group_attach(cptestctx, project_name, instance_name, group_name) + .await; + wait_for_group_active(client, group_name).await; + + wait_for_member_state( + cptestctx, + group_name, + instance.identity.id, + nexus_db_model::MulticastGroupMemberState::Joined, + ) + .await; + + // Look up the underlay IPv6 address for verification. 
+ let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.clone(), datastore.clone()); + + let group_view = get_multicast_group(client, group_name).await; + let multicast_ip = group_view.multicast_ip; + + let external_group = datastore + .multicast_group_lookup_by_ip(&opctx, multicast_ip) + .await + .expect("Should look up multicast group by IP"); + + let underlay_group_id = external_group + .underlay_group_id + .expect("Active group should have underlay_group_id"); + + let underlay_group = datastore + .underlay_multicast_group_fetch(&opctx, underlay_group_id) + .await + .expect("Should fetch underlay multicast group"); + + let underlay_ipv6 = match underlay_group.multicast_ip.ip() { + IpAddr::V6(v6) => v6, + other => panic!("Expected IPv6 underlay address, got {other}"), + }; + + // Look up the hosting sled for subscription verification. + let info = nexus + .active_instance_info(&instance_id, None) + .await + .unwrap() + .expect("Running instance should have active info"); + + let hosting_sled_id = info.sled_id; + + // M2P and forwarding are pushed to all sleds (like V2P). Any + // instance on any sled may send to a multicast group; without the + // M2P mapping OPTE's overlay layer silently drops the packet. + // Forwarding entries let sender sleds replicate to member sleds. + for (i, sled_agent) in cptestctx.sled_agents.iter().enumerate() { + let agent = sled_agent.sled_agent(); + + // Wait for M2P on every sled. The reconciler may need an + // additional pass after the member reaches "Joined": during + // reconcile_member_states, propagate_m2p_and_forwarding may + // see member_sleds=0 (member still "Joining" in DB), so the + // actual push happens in reconcile_active_groups or the next + // full pass. 
+ wait_for_condition_with_reconciler( + &cptestctx.lockstep_client, + || async { + let m2p = agent.m2p_mappings.lock().unwrap(); + if m2p.contains(&(multicast_ip, underlay_ipv6)) { + Ok(()) + } else { + Err(CondCheckError::NotYet::<()>) + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .unwrap_or_else(|e| { + panic!("Sled {i} should have M2P mapping within timeout: {e:?}") + }); + + // Verify forwarding on every sled. With a single member on + // one sled, the hosting sled's forwarding has no next hops + // (local delivery via subscription). Non-hosting sleds list + // the hosting sled as a next hop so senders can reach it. + wait_for_condition_with_reconciler( + &cptestctx.lockstep_client, + || async { + let fwd = agent.mcast_fwd.lock().unwrap(); + if fwd.contains_key(&underlay_ipv6) { + Ok(()) + } else { + Err(CondCheckError::NotYet::<()>) + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .unwrap_or_else(|e| { + panic!( + "Sled {i} should have forwarding entry within timeout: {e:?}" + ) + }); + + let fwd = agent.mcast_fwd.lock().unwrap(); + let next_hops = &fwd[&underlay_ipv6]; + if sled_agent.sled_agent_id() == hosting_sled_id { + // Hosting sled: no next hops (only local member, OPTE + // delivers locally via subscription). + assert!( + next_hops.is_empty(), + "Hosting sled forwarding should have empty next_hops, \ + got: {next_hops:?}" + ); + } else { + // Non-hosting sled: next hop is the hosting sled so + // senders on this sled can reach the member. + assert_eq!( + next_hops.len(), + 1, + "Non-hosting sled {i} should have 1 next_hop (the hosting \ + sled), got: {next_hops:?}" + ); + } + } + + // Verify per-VMM subscription on the hosting sled only. + // Subscriptions are member-sled-only (not all sleds). 
+ let hosting_agent = cptestctx + .sled_agents + .iter() + .find(|sa| sa.sled_agent_id() == hosting_sled_id) + .unwrap() + .sled_agent(); + + wait_for_condition_with_reconciler( + &cptestctx.lockstep_client, + || async { + let groups = hosting_agent.multicast_groups.lock().unwrap(); + match groups.get(&info.propolis_id) { + Some(vmm_groups) + if vmm_groups + .iter() + .any(|m| m.group_ip == multicast_ip) => + { + Ok(()) + } + _ => Err(CondCheckError::NotYet::<()>), + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .unwrap_or_else(|e| { + panic!( + "VMM should be subscribed to {multicast_ip} within timeout: {e:?}" + ) + }); + + // Delete the instance, which triggers group deletion. + cleanup_instances(cptestctx, client, project_name, &[instance_name]).await; + wait_for_group_deleted(cptestctx, group_name).await; + + // Verify cleanup on every sled: M2P and forwarding removed. + for (i, sled_agent) in all_sled_agents.iter().enumerate() { + wait_for_condition_with_reconciler( + &cptestctx.lockstep_client, + || async { + let m2p = sled_agent.m2p_mappings.lock().unwrap(); + let fwd = sled_agent.mcast_fwd.lock().unwrap(); + if !m2p.contains(&(multicast_ip, underlay_ipv6)) + && !fwd.contains_key(&underlay_ipv6) + { + Ok(()) + } else { + Err(CondCheckError::NotYet::<()>) + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .unwrap_or_else(|e| { + panic!( + "Sled {i} M2P/forwarding not cleaned up within timeout: {e:?}" + ) + }); + } +} + +/// Verify cross-sled forwarding when members exist on both sleds. +/// +/// With one member on sled A and another on sled B, each sled's forwarding +/// entry should list the other sled as its sole next hop (self-exclusion). +/// This exercises the `.filter(|(id, _)| *id != sled_id)` logic in +/// `converge_forwarding`. 
+#[nexus_test(extra_sled_agents = 1)] +async fn test_multicast_cross_sled_forwarding( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.clone(), datastore.clone()); + let project_name = "bidir-fwd-project"; + let group_name = "bidir-fwd-group"; + let instance_a_name = "bidir-instance-a"; + let instance_b_name = "bidir-instance-b"; + + ops::join3( + create_project(client, project_name), + create_default_ip_pools(client), + create_multicast_ip_pool_with_range( + client, + "bidir-fwd-pool", + (224, 170, 0, 1), + (224, 170, 0, 255), + ), + ) + .await; + + ensure_multicast_test_ready(cptestctx).await; + + let sled_a_id = cptestctx.sled_agents[0].sled_agent_id(); + let sled_b_id = cptestctx.sled_agents[1].sled_agent_id(); + + // Pin instance A to sled A by making sled B non-provisionable. + { + let (authz_sled, ..) = LookupPath::new(&opctx, datastore) + .sled_id(sled_b_id) + .lookup_for(nexus_auth::authz::Action::Modify) + .await + .expect("lookup sled B"); + datastore + .sled_set_provision_policy( + &opctx, + &authz_sled, + nexus_types::external_api::sled::SledProvisionPolicy::NonProvisionable, + ) + .await + .expect("set sled B non-provisionable"); + } + + let instance_a = instance_for_multicast_groups( + cptestctx, + project_name, + instance_a_name, + true, + &[], + ) + .await; + let instance_a_id = InstanceUuid::from_untyped_uuid(instance_a.identity.id); + instance_wait_for_running_with_simulation(cptestctx, instance_a_id).await; + + // Verify instance A landed on sled A. + let info_a = nexus + .active_instance_info(&instance_a_id, None) + .await + .unwrap() + .expect("instance A should be running"); + assert_eq!(info_a.sled_id, sled_a_id, "instance A should be on sled A"); + + // Swap provisionability: sled A non-provisionable, sled B provisionable. + { + let (authz_sled_a, ..) 
= LookupPath::new(&opctx, datastore) + .sled_id(sled_a_id) + .lookup_for(nexus_auth::authz::Action::Modify) + .await + .expect("lookup sled A"); + let (authz_sled_b, ..) = LookupPath::new(&opctx, datastore) + .sled_id(sled_b_id) + .lookup_for(nexus_auth::authz::Action::Modify) + .await + .expect("lookup sled B"); + datastore + .sled_set_provision_policy( + &opctx, + &authz_sled_a, + nexus_types::external_api::sled::SledProvisionPolicy::NonProvisionable, + ) + .await + .expect("set sled A non-provisionable"); + datastore + .sled_set_provision_policy( + &opctx, + &authz_sled_b, + nexus_types::external_api::sled::SledProvisionPolicy::Provisionable, + ) + .await + .expect("set sled B provisionable"); + } + + let instance_b = instance_for_multicast_groups( + cptestctx, + project_name, + instance_b_name, + true, + &[], + ) + .await; + + let instance_b_id = InstanceUuid::from_untyped_uuid(instance_b.identity.id); + instance_wait_for_running_with_simulation(cptestctx, instance_b_id).await; + + // Verify instance B landed on sled B. + let info_b = nexus + .active_instance_info(&instance_b_id, None) + .await + .unwrap() + .expect("instance B should be running"); + + assert_eq!(info_b.sled_id, sled_b_id, "instance B should be on sled B"); + + // Both instances join the same multicast group. + multicast_group_attach( + cptestctx, + project_name, + instance_a_name, + group_name, + ) + .await; + + multicast_group_attach( + cptestctx, + project_name, + instance_b_name, + group_name, + ) + .await; + + wait_for_group_active(client, group_name).await; + + // Wait for both members to reach "Joined". + for instance in [&instance_a, &instance_b] { + wait_for_member_state( + cptestctx, + group_name, + instance.identity.id, + nexus_db_model::MulticastGroupMemberState::Joined, + ) + .await; + } + + // Resolve underlay IPv6 for forwarding assertions. 
+ let group_view = get_multicast_group(client, group_name).await; + let external_group = datastore + .multicast_group_lookup_by_ip(&opctx, group_view.multicast_ip) + .await + .expect("lookup group by IP"); + + let underlay_group = datastore + .underlay_multicast_group_fetch( + &opctx, + external_group + .underlay_group_id + .expect("active group should have underlay_group_id"), + ) + .await + .expect("fetch underlay group"); + + let underlay_ipv6 = match underlay_group.multicast_ip.ip() { + IpAddr::V6(v6) => v6, + other => panic!("Expected IPv6 underlay address, got {other}"), + }; + + // Wait for forwarding entries on both sleds, then verify each sled's + // forwarding lists exactly the other sled (not itself). + let agent_a = cptestctx.sled_agents[0].sled_agent(); + let agent_b = cptestctx.sled_agents[1].sled_agent(); + + for (label, agent) in [("sled A", &agent_a), ("sled B", &agent_b)] { + wait_for_condition_with_reconciler( + &cptestctx.lockstep_client, + || async { + let fwd = agent.mcast_fwd.lock().unwrap(); + match fwd.get(&underlay_ipv6) { + Some(hops) if hops.len() == 1 => Ok(()), + _ => Err(CondCheckError::NotYet::<()>), + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .unwrap_or_else(|e| { + panic!("{label} should have exactly 1 forwarding next_hop: {e:?}") + }); + } + + // Cleanup. + cleanup_instances( + cptestctx, + client, + project_name, + &[instance_a_name, instance_b_name], + ) + .await; + wait_for_group_deleted(cptestctx, group_name).await; +} + +/// Verify multicast state is re-established after simulated cold start. +/// Analogous to `test_instance_start_creates_networking_state` which tests +/// V2P re-establishment after forcibly clearing sled-agent state. +/// +/// Steps: a) create instance, b) join multicast, c) stop instance, +/// d) forcibly clear all sim sled-agent multicast state, e) restart +/// instance, f) verify M2P, forwarding, and per-VMM subscriptions are +/// re-established. 
+#[nexus_test(extra_sled_agents = 1)] +async fn test_multicast_cold_start_reestablishment( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let nexus = &cptestctx.server.server_context().nexus; + let project_name = "cold-start-mcast-project"; + let group_name = "cold-start-mcast-group"; + let instance_name = "cold-start-mcast-instance"; + + ops::join3( + create_project(client, project_name), + create_default_ip_pools(client), + create_multicast_ip_pool_with_range( + client, + "cold-start-mcast-pool", + (224, 170, 0, 1), + (224, 170, 0, 255), + ), + ) + .await; + + ensure_multicast_test_ready(cptestctx).await; + + let all_sled_agents: Vec<_> = + cptestctx.sled_agents.iter().map(|sa| sa.sled_agent()).collect(); + + // Create and start an instance, join a multicast group. + let instance = instance_for_multicast_groups( + cptestctx, + project_name, + instance_name, + true, + &[], + ) + .await; + let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); + + instance_wait_for_running_with_simulation(cptestctx, instance_id).await; + wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; + + multicast_group_attach(cptestctx, project_name, instance_name, group_name) + .await; + wait_for_group_active(client, group_name).await; + + wait_for_member_state( + cptestctx, + group_name, + instance.identity.id, + nexus_db_model::MulticastGroupMemberState::Joined, + ) + .await; + + // Look up the underlay IPv6. 
+ let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.clone(), datastore.clone()); + + let group_view = get_multicast_group(client, group_name).await; + let multicast_ip = group_view.multicast_ip; + + let external_group = datastore + .multicast_group_lookup_by_ip(&opctx, multicast_ip) + .await + .expect("Should look up multicast group by IP"); + + let underlay_group_id = external_group + .underlay_group_id + .expect("Active group should have underlay_group_id"); + + let underlay_group = datastore + .underlay_multicast_group_fetch(&opctx, underlay_group_id) + .await + .expect("Should fetch underlay multicast group"); + + let underlay_ipv6 = match underlay_group.multicast_ip.ip() { + IpAddr::V6(v6) => v6, + other => panic!("Expected IPv6 underlay address, got {other}"), + }; + + // M2P and forwarding are pushed to all sleds. Verify at least the + // hosting sled has M2P before we clear state. + let pre_info = nexus + .active_instance_info(&instance_id, None) + .await + .unwrap() + .expect("Running instance should have active info"); + + let pre_hosting_agent = cptestctx + .sled_agents + .iter() + .find(|sa| sa.sled_agent_id() == pre_info.sled_id) + .unwrap() + .sled_agent(); + + wait_for_condition_with_reconciler( + &cptestctx.lockstep_client, + || async { + let m2p = pre_hosting_agent.m2p_mappings.lock().unwrap(); + if m2p.contains(&(multicast_ip, underlay_ipv6)) { + Ok(()) + } else { + Err(CondCheckError::NotYet::<()>) + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .expect("Hosting sled M2P should exist before cold start simulation"); + + // Stop the instance. 
+ let stop_url = + format!("/v1/instances/{instance_name}/stop?project={project_name}"); + NexusRequest::new( + RequestBuilder::new(client, Method::POST, &stop_url) + .body(None as Option<&serde_json::Value>) + .expect_status(Some(StatusCode::ACCEPTED)), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("Should stop instance"); + + wait_for_instance_stopped(cptestctx, client, instance_id, instance_name) + .await; + + wait_for_member_state( + cptestctx, + group_name, + instance.identity.id, + nexus_db_model::MulticastGroupMemberState::Left, + ) + .await; + + // Forcibly clear all sim sled-agent multicast state, simulating a cold + // start where sled-agents lose in-memory state. + for sled_agent in &all_sled_agents { + sled_agent.m2p_mappings.lock().unwrap().clear(); + sled_agent.mcast_fwd.lock().unwrap().clear(); + sled_agent.multicast_groups.lock().unwrap().clear(); + } + + // Restart the instance. + let start_url = + format!("/v1/instances/{instance_name}/start?project={project_name}"); + NexusRequest::new( + RequestBuilder::new(client, Method::POST, &start_url) + .body(None as Option<&serde_json::Value>) + .expect_status(Some(StatusCode::ACCEPTED)), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("Should start instance"); + + // Use `try_instance_simulate` here instead of `instance_wait_for_running_with_simulation` + // because the old VMM may still be draining from the sim collection after + // the stop. `instance_simulate` would panic if it pokes a VMM that was just + // removed; `try_instance_simulate` handles that gracefully. + wait_for_condition( + || async { + let _ = + instance_helpers::try_instance_simulate(nexus, &instance_id) + .await; + + let url = format!("/v1/instances/{instance_id}"); + let instance: Instance = NexusRequest::object_get(client, &url) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .map_err(|_| CondCheckError::<()>::NotYet)? 
+ .parsed_body() + .map_err(|_| CondCheckError::<()>::NotYet)?; + + if instance.runtime.run_state == InstanceState::Running { + Ok(()) + } else { + Err(CondCheckError::<()>::NotYet) + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .expect("Instance should reach Running after restart"); + + // Wait for the reconciler to re-establish multicast state. + wait_for_member_state( + cptestctx, + group_name, + instance.identity.id, + nexus_db_model::MulticastGroupMemberState::Joined, + ) + .await; + + // Verify M2P and forwarding re-established on all sleds. + for (i, sled_agent) in all_sled_agents.iter().enumerate() { + wait_for_condition_with_reconciler( + &cptestctx.lockstep_client, + || async { + let m2p = sled_agent.m2p_mappings.lock().unwrap(); + if m2p.contains(&(multicast_ip, underlay_ipv6)) { + Ok(()) + } else { + Err(CondCheckError::NotYet::<()>) + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .unwrap_or_else(|e| { + panic!("Sled {i} M2P not re-established within timeout: {e:?}") + }); + + wait_for_condition_with_reconciler( + &cptestctx.lockstep_client, + || async { + let fwd = sled_agent.mcast_fwd.lock().unwrap(); + if fwd.contains_key(&underlay_ipv6) { + Ok(()) + } else { + Err(CondCheckError::NotYet::<()>) + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .unwrap_or_else(|e| { + panic!( + "Sled {i} forwarding not re-established within timeout: {e:?}" + ) + }); + } + + // Verify per-VMM subscription on the hosting sled (new propolis_id + // since restart creates a new VMM). 
+ let post_info = nexus + .active_instance_info(&instance_id, None) + .await + .unwrap() + .expect("Restarted instance should have active info"); + + let post_hosting_agent = cptestctx + .sled_agents + .iter() + .find(|sa| sa.sled_agent_id() == post_info.sled_id) + .unwrap() + .sled_agent(); + + wait_for_condition_with_reconciler( + &cptestctx.lockstep_client, + || async { + let groups = post_hosting_agent.multicast_groups.lock().unwrap(); + match groups.get(&post_info.propolis_id) { + Some(vmm_groups) + if vmm_groups + .iter() + .any(|m| m.group_ip == multicast_ip) => + { + Ok(()) + } + _ => Err(CondCheckError::NotYet::<()>), + } + }, + &POLL_INTERVAL, + &MULTICAST_OPERATION_TIMEOUT, + ) + .await + .unwrap_or_else(|e| { + panic!( + "New VMM should be subscribed to {multicast_ip} after restart: \ + {e:?}" + ) + }); + + // Cleanup. + cleanup_instances(cptestctx, client, project_name, &[instance_name]).await; + wait_for_group_deleted(cptestctx, group_name).await; +} diff --git a/openapi/sled-agent/sled-agent-28.0.0-415efe.json.gitstub b/openapi/sled-agent/sled-agent-28.0.0-415efe.json.gitstub new file mode 100644 index 00000000000..a8377752515 --- /dev/null +++ b/openapi/sled-agent/sled-agent-28.0.0-415efe.json.gitstub @@ -0,0 +1 @@ +789f68549117d5f7cf59b3679969301cfcb72443:openapi/sled-agent/sled-agent-28.0.0-415efe.json diff --git a/openapi/sled-agent/sled-agent-28.0.0-415efe.json b/openapi/sled-agent/sled-agent-29.0.0-0f4904.json similarity index 96% rename from openapi/sled-agent/sled-agent-28.0.0-415efe.json rename to openapi/sled-agent/sled-agent-29.0.0-0f4904.json index d85b782acbd..e3274a251f5 100644 --- a/openapi/sled-agent/sled-agent-28.0.0-415efe.json +++ b/openapi/sled-agent/sled-agent-29.0.0-0f4904.json @@ -7,7 +7,7 @@ "url": "https://oxide.computer", "email": "api@oxide.computer" }, - "version": "28.0.0" + "version": "29.0.0" }, "paths": { "/artifacts": { @@ -389,6 +389,162 @@ } } }, + "/networking/mcast-fwd": { + "get": { + "summary": "List multicast 
forwarding entries present on this sled.", + "operationId": "list_mcast_fwd", + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "title": "Array_of_McastForwardingEntry", + "type": "array", + "items": { + "$ref": "#/components/schemas/McastForwardingEntry" + } + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + }, + "put": { + "summary": "Set multicast forwarding entries for an underlay address.", + "operationId": "set_mcast_fwd", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/McastForwardingEntry" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + }, + "delete": { + "summary": "Clear multicast forwarding entries for an underlay address.", + "operationId": "clear_mcast_fwd", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ClearMcastForwarding" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, + "/networking/mcast-m2p": { + "get": { + "summary": "List M2P mappings present on this sled.", + "operationId": "list_mcast_m2p", + "responses": { + "200": { + "description": "successful operation", + "content": { + "application/json": { + "schema": { + "title": "Array_of_Mcast2PhysMapping", + "type": "array", + "items": { + "$ref": "#/components/schemas/Mcast2PhysMapping" + } + } + } + } + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + }, + "put": { + "summary": "Set a 
multicast-to-physical (M2P) mapping in OPTE.", + "operationId": "set_mcast_m2p", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Mcast2PhysMapping" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + }, + "delete": { + "summary": "Clear a multicast-to-physical (M2P) mapping in OPTE.", + "operationId": "clear_mcast_m2p", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ClearMcast2Phys" + } + } + }, + "required": true + }, + "responses": { + "204": { + "description": "resource updated" + }, + "4XX": { + "$ref": "#/components/responses/Error" + }, + "5XX": { + "$ref": "#/components/responses/Error" + } + } + } + }, "/omicron-config": { "put": { "operationId": "omicron_config_put", @@ -2310,7 +2466,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/InstanceMulticastBody" + "$ref": "#/components/schemas/InstanceMulticastMembership" } } }, @@ -2344,7 +2500,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/InstanceMulticastBody" + "$ref": "#/components/schemas/InstanceMulticastMembership" } } }, @@ -3998,6 +4154,40 @@ } ] }, + "ClearMcast2Phys": { + "description": "Clear a mapping from an overlay multicast group to an underlay multicast address.", + "type": "object", + "properties": { + "group": { + "description": "Overlay multicast group address.", + "type": "string", + "format": "ip" + }, + "underlay": { + "description": "Underlay IPv6 multicast address. 
See [`Mcast2PhysMapping::underlay`].", + "type": "string", + "format": "ipv6" + } + }, + "required": [ + "group", + "underlay" + ] + }, + "ClearMcastForwarding": { + "description": "Clear all forwarding entries for an underlay multicast address.", + "type": "object", + "properties": { + "underlay": { + "description": "Underlay IPv6 multicast address. See [`Mcast2PhysMapping::underlay`].", + "type": "string", + "format": "ipv6" + } + }, + "required": [ + "underlay" + ] + }, "CombineError": { "type": "string", "enum": [ @@ -5830,35 +6020,6 @@ "src_propolis_addr" ] }, - "InstanceMulticastBody": { - "description": "Request body for multicast group operations.", - "oneOf": [ - { - "type": "object", - "properties": { - "join": { - "$ref": "#/components/schemas/InstanceMulticastMembership" - } - }, - "required": [ - "join" - ], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "leave": { - "$ref": "#/components/schemas/InstanceMulticastMembership" - } - }, - "required": [ - "leave" - ], - "additionalProperties": false - } - ] - }, "InstanceMulticastMembership": { "description": "Represents a multicast group membership for an instance.\n\nIntroduced in v7.", "type": "object", @@ -6677,6 +6838,151 @@ "minimum": 1, "maximum": 32 }, + "Mcast2PhysMapping": { + "description": "Mapping from an overlay multicast group to an underlay multicast address.\n\nThe underlay address must be within `UNDERLAY_MULTICAST_SUBNET` (ff04::/64, a subset of admin-local scope per [RFC 7346]). 
This invariant is enforced by mapping in Nexus, not validated at this layer.\n\n[RFC 7346]: https://www.rfc-editor.org/rfc/rfc7346", + "type": "object", + "properties": { + "group": { + "description": "Overlay multicast group address.", + "type": "string", + "format": "ip" + }, + "underlay": { + "description": "Underlay IPv6 multicast address (ff04::/64).", + "type": "string", + "format": "ipv6" + } + }, + "required": [ + "group", + "underlay" + ] + }, + "McastFilterMode": { + "description": "Filter mode for multicast source filtering.", + "oneOf": [ + { + "description": "Accept only packets from listed sources (SSM).", + "type": "string", + "enum": [ + "include" + ] + }, + { + "description": "Accept packets from all sources except those listed. With an empty sources list this is any-source multicast (ASM).", + "type": "string", + "enum": [ + "exclude" + ] + } + ] + }, + "McastForwardingEntry": { + "description": "Forwarding entry for an underlay multicast address, specifying which next hops should receive replicated packets.", + "type": "object", + "properties": { + "next_hops": { + "description": "Next hops with replication and source filter configuration.", + "type": "array", + "items": { + "$ref": "#/components/schemas/McastForwardingNextHop" + } + }, + "underlay": { + "description": "Underlay IPv6 multicast address. 
See [`Mcast2PhysMapping::underlay`].", + "type": "string", + "format": "ipv6" + } + }, + "required": [ + "next_hops", + "underlay" + ] + }, + "McastForwardingNextHop": { + "description": "A forwarding next hop with replication mode and aggregated source filter.", + "type": "object", + "properties": { + "filter": { + "description": "Aggregated source filter for this destination.", + "allOf": [ + { + "$ref": "#/components/schemas/McastSourceFilter" + } + ] + }, + "next_hop": { + "description": "Unicast IPv6 address of the destination sled.", + "type": "string", + "format": "ipv6" + }, + "replication": { + "description": "Replication mode for this next hop.", + "allOf": [ + { + "$ref": "#/components/schemas/McastReplication" + } + ] + } + }, + "required": [ + "filter", + "next_hop", + "replication" + ] + }, + "McastReplication": { + "description": "Replication mode for multicast forwarding.", + "oneOf": [ + { + "description": "Replicate to front panel ports (egress to external networks).", + "type": "string", + "enum": [ + "external" + ] + }, + { + "description": "Replicate to sled underlay ports.", + "type": "string", + "enum": [ + "underlay" + ] + }, + { + "description": "Replicate to both external and underlay ports.", + "type": "string", + "enum": [ + "both" + ] + } + ] + }, + "McastSourceFilter": { + "description": "Source filter for multicast forwarding.", + "type": "object", + "properties": { + "mode": { + "description": "Filter mode.", + "allOf": [ + { + "$ref": "#/components/schemas/McastFilterMode" + } + ] + }, + "sources": { + "description": "Source addresses to include or exclude.", + "type": "array", + "items": { + "type": "string", + "format": "ip" + } + } + }, + "required": [ + "mode", + "sources" + ] + }, "Measurement": { "description": "An RoT provided measurement which represents a digest of some component in the trusted computing base (TCB) for the attestor.", "oneOf": [ diff --git a/openapi/sled-agent/sled-agent-latest.json 
b/openapi/sled-agent/sled-agent-latest.json index 7f2b470816a..37c49c4d171 120000 --- a/openapi/sled-agent/sled-agent-latest.json +++ b/openapi/sled-agent/sled-agent-latest.json @@ -1 +1 @@ -sled-agent-28.0.0-415efe.json \ No newline at end of file +sled-agent-29.0.0-0f4904.json \ No newline at end of file diff --git a/sled-agent/api/src/lib.rs b/sled-agent/api/src/lib.rs index a326908216b..82b52489fae 100644 --- a/sled-agent/api/src/lib.rs +++ b/sled-agent/api/src/lib.rs @@ -15,8 +15,9 @@ use dropshot_api_manager_types::api_versions; use omicron_common::api::internal::{ nexus::{DiskRuntimeState, SledVmmState}, shared::{ - ExternalIpGatewayMap, ResolvedVpcRouteSet, ResolvedVpcRouteState, - SledIdentifiers, VirtualNetworkInterfaceHost, + ClearMcast2Phys, ClearMcastForwarding, ExternalIpGatewayMap, + Mcast2PhysMapping, McastForwardingEntry, ResolvedVpcRouteSet, + ResolvedVpcRouteState, SledIdentifiers, VirtualNetworkInterfaceHost, }, }; use sled_agent_types_versions::{ @@ -37,6 +38,7 @@ api_versions!([ // | example for the next person. // v // (next_int, IDENT), + (29, MCAST_M2P_FORWARDING), (28, MODIFY_SERVICES_IN_INVENTORY), (27, RENAME_SWITCH_LOCATION_TO_SWITCH_SLOT), (26, RACK_NETWORK_CONFIG_NOT_OPTIONAL), @@ -582,25 +584,79 @@ pub trait SledAgentApi { #[endpoint { method = PUT, path = "/vmms/{propolis_id}/multicast-group", - versions = VERSION_MULTICAST_SUPPORT.., + versions = VERSION_MCAST_M2P_FORWARDING.., }] async fn vmm_join_multicast_group( rqctx: RequestContext, path_params: Path, - body: TypedBody, + body: TypedBody, ) -> Result; #[endpoint { method = DELETE, path = "/vmms/{propolis_id}/multicast-group", - versions = VERSION_MULTICAST_SUPPORT.., + versions = VERSION_MCAST_M2P_FORWARDING.., }] async fn vmm_leave_multicast_group( rqctx: RequestContext, path_params: Path, - body: TypedBody, + body: TypedBody, ) -> Result; + /// Join a multicast group. + /// + /// Accepts a tagged `InstanceMulticastBody` request. + /// Superseded in MCAST_M2P_FORWARDING. 
+ #[endpoint { + operation_id = "vmm_join_multicast_group", + method = PUT, + path = "/vmms/{propolis_id}/multicast-group", + versions = VERSION_MULTICAST_SUPPORT..VERSION_MCAST_M2P_FORWARDING, + }] + async fn vmm_join_multicast_group_v7( + rqctx: RequestContext, + path_params: Path, + body: TypedBody, + ) -> Result { + let body = body.try_map(|b| match b { + v7::instance::InstanceMulticastBody::Join(m) => Ok(m), + v7::instance::InstanceMulticastBody::Leave(_) => { + Err(HttpError::for_bad_request( + None, + "Join endpoint cannot process Leave operations".to_string(), + )) + } + })?; + Self::vmm_join_multicast_group(rqctx, path_params, body).await + } + + /// Leave a multicast group. + /// + /// Accepts a tagged `InstanceMulticastBody` request. + /// Superseded in MCAST_M2P_FORWARDING. + #[endpoint { + operation_id = "vmm_leave_multicast_group", + method = DELETE, + path = "/vmms/{propolis_id}/multicast-group", + versions = VERSION_MULTICAST_SUPPORT..VERSION_MCAST_M2P_FORWARDING, + }] + async fn vmm_leave_multicast_group_v7( + rqctx: RequestContext, + path_params: Path, + body: TypedBody, + ) -> Result { + let body = body.try_map(|b| match b { + v7::instance::InstanceMulticastBody::Leave(m) => Ok(m), + v7::instance::InstanceMulticastBody::Join(_) => { + Err(HttpError::for_bad_request( + None, + "Leave endpoint cannot process Join operations".to_string(), + )) + } + })?; + Self::vmm_leave_multicast_group(rqctx, path_params, body).await + } + #[endpoint { method = PUT, path = "/disks/{disk_id}", @@ -741,6 +797,70 @@ pub trait SledAgentApi { rqctx: RequestContext, ) -> Result>, HttpError>; + /// Set a multicast-to-physical (M2P) mapping in OPTE. + #[endpoint { + method = PUT, + path = "/networking/mcast-m2p", + versions = VERSION_MCAST_M2P_FORWARDING.., + }] + async fn set_mcast_m2p( + rqctx: RequestContext, + body: TypedBody, + ) -> Result; + + /// Clear a multicast-to-physical (M2P) mapping in OPTE. 
+ #[endpoint { + method = DELETE, + path = "/networking/mcast-m2p", + versions = VERSION_MCAST_M2P_FORWARDING.., + }] + async fn clear_mcast_m2p( + rqctx: RequestContext, + body: TypedBody, + ) -> Result; + + /// Set multicast forwarding entries for an underlay address. + #[endpoint { + method = PUT, + path = "/networking/mcast-fwd", + versions = VERSION_MCAST_M2P_FORWARDING.., + }] + async fn set_mcast_fwd( + rqctx: RequestContext, + body: TypedBody, + ) -> Result; + + /// Clear multicast forwarding entries for an underlay address. + #[endpoint { + method = DELETE, + path = "/networking/mcast-fwd", + versions = VERSION_MCAST_M2P_FORWARDING.., + }] + async fn clear_mcast_fwd( + rqctx: RequestContext, + body: TypedBody, + ) -> Result; + + /// List M2P mappings present on this sled. + #[endpoint { + method = GET, + path = "/networking/mcast-m2p", + versions = VERSION_MCAST_M2P_FORWARDING.., + }] + async fn list_mcast_m2p( + rqctx: RequestContext, + ) -> Result>, HttpError>; + + /// List multicast forwarding entries present on this sled. 
+ #[endpoint { + method = GET, + path = "/networking/mcast-fwd", + versions = VERSION_MCAST_M2P_FORWARDING.., + }] + async fn list_mcast_fwd( + rqctx: RequestContext, + ) -> Result>, HttpError>; + #[endpoint { method = POST, path = "/switch-ports", diff --git a/sled-agent/src/http_entrypoints.rs b/sled-agent/src/http_entrypoints.rs index f7811a418e3..bb78a83675f 100644 --- a/sled-agent/src/http_entrypoints.rs +++ b/sled-agent/src/http_entrypoints.rs @@ -19,8 +19,9 @@ use dropshot::{ use omicron_common::api::external::Error; use omicron_common::api::internal::nexus::{DiskRuntimeState, SledVmmState}; use omicron_common::api::internal::shared::{ - ExternalIpGatewayMap, ResolvedVpcRouteSet, ResolvedVpcRouteState, - SledIdentifiers, VirtualNetworkInterfaceHost, + ClearMcast2Phys, ClearMcastForwarding, ExternalIpGatewayMap, + Mcast2PhysMapping, McastForwardingEntry, ResolvedVpcRouteSet, + ResolvedVpcRouteState, SledIdentifiers, VirtualNetworkInterfaceHost, }; use range_requests::PotentialRange; use sled_agent_api::*; @@ -44,7 +45,7 @@ use sled_agent_types::disk::{DiskEnsureBody, DiskPathParam}; use sled_agent_types::early_networking::EarlyNetworkConfigEnvelope; use sled_agent_types::firewall_rules::VpcFirewallRulesEnsureBody; use sled_agent_types::instance::{ - InstanceEnsureBody, InstanceExternalIpBody, InstanceMulticastBody, + InstanceEnsureBody, InstanceExternalIpBody, InstanceMulticastMembership, VmmIssueDiskSnapshotRequestBody, VmmIssueDiskSnapshotRequestPathParam, VmmIssueDiskSnapshotRequestResponse, VmmPathParam, VmmPutStateBody, VmmPutStateResponse, VmmUnregisterResponse, VpcPathParam, @@ -709,14 +710,14 @@ impl SledAgentApi for SledAgentImpl { async fn vmm_join_multicast_group( rqctx: RequestContext, path_params: Path, - body: TypedBody, + body: TypedBody, ) -> Result { let sa = rqctx.context(); let id = path_params.into_inner().propolis_id; - let body_args = body.into_inner(); + let membership = body.into_inner(); sa.latencies() 
.instrument_dropshot_handler(&rqctx, async { - sa.instance_join_multicast_group(id, &body_args).await?; + sa.instance_join_multicast_group(id, &membership).await?; Ok(HttpResponseUpdatedNoContent()) }) .await @@ -725,14 +726,14 @@ impl SledAgentApi for SledAgentImpl { async fn vmm_leave_multicast_group( rqctx: RequestContext, path_params: Path, - body: TypedBody, + body: TypedBody, ) -> Result { let sa = rqctx.context(); let id = path_params.into_inner().propolis_id; - let body_args = body.into_inner(); + let membership = body.into_inner(); sa.latencies() .instrument_dropshot_handler(&rqctx, async { - sa.instance_leave_multicast_group(id, &body_args).await?; + sa.instance_leave_multicast_group(id, &membership).await?; Ok(HttpResponseUpdatedNoContent()) }) .await @@ -931,6 +932,86 @@ impl SledAgentApi for SledAgentImpl { .await } + async fn set_mcast_m2p( + rqctx: RequestContext, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + let body_args = body.into_inner(); + sa.latencies() + .instrument_dropshot_handler(&rqctx, async { + sa.set_mcast_m2p(&body_args).await.map_err(Error::from)?; + Ok(HttpResponseUpdatedNoContent()) + }) + .await + } + + async fn clear_mcast_m2p( + rqctx: RequestContext, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + let body_args = body.into_inner(); + sa.latencies() + .instrument_dropshot_handler(&rqctx, async { + sa.clear_mcast_m2p(&body_args).await.map_err(Error::from)?; + Ok(HttpResponseUpdatedNoContent()) + }) + .await + } + + async fn set_mcast_fwd( + rqctx: RequestContext, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + let body_args = body.into_inner(); + sa.latencies() + .instrument_dropshot_handler(&rqctx, async { + sa.set_mcast_fwd(&body_args).await.map_err(Error::from)?; + Ok(HttpResponseUpdatedNoContent()) + }) + .await + } + + async fn clear_mcast_fwd( + rqctx: RequestContext, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + let body_args = body.into_inner(); + 
sa.latencies() + .instrument_dropshot_handler(&rqctx, async { + sa.clear_mcast_fwd(&body_args).await.map_err(Error::from)?; + Ok(HttpResponseUpdatedNoContent()) + }) + .await + } + + async fn list_mcast_m2p( + rqctx: RequestContext, + ) -> Result>, HttpError> { + let sa = rqctx.context(); + sa.latencies() + .instrument_dropshot_handler(&rqctx, async { + let m2p = sa.list_mcast_m2p().await.map_err(Error::from)?; + Ok(HttpResponseOk(m2p)) + }) + .await + } + + async fn list_mcast_fwd( + rqctx: RequestContext, + ) -> Result>, HttpError> { + let sa = rqctx.context(); + sa.latencies() + .instrument_dropshot_handler(&rqctx, async { + let fwd = sa.list_mcast_fwd().await.map_err(Error::from)?; + Ok(HttpResponseOk(fwd)) + }) + .await + } + async fn uplink_ensure( rqctx: RequestContext, body: TypedBody, diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 66016d6af81..a29afa9c942 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -2497,7 +2497,11 @@ impl InstanceRunner { // for them. let mut opte_ports = Vec::with_capacity(self.requested_nics.len()); let mut opte_port_names = Vec::with_capacity(self.requested_nics.len()); + let mcast_cfg = self.multicast_group_cfgs(); for nic in self.requested_nics.iter() { + // Multicast subscriptions target the primary NIC only. + // See the TODO on ensure_multicast_groups. 
+ let groups: &[_] = if nic.primary { &mcast_cfg } else { &[] }; let port = self.port_manager.create_port(PortCreateParams { nic, external_ips: &self.external_ips, @@ -2509,6 +2513,7 @@ impl InstanceRunner { .copied() .map(Into::into) .collect(), + multicast_groups: groups, })?; opte_port_names.push(port.0.name().to_string()); opte_ports.push(port); @@ -2794,12 +2799,13 @@ impl InstanceRunner { &mut self, membership: &InstanceMulticastMembership, ) -> Result<(), Error> { - // Similar logic to add_external_ip - save state for rollback + // Save pre-call state so rollback restores exactly what was + // present, mirroring add_external_ip's old_config pattern. + let old_groups = self.multicast_groups.clone(); let out = self.join_multicast_group_inner(membership).await; if out.is_err() { - // Rollback state on error - self.multicast_groups.retain(|m| m != membership); + self.multicast_groups = old_groups; } out } @@ -2808,14 +2814,13 @@ impl InstanceRunner { &mut self, membership: &InstanceMulticastMembership, ) -> Result<(), Error> { - // Similar logic to delete_external_ip - save state for rollback + // Save pre-call state so rollback restores exactly what was + // present, mirroring delete_external_ip's old_config pattern. 
+ let old_groups = self.multicast_groups.clone(); let out = self.leave_multicast_group_inner(membership).await; if out.is_err() { - // Rollback state on error - readd the membership if it was removed - if !self.multicast_groups.contains(membership) { - self.multicast_groups.push(membership.clone()); - } + self.multicast_groups = old_groups; } out } @@ -2824,105 +2829,75 @@ impl InstanceRunner { self.refresh_multicast_groups_inner() } - async fn join_multicast_group_inner( - &mut self, - membership: &InstanceMulticastMembership, - ) -> Result<(), Error> { - // Check for duplicate membership (idempotency) - if self.multicast_groups.contains(membership) { - return Ok(()); - } - - // Add to local state - self.multicast_groups.push(membership.clone()); + /// Convert `InstanceMulticastMembership` list to OPTE + /// `MulticastGroupCfg` list. + fn multicast_group_cfgs( + &self, + ) -> Vec { + self.multicast_groups + .iter() + .map(|m| illumos_utils::opte::MulticastGroupCfg { + group_ip: m.group_ip, + sources: m.sources.clone(), + }) + .collect() + } - // Update OPTE configuration + /// Sync the current multicast group memberships to OPTE via the + /// port manager. + /// + // TODO: subscriptions target the primary NIC only. + // InstanceMulticastMembership carries no NIC identifier, same as + // external IPs and attached subnets (though not firewall rules, + // which fan out across all VPC ports by VNI). If per-NIC multicast + // is needed, the membership type needs a NIC field and both this + // function and setup_propolis_zone must be updated. 
+ fn ensure_multicast_groups(&self) -> Result<(), Error> { let Some(primary_nic) = self.primary_nic() else { return Err(Error::Opte(illumos_utils::opte::Error::NoPrimaryNic)); }; - // Convert InstanceMulticastMembership to MulticastGroupCfg - let multicast_cfg: Vec = self - .multicast_groups - .iter() - .map(|membership| illumos_utils::opte::MulticastGroupCfg { - group_ip: membership.group_ip, - sources: membership.sources.clone(), - }) - .collect(); - - // Validate multicast configuration with OPTE self.port_manager.multicast_groups_ensure( primary_nic.id, primary_nic.kind, - &multicast_cfg, + &self.multicast_group_cfgs(), )?; - // TODO: Configure underlay multicast group addresses on the zone's vNIC. - // This should add the multicast group addresses to the zone's network - // interface so it can receive underlay multicast traffic (physical - // network layer). Rack-wide dataplane forwarding is handled by the - // RPW reconciler + DPD. - // See also: port_manager.rs multicast_groups_ensure() TODO about - // configuring OPTE port-level multicast group membership. - Ok(()) } - async fn leave_multicast_group_inner( + async fn join_multicast_group_inner( &mut self, membership: &InstanceMulticastMembership, ) -> Result<(), Error> { - // Remove from local state - self.multicast_groups.retain(|m| m != membership); - - // Update OPTE configuration - let Some(primary_nic) = self.primary_nic() else { - return Err(Error::Opte(illumos_utils::opte::Error::NoPrimaryNic)); - }; + // Idempotent -> skip if already subscribed. 
+ if self.multicast_groups.contains(membership) { + return Ok(()); + } - // Convert InstanceMulticastMembership to MulticastGroupCfg - let multicast_cfg: Vec = self - .multicast_groups - .iter() - .map(|membership| illumos_utils::opte::MulticastGroupCfg { - group_ip: membership.group_ip, - sources: membership.sources.clone(), - }) - .collect(); + self.multicast_groups.push(membership.clone()); + self.ensure_multicast_groups()?; - self.port_manager.multicast_groups_ensure( - primary_nic.id, - primary_nic.kind, - &multicast_cfg, - )?; + // OPTE's xde driver uses mac_siphon_set on the underlay NIC to + // receive all packets (including multicast) at the MAC layer. + // + // Subscription filtering and delivery happen inside OPTE via + // mcast_subscribe. Rack-wide dataplane forwarding is handled by + // the RPW reconciler + DPD. Ok(()) } - fn refresh_multicast_groups_inner(&mut self) -> Result<(), Error> { - // Update OPTE configuration - let Some(primary_nic) = self.primary_nic() else { - return Err(Error::Opte(illumos_utils::opte::Error::NoPrimaryNic)); - }; - - // Convert InstanceMulticastMembership to MulticastGroupCfg - let multicast_cfg: Vec = self - .multicast_groups - .iter() - .map(|membership| illumos_utils::opte::MulticastGroupCfg { - group_ip: membership.group_ip, - sources: membership.sources.clone(), - }) - .collect(); - - self.port_manager.multicast_groups_ensure( - primary_nic.id, - primary_nic.kind, - &multicast_cfg, - )?; + async fn leave_multicast_group_inner( + &mut self, + membership: &InstanceMulticastMembership, + ) -> Result<(), Error> { + self.multicast_groups.retain(|m| m != membership); + self.ensure_multicast_groups() + } - Ok(()) + fn refresh_multicast_groups_inner(&mut self) -> Result<(), Error> { + self.ensure_multicast_groups() } } diff --git a/sled-agent/src/instance_manager.rs b/sled-agent/src/instance_manager.rs index 71c39e93d82..5d941c9cc86 100644 --- a/sled-agent/src/instance_manager.rs +++ b/sled-agent/src/instance_manager.rs @@ 
-26,7 +26,9 @@ use sled_agent_config_reconciler::CurrentlyManagedZpoolsReceiver; use sled_agent_types::attached_subnet::AttachedSubnet; use sled_agent_types::attached_subnet::AttachedSubnets; use sled_agent_types::instance::*; -use sled_agent_types::instance::{InstanceEnsureBody, InstanceMulticastBody}; +use sled_agent_types::instance::{ + InstanceEnsureBody, InstanceMulticastMembership, +}; use slog::Logger; use slog_error_chain::InlineErrorChain; use std::collections::BTreeMap; @@ -308,14 +310,14 @@ impl InstanceManager { pub async fn join_multicast_group( &self, propolis_id: PropolisUuid, - multicast_body: &InstanceMulticastBody, + membership: &InstanceMulticastMembership, ) -> Result<(), Error> { let (tx, rx) = oneshot::channel(); self.inner .tx .send(InstanceManagerRequest::JoinMulticastGroup { propolis_id, - multicast_body: multicast_body.clone(), + membership: membership.clone(), tx, }) .await @@ -327,14 +329,14 @@ impl InstanceManager { pub async fn leave_multicast_group( &self, propolis_id: PropolisUuid, - multicast_body: &InstanceMulticastBody, + membership: &InstanceMulticastMembership, ) -> Result<(), Error> { let (tx, rx) = oneshot::channel(); self.inner .tx .send(InstanceManagerRequest::LeaveMulticastGroup { propolis_id, - multicast_body: multicast_body.clone(), + membership: membership.clone(), tx, }) .await @@ -486,12 +488,12 @@ enum InstanceManagerRequest { }, JoinMulticastGroup { propolis_id: PropolisUuid, - multicast_body: InstanceMulticastBody, + membership: InstanceMulticastMembership, tx: oneshot::Sender>, }, LeaveMulticastGroup { propolis_id: PropolisUuid, - multicast_body: InstanceMulticastBody, + membership: InstanceMulticastMembership, tx: oneshot::Sender>, }, GetState { @@ -632,11 +634,11 @@ impl InstanceManagerRunner { Some(RefreshExternalIps { tx }) => { self.refresh_external_ips(tx) }, - Some(JoinMulticastGroup { propolis_id, multicast_body, tx }) => { - self.join_multicast_group(tx, propolis_id, &multicast_body) + 
Some(JoinMulticastGroup { propolis_id, membership, tx }) => { + self.join_multicast_group(tx, propolis_id, &membership) }, - Some(LeaveMulticastGroup { propolis_id, multicast_body, tx }) => { - self.leave_multicast_group(tx, propolis_id, &multicast_body) + Some(LeaveMulticastGroup { propolis_id, membership, tx }) => { + self.leave_multicast_group(tx, propolis_id, &membership) } Some(GetState { propolis_id, tx }) => { // TODO(eliza): it could potentially be nice to @@ -909,20 +911,12 @@ impl InstanceManagerRunner { &self, tx: oneshot::Sender>, propolis_id: PropolisUuid, - multicast_body: &InstanceMulticastBody, + membership: &InstanceMulticastMembership, ) -> Result<(), Error> { let Some(instance) = self.get_propolis(propolis_id) else { return Err(Error::NoSuchVmm(propolis_id)); }; - - match multicast_body { - InstanceMulticastBody::Join(membership) => { - instance.join_multicast_group(tx, membership)?; - } - InstanceMulticastBody::Leave(membership) => { - instance.leave_multicast_group(tx, membership)?; - } - } + instance.join_multicast_group(tx, membership)?; Ok(()) } @@ -930,20 +924,12 @@ impl InstanceManagerRunner { &self, tx: oneshot::Sender>, propolis_id: PropolisUuid, - multicast_body: &InstanceMulticastBody, + membership: &InstanceMulticastMembership, ) -> Result<(), Error> { let Some(instance) = self.get_propolis(propolis_id) else { return Err(Error::NoSuchVmm(propolis_id)); }; - - match multicast_body { - InstanceMulticastBody::Join(membership) => { - instance.join_multicast_group(tx, membership)?; - } - InstanceMulticastBody::Leave(membership) => { - instance.leave_multicast_group(tx, membership)?; - } - } + instance.leave_multicast_group(tx, membership)?; Ok(()) } diff --git a/sled-agent/src/probe_manager.rs b/sled-agent/src/probe_manager.rs index af54de782e9..f65f495a8bc 100644 --- a/sled-agent/src/probe_manager.rs +++ b/sled-agent/src/probe_manager.rs @@ -378,6 +378,7 @@ impl ProbeManagerInner { // but probes are supposed to mimic instances as closely 
as // possible. We should consider if we want to support them here. attached_subnets: vec![], + multicast_groups: &[], })?; let installed_zone = ZoneBuilderFactory::new() diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 86192d3b94a..2d693c4aaab 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -1219,6 +1219,7 @@ impl ServiceManager { dhcp_config: DhcpCfg::default(), // Services do not use attached subnets, only instances. attached_subnets: vec![], + multicast_groups: &[], }) .map_err(|err| Error::ServicePortCreation { service: zone_kind, diff --git a/sled-agent/src/sim/collection.rs b/sled-agent/src/sim/collection.rs index 5a86d3c867a..b93f12aa429 100644 --- a/sled-agent/src/sim/collection.rs +++ b/sled-agent/src/sim/collection.rs @@ -236,16 +236,17 @@ impl SimCollection { while should_step { let (new_state, to_destroy) = { - // The object must be present in `objects` because it only gets - // removed when it comes to rest in the "Destroyed" state, but - // we can only get here if there's an asynchronous state - // transition desired. - // // We do as little as possible with the lock held. In // particular, we want to finish this work before calling out to // notify the nexus. let mut objects = self.objects.lock().await; - let mut object = objects.remove(&id).unwrap(); + + // The object may already have been destroyed and removed by a + // concurrent poke (e.g., sim_step racing with an explicit poke + // from a test). In that case there is nothing left to do. 
+ let Some(mut object) = objects.remove(&id) else { + return; + }; object.transition_finish(); let after = object.object.current().clone(); diff --git a/sled-agent/src/sim/http_entrypoints.rs b/sled-agent/src/sim/http_entrypoints.rs index 2866f23d58b..56cd7cdc99f 100644 --- a/sled-agent/src/sim/http_entrypoints.rs +++ b/sled-agent/src/sim/http_entrypoints.rs @@ -26,7 +26,11 @@ use dropshot::TypedBody; use dropshot::endpoint; use omicron_common::api::internal::nexus::DiskRuntimeState; use omicron_common::api::internal::nexus::SledVmmState; +use omicron_common::api::internal::shared::ClearMcast2Phys; +use omicron_common::api::internal::shared::ClearMcastForwarding; use omicron_common::api::internal::shared::ExternalIpGatewayMap; +use omicron_common::api::internal::shared::Mcast2PhysMapping; +use omicron_common::api::internal::shared::McastForwardingEntry; use omicron_common::api::internal::shared::SledIdentifiers; use omicron_common::api::internal::shared::VirtualNetworkInterfaceHost; use omicron_common::api::internal::shared::{ @@ -56,7 +60,7 @@ use sled_agent_types::disk::{DiskEnsureBody, DiskPathParam}; use sled_agent_types::early_networking::EarlyNetworkConfigEnvelope; use sled_agent_types::firewall_rules::VpcFirewallRulesEnsureBody; use sled_agent_types::instance::{ - InstanceEnsureBody, InstanceExternalIpBody, InstanceMulticastBody, + InstanceEnsureBody, InstanceExternalIpBody, InstanceMulticastMembership, VmmIssueDiskSnapshotRequestBody, VmmIssueDiskSnapshotRequestPathParam, VmmIssueDiskSnapshotRequestResponse, VmmPathParam, VmmPutStateBody, VmmPutStateResponse, VmmUnregisterResponse, VpcPathParam, @@ -189,52 +193,24 @@ impl SledAgentApi for SledAgentSimImpl { async fn vmm_join_multicast_group( rqctx: RequestContext, path_params: Path, - body: TypedBody, + body: TypedBody, ) -> Result { let sa = rqctx.context(); let propolis_id = path_params.into_inner().propolis_id; - let body_args = body.into_inner(); - - match body_args { - 
InstanceMulticastBody::Join(membership) => { - sa.instance_join_multicast_group(propolis_id, &membership) - .await?; - } - InstanceMulticastBody::Leave(_) => { - // This endpoint is for joining - reject leave operations - return Err(HttpError::for_bad_request( - None, - "Join endpoint cannot process Leave operations".to_string(), - )); - } - } - + let membership = body.into_inner(); + sa.instance_join_multicast_group(propolis_id, &membership).await?; Ok(HttpResponseUpdatedNoContent()) } async fn vmm_leave_multicast_group( rqctx: RequestContext, path_params: Path, - body: TypedBody, + body: TypedBody, ) -> Result { let sa = rqctx.context(); let propolis_id = path_params.into_inner().propolis_id; - let body_args = body.into_inner(); - - match body_args { - InstanceMulticastBody::Leave(membership) => { - sa.instance_leave_multicast_group(propolis_id, &membership) - .await?; - } - InstanceMulticastBody::Join(_) => { - // This endpoint is for leaving - reject join operations - return Err(HttpError::for_bad_request( - None, - "Leave endpoint cannot process Join operations".to_string(), - )); - } - } - + let membership = body.into_inner(); + sa.instance_leave_multicast_group(propolis_id, &membership).await?; Ok(HttpResponseUpdatedNoContent()) } @@ -388,6 +364,66 @@ impl SledAgentApi for SledAgentSimImpl { Ok(HttpResponseOk(vnics)) } + async fn set_mcast_m2p( + rqctx: RequestContext, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + sa.set_mcast_m2p(&body.into_inner()) + .map_err(|e| HttpError::for_internal_error(e.to_string()))?; + Ok(HttpResponseUpdatedNoContent()) + } + + async fn clear_mcast_m2p( + rqctx: RequestContext, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + sa.clear_mcast_m2p(&body.into_inner()) + .map_err(|e| HttpError::for_internal_error(e.to_string()))?; + Ok(HttpResponseUpdatedNoContent()) + } + + async fn set_mcast_fwd( + rqctx: RequestContext, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + 
sa.set_mcast_fwd(&body.into_inner()) + .map_err(|e| HttpError::for_internal_error(e.to_string()))?; + Ok(HttpResponseUpdatedNoContent()) + } + + async fn clear_mcast_fwd( + rqctx: RequestContext, + body: TypedBody, + ) -> Result { + let sa = rqctx.context(); + sa.clear_mcast_fwd(&body.into_inner()) + .map_err(|e| HttpError::for_internal_error(e.to_string()))?; + Ok(HttpResponseUpdatedNoContent()) + } + + async fn list_mcast_m2p( + rqctx: RequestContext, + ) -> Result>, HttpError> { + let sa = rqctx.context(); + let m2p = sa + .list_mcast_m2p() + .map_err(|e| HttpError::for_internal_error(e.to_string()))?; + Ok(HttpResponseOk(m2p)) + } + + async fn list_mcast_fwd( + rqctx: RequestContext, + ) -> Result>, HttpError> { + let sa = rqctx.context(); + let fwd = sa + .list_mcast_fwd() + .map_err(|e| HttpError::for_internal_error(e.to_string()))?; + Ok(HttpResponseOk(fwd)) + } + async fn uplink_ensure( _rqctx: RequestContext, _body: TypedBody, diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index aa304477a56..1ce67d44c72 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -33,8 +33,10 @@ use omicron_common::api::internal::nexus::{ DiskRuntimeState, MigrationRuntimeState, MigrationState, SledVmmState, }; use omicron_common::api::internal::shared::{ - ResolvedVpcRoute, ResolvedVpcRouteSet, ResolvedVpcRouteState, RouterId, - RouterKind, RouterVersion, VirtualNetworkInterfaceHost, + ClearMcast2Phys, ClearMcastForwarding, Mcast2PhysMapping, + McastForwardingEntry, McastForwardingNextHop, ResolvedVpcRoute, + ResolvedVpcRouteSet, ResolvedVpcRouteState, RouterId, RouterKind, + RouterVersion, VirtualNetworkInterfaceHost, }; use omicron_common::disk::{ DatasetsConfig, DatasetsManagementResult, DiskIdentity, DiskVariant, @@ -99,6 +101,8 @@ pub struct SledAgent { pub nexus_client: Arc, pub simulated_upstairs: Arc, pub v2p_mappings: Mutex>, + pub m2p_mappings: Mutex>, + pub mcast_fwd: Mutex>>, mock_propolis: 
futures::lock::Mutex< Option<(propolis_mock_server::Server, PropolisClient)>, >, @@ -188,6 +192,8 @@ impl SledAgent { nexus_client, simulated_upstairs, v2p_mappings: Mutex::new(HashSet::new()), + m2p_mappings: Mutex::new(HashSet::new()), + mcast_fwd: Mutex::new(HashMap::new()), external_ips: Mutex::new(HashMap::new()), attached_subnets: Mutex::new(HashMap::new()), multicast_groups: Mutex::new(HashMap::new()), @@ -676,6 +682,58 @@ impl SledAgent { Ok(Vec::from_iter(v2p_mappings.clone())) } + pub fn set_mcast_m2p(&self, req: &Mcast2PhysMapping) -> Result<(), Error> { + let mut m2p = self.m2p_mappings.lock().unwrap(); + m2p.insert((req.group, req.underlay)); + Ok(()) + } + + pub fn clear_mcast_m2p(&self, req: &ClearMcast2Phys) -> Result<(), Error> { + let mut m2p = self.m2p_mappings.lock().unwrap(); + m2p.remove(&(req.group, req.underlay)); + Ok(()) + } + + pub fn set_mcast_fwd( + &self, + req: &McastForwardingEntry, + ) -> Result<(), Error> { + let mut fwd = self.mcast_fwd.lock().unwrap(); + fwd.insert(req.underlay, req.next_hops.clone()); + Ok(()) + } + + pub fn clear_mcast_fwd( + &self, + req: &ClearMcastForwarding, + ) -> Result<(), Error> { + let mut fwd = self.mcast_fwd.lock().unwrap(); + fwd.remove(&req.underlay); + Ok(()) + } + + pub fn list_mcast_m2p(&self) -> Result, Error> { + let m2p = self.m2p_mappings.lock().unwrap(); + Ok(m2p + .iter() + .map(|(group, underlay)| Mcast2PhysMapping { + group: *group, + underlay: *underlay, + }) + .collect()) + } + + pub fn list_mcast_fwd(&self) -> Result, Error> { + let fwd = self.mcast_fwd.lock().unwrap(); + Ok(fwd + .iter() + .map(|(underlay, next_hops)| McastForwardingEntry { + underlay: *underlay, + next_hops: next_hops.clone(), + }) + .collect()) + } + pub async fn instance_put_external_ip( &self, propolis_id: PropolisUuid, diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 9540ae750bc..184a1a7ba42 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -51,8 +51,10 
@@ use omicron_common::api::external::{ByteCount, ByteCountRangeError, Vni}; use omicron_common::api::internal::nexus::{DiskRuntimeState, SledVmmState}; use omicron_common::api::internal::shared::DelegatedZvol; use omicron_common::api::internal::shared::{ - ExternalIpGatewayMap, ResolvedVpcFirewallRule, ResolvedVpcRouteSet, - ResolvedVpcRouteState, SledIdentifiers, VirtualNetworkInterfaceHost, + ClearMcast2Phys, ClearMcastForwarding, ExternalIpGatewayMap, + Mcast2PhysMapping, McastForwardingEntry, ResolvedVpcFirewallRule, + ResolvedVpcRouteSet, ResolvedVpcRouteState, SledIdentifiers, + VirtualNetworkInterfaceHost, }; use omicron_common::backoff::{ BackoffError, retry_notify, retry_policy_internal_service_aggressive, @@ -79,7 +81,7 @@ use sled_agent_types::disk::DiskStateRequested; use sled_agent_types::early_networking::EarlyNetworkConfigEnvelope; use sled_agent_types::early_networking::RackNetworkConfig; use sled_agent_types::instance::{ - InstanceEnsureBody, InstanceExternalIpBody, InstanceMulticastBody, + InstanceEnsureBody, InstanceExternalIpBody, InstanceMulticastMembership, VmmPutStateResponse, VmmStateRequested, VmmUnregisterResponse, }; use sled_agent_types::inventory::{Inventory, OmicronSledConfig, SledRole}; @@ -415,7 +417,6 @@ struct SledAgentInner { // A handle to the trust quorum. trust_quorum: trust_quorum::NodeTaskHandle, - // A handle to the hardware monitor. hardware_monitor: HardwareMonitorHandle, @@ -1007,26 +1008,28 @@ impl SledAgent { .map_err(|e| Error::Instance(e)) } + /// Subscribe a VMM's OPTE port to a multicast group. pub async fn instance_join_multicast_group( &self, propolis_id: PropolisUuid, - multicast_body: &InstanceMulticastBody, + membership: &InstanceMulticastMembership, ) -> Result<(), Error> { self.inner .instances - .join_multicast_group(propolis_id, multicast_body) + .join_multicast_group(propolis_id, membership) .await .map_err(|e| Error::Instance(e)) } + /// Unsubscribe a VMM's OPTE port from a multicast group. 
pub async fn instance_leave_multicast_group( &self, propolis_id: PropolisUuid, - multicast_body: &InstanceMulticastBody, + membership: &InstanceMulticastMembership, ) -> Result<(), Error> { self.inner .instances - .leave_multicast_group(propolis_id, multicast_body) + .leave_multicast_group(propolis_id, membership) .await .map_err(|e| Error::Instance(e)) } @@ -1111,6 +1114,52 @@ impl SledAgent { .map_err(Error::from) } + /// Install a multicast overlay-to-underlay (M2P) mapping in OPTE. + pub async fn set_mcast_m2p( + &self, + req: &Mcast2PhysMapping, + ) -> Result<(), Error> { + self.inner.port_manager.set_mcast_m2p(req).map_err(Error::from) + } + + /// Remove a multicast overlay-to-underlay (M2P) mapping from OPTE. + pub async fn clear_mcast_m2p( + &self, + req: &ClearMcast2Phys, + ) -> Result<(), Error> { + self.inner.port_manager.clear_mcast_m2p(req).map_err(Error::from) + } + + /// Set multicast forwarding next hops for an underlay group address. + pub async fn set_mcast_fwd( + &self, + req: &McastForwardingEntry, + ) -> Result<(), Error> { + self.inner.port_manager.set_mcast_fwd(req).map_err(Error::from) + } + + /// Remove multicast forwarding entries for an underlay group address. + pub async fn clear_mcast_fwd( + &self, + req: &ClearMcastForwarding, + ) -> Result<(), Error> { + self.inner.port_manager.clear_mcast_fwd(req).map_err(Error::from) + } + + /// List all multicast M2P mappings from OPTE. + pub async fn list_mcast_m2p( + &self, + ) -> Result, Error> { + self.inner.port_manager.list_mcast_m2p().map_err(Error::from) + } + + /// List all multicast forwarding entries from OPTE. 
+ pub async fn list_mcast_fwd( + &self, + ) -> Result, Error> { + self.inner.port_manager.list_mcast_fwd().map_err(Error::from) + } + pub async fn ensure_scrimlet_host_ports( &self, uplinks: Vec, diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 6288b30b730..ffdb9a61cb2 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -155,7 +155,7 @@ usdt = { version = "0.6.0" } usdt-impl-3b31131e45eafb45 = { package = "usdt-impl", version = "0.6.0", default-features = false, features = ["des"] } usdt-impl-d8f496e17d97b5cb = { package = "usdt-impl", version = "0.5.0", default-features = false, features = ["asm", "des"] } uuid = { version = "1.21.0", features = ["serde", "v4"] } -winnow = { version = "0.7.14" } +winnow = { version = "0.7.15" } x509-cert = { version = "0.2.5" } zerocopy = { version = "0.8.40", default-features = false, features = ["derive", "simd"] } zeroize = { version = "1.8.2", features = ["std", "zeroize_derive"] } @@ -307,7 +307,7 @@ usdt-impl-d8f496e17d97b5cb = { package = "usdt-impl", version = "0.5.0", default uuid = { version = "1.21.0", features = ["serde", "v4"] } vergen = { version = "9.0.6", features = ["cargo", "rustc"] } vergen-lib = { version = "0.1.6", features = ["cargo", "git", "rustc"] } -winnow = { version = "0.7.14" } +winnow = { version = "0.7.15" } x509-cert = { version = "0.2.5" } zerocopy = { version = "0.8.40", default-features = false, features = ["derive", "simd"] } zeroize = { version = "1.8.2", features = ["std", "zeroize_derive"] }