From bd9d1c28856ffb60041a0d8e9355c50655b9a610 Mon Sep 17 00:00:00 2001 From: James Nesbitt Date: Wed, 27 May 2026 16:43:09 +0300 Subject: [PATCH] Add swarmAddress override for Swarm advertise address per host MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In stretched/multi-DC environments, nodes have a private NIC IP (InternalAddress) that is not routable across DCs, and a floating/public SSH address that is. Launchpad previously used InternalAddress unconditionally for all Swarm init and join operations, causing Swarm to fail to form across DCs. Changes: - Add SwarmAddressOverride field (yaml: swarmAddress) to Host config. When set, SwarmAddress() returns this value instead of InternalAddress. The field is optional — behaviour is unchanged for hosts that omit it. - Thread --advertise-addr=h.SwarmAddress() into the swarm join command for both manager and worker joins. Previously only swarm init set an advertise address; joining nodes auto-detected their address (always the private NIC), which also broke cross-DC membership. - Add TestHostSwarmAddressOverride and TestHostSwarmAddressOverrideEmpty. PRODENG-3336 --- PRODENG-3336-plan.md | 130 ++++++++++++++++++++++ pkg/product/mke/config/host.go | 17 ++- pkg/product/mke/config/host_test.go | 28 +++++ pkg/product/mke/phase/join_controllers.go | 2 +- pkg/product/mke/phase/join_workers.go | 2 +- 5 files changed, 175 insertions(+), 4 deletions(-) create mode 100644 PRODENG-3336-plan.md diff --git a/PRODENG-3336-plan.md b/PRODENG-3336-plan.md new file mode 100644 index 00000000..654b8454 --- /dev/null +++ b/PRODENG-3336-plan.md @@ -0,0 +1,130 @@ +# PRODENG-3336 — Launchpad uses private IPs for Swarm instead of config addresses + +**Priority:** Major | **Reporter:** Dzmitry Stremkouski + +## Problem + +In stretched/multi-DC environments, hosts have: +- A private interface IP (e.g. `192.168.x.x`) — not routable across DCs +- A floating/public SSH address (e.g. 
`172.19.x.x`) — routable across DCs + +Launchpad unconditionally uses `Metadata.InternalAddress` (resolved from +`privateInterface`) as the Swarm advertise address and join target. +In stretched topologies this means Swarm nodes cannot reach each other and +the cluster fails to form. + +Reporter's working workaround: +```bash +docker swarm init --advertise-addr 172.19.121.30 --force-new-cluster +docker swarm join --token <token> --advertise-addr 172.19.124.62 172.19.121.30:2377 +``` +i.e. using the SSH/floating addresses from the launchpad config. + +## Root Cause + +`pkg/product/mke/config/host.go`: +```go +func (h *Host) SwarmAddress() string { + return fmt.Sprintf("%s:%d", h.Metadata.InternalAddress, 2377) +} +``` + +`InternalAddress` is set in `gather_facts.go` by querying the `privateInterface` +NIC — always a local, possibly non-routable IP. + +Additionally, `swarm join` does not set `--advertise-addr` for the joining node at all: +```go +// join_controllers.go / join_workers.go +joinCmd := h.Configurer.DockerCommandf("swarm join --token %s %s", + token, swarmLeader.SwarmAddress()) +``` +The joining node will advertise its own auto-detected address, which will again +be the private NIC IP and not reachable across DCs. + +## Acceptance Criteria + +1. Users can set an explicit `swarmAddress` per host in the launchpad YAML. +2. When set, that address is used for `swarm init --advertise-addr` (leader) + and `swarm join --advertise-addr` (joining nodes). +3. When not set, behaviour is unchanged (falls back to `InternalAddress`). +4. The join command for both managers and workers passes + `--advertise-addr=<joining host's swarm address>`. + +## Implementation Plan + +### Step 1 — Add `swarmAddress` field to Host config +**File:** `pkg/product/mke/config/host.go` + +```go +// Host struct — add field: +SwarmAddress string `yaml:"swarmAddress,omitempty"` +``` + +Update `SwarmAddress()` method (rename to avoid collision): +```go +// SwarmAddr returns the address used for Swarm clustering. 
+// Uses the explicit swarmAddress config field when set, +// otherwise falls back to the discovered InternalAddress. +func (h *Host) SwarmAddr() string { + addr := h.Metadata.InternalAddress + if h.SwarmAddress != "" { + addr = h.SwarmAddress + } + return fmt.Sprintf("%s:%d", addr, 2377) +} +``` + +Note: rename from `SwarmAddress()` to `SwarmAddr()` to avoid the field/method +name collision, OR keep the field as `swarmAddress` yaml but name the struct +field `SwarmAddressOverride`. Pick one and apply consistently. + +### Step 2 — Update all SwarmAddress() call sites +**Files:** `pkg/product/mke/phase/init_swarm.go`, +`pkg/product/mke/phase/join_controllers.go`, +`pkg/product/mke/phase/join_workers.go`, +`pkg/product/mke/config/cluster_spec.go` + +Run `lsp references` on `SwarmAddress` before touching anything. +Update each call site to use the renamed method. + +### Step 3 — Add `--advertise-addr` to join commands +**File:** `pkg/product/mke/phase/join_controllers.go` + +```go +joinCmd := h.Configurer.DockerCommandf( + "swarm join --advertise-addr=%s --token %s %s", + h.SwarmAddr(), token, swarmLeader.SwarmAddr()) +``` + +**File:** `pkg/product/mke/phase/join_workers.go` — same pattern. + +### Step 4 — Update tests +**File:** `pkg/product/mke/config/host_test.go` + +- `TestHostSwarmAddress`: add cases for explicit `swarmAddress` override. +- Add case: override takes precedence over `InternalAddress`. +- Add case: empty override falls back to `InternalAddress`. + +### Step 5 — Verification +- `make unit-test` passes. +- Manual: deploy with `swarmAddress` set on each host to the SSH address; + confirm Swarm forms correctly across DCs. 
+ +## Files in Scope + +| File | Change | +|---|---| +| `pkg/product/mke/config/host.go` | Add `SwarmAddress` field; rename/update `SwarmAddr()` method | +| `pkg/product/mke/config/host_test.go` | Test override behaviour | +| `pkg/product/mke/phase/init_swarm.go` | Update call site | +| `pkg/product/mke/phase/join_controllers.go` | Update call site; add `--advertise-addr` for joining node | +| `pkg/product/mke/phase/join_workers.go` | Update call site; add `--advertise-addr` for joining node | +| `pkg/product/mke/config/cluster_spec.go` | Update call site if present | + +## Notes + +- No migration needed — the field is purely additive and optional. +- The `privateInterface` / `InternalAddress` path is unchanged for users who + don't need stretched topology. +- Consider adding a validation warning when `SwarmAddress` is set but equals + `InternalAddress` (no-op override, probably a config mistake). diff --git a/pkg/product/mke/config/host.go b/pkg/product/mke/config/host.go index 06ade13e..e8565eec 100644 --- a/pkg/product/mke/config/host.go +++ b/pkg/product/mke/config/host.go @@ -80,6 +80,12 @@ type Host struct { SudoDocker bool `yaml:"sudodocker"` SudoOverride bool `yaml:"sudooverride"` // some customers can't allow the default rig connection sudo detection MCRUpgradeSkip bool `yaml:"mcrupgradeskip"` // don't upgrade this host when upgraing MCR (to allow upgrades in batches + // SwarmAddressOverride, when set, is used as the advertise address for + // Docker Swarm init and join operations instead of the discovered + // InternalAddress. Use this in stretched/multi-DC environments where the + // private NIC IP is not routable across DCs but the SSH/floating address is. + SwarmAddressOverride string `yaml:"swarmAddress,omitempty"` + Metadata *HostMetadata `yaml:"-"` MSRMetadata *MSRMetadata `yaml:"-"` @@ -190,9 +196,16 @@ func (h *Host) AuthenticateDocker(imageRepo string) error { return nil } -// SwarmAddress determines the swarm address for the host. 
+// SwarmAddress returns the address used for Swarm clustering. +// When swarmAddress is set in the host config it takes precedence over the +// discovered InternalAddress, allowing users in stretched/multi-DC environments +// to specify a routable floating address instead of the private NIC IP. func (h *Host) SwarmAddress() string { - return fmt.Sprintf("%s:%d", h.Metadata.InternalAddress, 2377) + addr := h.Metadata.InternalAddress + if h.SwarmAddressOverride != "" { + addr = h.SwarmAddressOverride + } + return fmt.Sprintf("%s:%d", addr, 2377) } // MCRVersion returns the current engine version installed on the host. diff --git a/pkg/product/mke/config/host_test.go b/pkg/product/mke/config/host_test.go index f4230df3..b6d82ca4 100644 --- a/pkg/product/mke/config/host_test.go +++ b/pkg/product/mke/config/host_test.go @@ -35,6 +35,34 @@ func TestHostSwarmAddress(t *testing.T) { require.Equal(t, "1.2.3.4:2377", h.SwarmAddress()) } +func TestHostSwarmAddressOverride(t *testing.T) { + // When SwarmAddressOverride is set it takes precedence over InternalAddress. + h := Host{ + Connection: rig.Connection{ + SSH: &rig.SSH{Address: "172.19.121.30"}, + }, + SwarmAddressOverride: "172.19.121.30", + Metadata: &HostMetadata{ + InternalAddress: "192.168.1.10", + }, + } + require.Equal(t, "172.19.121.30:2377", h.SwarmAddress()) +} + +func TestHostSwarmAddressOverrideEmpty(t *testing.T) { + // An empty SwarmAddressOverride falls back to InternalAddress. 
+ h := Host{ + Connection: rig.Connection{ + SSH: &rig.SSH{Address: "172.19.121.30"}, + }, + SwarmAddressOverride: "", + Metadata: &HostMetadata{ + InternalAddress: "192.168.1.10", + }, + } + require.Equal(t, "192.168.1.10:2377", h.SwarmAddress()) +} + func TestHostAddress(t *testing.T) { h := Host{ Connection: rig.Connection{ diff --git a/pkg/product/mke/phase/join_controllers.go b/pkg/product/mke/phase/join_controllers.go index e7818da7..4dbcbdb8 100644 --- a/pkg/product/mke/phase/join_controllers.go +++ b/pkg/product/mke/phase/join_controllers.go @@ -29,7 +29,7 @@ func (p *JoinManagers) Run() error { log.Infof("%s: already a swarm node", h) continue } - joinCmd := h.Configurer.DockerCommandf("swarm join --token %s %s", p.Config.Spec.MCR.Metadata.ManagerJoinToken, swarmLeader.SwarmAddress()) + joinCmd := h.Configurer.DockerCommandf("swarm join --advertise-addr=%s --token %s %s", h.SwarmAddress(), p.Config.Spec.MCR.Metadata.ManagerJoinToken, swarmLeader.SwarmAddress()) log.Debugf("%s: joining as manager", h) err := h.Exec(joinCmd, exec.StreamOutput(), exec.RedactString(p.Config.Spec.MCR.Metadata.ManagerJoinToken)) if err != nil { diff --git a/pkg/product/mke/phase/join_workers.go b/pkg/product/mke/phase/join_workers.go index ce6f1ad9..e161f345 100644 --- a/pkg/product/mke/phase/join_workers.go +++ b/pkg/product/mke/phase/join_workers.go @@ -33,7 +33,7 @@ func (p *JoinWorkers) Run() error { log.Infof("%s: already a swarm node", h) continue } - joinCmd := h.Configurer.DockerCommandf("swarm join --token %s %s", p.Config.Spec.MCR.Metadata.WorkerJoinToken, swarmLeader.SwarmAddress()) + joinCmd := h.Configurer.DockerCommandf("swarm join --advertise-addr=%s --token %s %s", h.SwarmAddress(), p.Config.Spec.MCR.Metadata.WorkerJoinToken, swarmLeader.SwarmAddress()) log.Debugf("%s: joining as worker", h) err := h.Exec(joinCmd, exec.RedactString(p.Config.Spec.MCR.Metadata.WorkerJoinToken)) if err != nil {