diff --git a/PRODENG-3342-plan.md b/PRODENG-3342-plan.md new file mode 100644 index 00000000..87914933 --- /dev/null +++ b/PRODENG-3342-plan.md @@ -0,0 +1,114 @@ +# PRODENG-3342 — Cannot deploy MKE 3.8 using templates from the doc + +**Priority:** Critical | **Reporter:** Dzmitry Stremkouski | **Customer:** Nordea PreSales + +## Problem + +Two sequential failures when following the MKE 3.8 doc template with launchpad 1.5.15 +on Ubuntu 22.04. PRODENG-3393 is a clean repro of failure #1 on fresh nodes. + +--- + +## Failure 1 — MCR install fails (shared with PRODENG-3393) + +Doc template omits the `mcr` block. Launchpad used the `get.mirantis.com` install +script with the old default version (20.10.13), which fails on Ubuntu 22.04 with +exit status 1 after 10 retries on all nodes. + +**This failure is being fixed in the PRODENG-3393 worktree.** Once that fix lands, +cherry-pick or rebase this branch on top of it. Do not duplicate the MCR install +fix here. + +--- + +## Failure 2 — MKE bootstrap connection timeout (this worktree's scope) + +After manually adding the `mcr` block and retrying, MKE install fails: + +``` +INFO MKE install: Possible conflict between Kubernetes service CIDR range + 10.96.0.0/16 and default address pool for Swarm overlay networks 10.0.0.0/8 +... +FATA failed to apply cluster: failed to apply MKE: phase failure: + Upgrade MKE components => [ssh] ...: read: connection timed out +``` + +MKE logs detect the CIDR conflict and warn, but proceed. The install agent then +loses its SSH connection — likely because the Docker daemon restart during MKE +bootstrap tears down the overlay network that the SSH session is riding on, +and the new overlay is in a conflicting address space that disrupts routing. + +The reporter's config had: +```yaml +mke: + installFlags: + - --pod-cidr 10.0.0.0/16 +``` + +`10.0.0.0/16` is a subnet of Swarm's default overlay pool `10.0.0.0/8` — this +is a direct conflict. Docker itself warns about it. 
+ +### Root cause options + +**A — User config error, no code change needed.** +`--pod-cidr 10.0.0.0/16` conflicting with `10.0.0.0/8` is a known Docker Swarm +constraint. The doc should warn users to pick a non-overlapping CIDR. + +**B — Launchpad should detect and reject this at validation time.** +The `ValidateFacts` phase or a dedicated validation phase could parse `--pod-cidr` from +`mke.installFlags`, compare it against the Swarm default overlay pool (`10.0.0.0/8`), +and fail fast with a clear error before any install begins. + +## Recommendation + +Implement option B. A silent connection timeout after 20+ minutes of install is a +poor user experience for a detectable misconfiguration. Fast-fail with a clear message. + +## Acceptance Criteria + +1. `launchpad apply` with a `--pod-cidr` that overlaps the Swarm address pool + (`10.0.0.0/8` unless `--default-addr-pool` overrides it) fails immediately + at validation with a descriptive error naming the conflict. +2. `launchpad apply` with a non-overlapping `--pod-cidr` (e.g. `192.168.0.0/16`) + proceeds normally. +3. Configs without `--pod-cidr` are unaffected. + +## Implementation Plan + +### Step 1 — Add pod-CIDR overlap validation +**File:** `pkg/product/mke/phase/validate_facts.go` (or a new +`pkg/product/mke/phase/validate_config.go`) + +- Parse `--pod-cidr` value from `p.Config.Spec.MKE.InstallFlags`. +- Parse the Swarm default overlay pool: `10.0.0.0/8` (constant or from + `mcr.swarmInstallFlags` if `--default-addr-pool` is set there). +- Use `net.ParseCIDR` + overlap check; fail with an actionable message: + ``` + FATA invalid config: --pod-cidr 10.0.0.0/16 overlaps with the Swarm default + overlay address pool 10.0.0.0/8; choose a non-overlapping range or set + mcr.swarmInstallFlags --default-addr-pool to a non-conflicting pool + ``` + +### Step 2 — Unit tests +**File:** `pkg/product/mke/phase/validate_facts_test.go` (or new test file) + +- Overlapping CIDR → validation error. +- Non-overlapping CIDR → no error. +- No `--pod-cidr` flag → no error. 
+- Custom `--default-addr-pool` in swarmInstallFlags → validate against that instead. + +### Step 3 — Verification +- `make unit-test` passes. +- Manual: `launchpad apply` with conflicting CIDR → immediate clear error. + +## Files in Scope + +| File | Change | +|---|---| +| `pkg/product/mke/phase/validate_facts.go` | Add pod-CIDR / Swarm overlap check | +| `pkg/product/mke/phase/validate_facts_test.go` | Unit tests for new validation | +| `pkg/product/mke/config/` | Possibly add `Flags.GetValue` helper if not present | + +## Dependencies + +- MCR install fix (PRODENG-3393) must land first or be cherry-picked in, since + failure 1 blocks reaching failure 2 in any real test. diff --git a/pkg/product/mke/phase/validate_facts.go b/pkg/product/mke/phase/validate_facts.go index 83ad853f..3db73bad 100644 --- a/pkg/product/mke/phase/validate_facts.go +++ b/pkg/product/mke/phase/validate_facts.go @@ -3,6 +3,7 @@ package phase import ( "errors" "fmt" + "net" "strconv" "github.com/Mirantis/launchpad/pkg/mke" @@ -69,6 +70,10 @@ func (p *ValidateFacts) Run() error { } } + if err := p.validatePodCIDR(); err != nil { + return errors.Join(ErrFactsArentValid, err) + } + return nil } @@ -195,3 +200,48 @@ func (p *ValidateFacts) validateDataPlane() error { return nil } + +var errInvalidPodCIDR = errors.New("invalid pod CIDR configuration") + +// swarmDefaultAddrPool is the Docker Swarm default overlay address pool. +const swarmDefaultAddrPool = "10.0.0.0/8" + +// validatePodCIDR checks that --pod-cidr in mke.installFlags does not overlap +// with the Swarm overlay address pool. Overlapping CIDRs cause the Docker daemon +// to restart into a broken network state during MKE bootstrap, which silently +// drops the SSH connection and produces a connection timeout after 20+ minutes. +// +// If mcr.swarmInstallFlags contains --default-addr-pool, that value is used as +// the Swarm pool instead of the compiled-in default (10.0.0.0/8). 
+func (p *ValidateFacts) validatePodCIDR() error { + podCIDRStr := p.Config.Spec.MKE.InstallFlags.GetValue("--pod-cidr") + if podCIDRStr == "" { + return nil + } + + swarmPoolStr := p.Config.Spec.MCR.SwarmInstallFlags.GetValue("--default-addr-pool") + if swarmPoolStr == "" { + swarmPoolStr = swarmDefaultAddrPool + } + + _, podNet, err := net.ParseCIDR(podCIDRStr) + if err != nil { + return fmt.Errorf("%w: cannot parse --pod-cidr %q: %w", errInvalidPodCIDR, podCIDRStr, err) + } + + _, swarmNet, err := net.ParseCIDR(swarmPoolStr) + if err != nil { + return fmt.Errorf("%w: cannot parse Swarm address pool %q: %w", errInvalidPodCIDR, swarmPoolStr, err) + } + + if swarmNet.Contains(podNet.IP) || podNet.Contains(swarmNet.IP) { + return fmt.Errorf( + "%w: --pod-cidr %s overlaps with the Swarm overlay address pool %s; "+ + "choose a non-overlapping range or set mcr.swarmInstallFlags --default-addr-pool to a non-conflicting pool", + errInvalidPodCIDR, podCIDRStr, swarmPoolStr, + ) + } + + log.Debugf("pod CIDR %s does not overlap with Swarm pool %s", podCIDRStr, swarmPoolStr) + return nil +} diff --git a/pkg/product/mke/phase/validate_facts_test.go b/pkg/product/mke/phase/validate_facts_test.go index a8b4185b..16a2bbcd 100644 --- a/pkg/product/mke/phase/validate_facts_test.go +++ b/pkg/product/mke/phase/validate_facts_test.go @@ -236,3 +236,70 @@ func TestValidateInvalidMCRConfig(t *testing.T) { require.Error(t, phase.Run(), "MCR version validated an invalid config") } + +func makePhaseWithPodCIDR(podCIDR string, swarmPool string) ValidateFacts { + p := ValidateFacts{} + installFlags := commonconfig.Flags{"--san=10.0.0.1"} + if podCIDR != "" { + installFlags = append(installFlags, "--pod-cidr "+podCIDR) + } + swarmFlags := commonconfig.Flags{} + if swarmPool != "" { + swarmFlags = append(swarmFlags, "--default-addr-pool "+swarmPool) + } + p.Config = &mkeconfig.ClusterConfig{ + Spec: &mkeconfig.ClusterSpec{ + Hosts: mkeconfig.Hosts{ + &mkeconfig.Host{Connection: 
rig.Connection{SSH: &rig.SSH{Address: "10.0.0.1"}}, Role: "manager"}, + }, + MCR: commonconfig.MCRConfig{ + Channel: "stable-29.4", + SwarmInstallFlags: swarmFlags, + Metadata: &commonconfig.MCRMetadata{}, + }, + MKE: mkeconfig.MKEConfig{ + Version: "3.8.2", + Metadata: &mkeconfig.MKEMetadata{}, + InstallFlags: installFlags, + }, + }, + } + return p +} + +func TestValidatePodCIDROverlapsDefaultPool(t *testing.T) { + // 10.0.0.0/16 is a subnet of the default Swarm pool 10.0.0.0/8 — must fail. + p := makePhaseWithPodCIDR("10.0.0.0/16", "") + err := p.validatePodCIDR() + require.Error(t, err) + require.ErrorIs(t, err, errInvalidPodCIDR) + require.ErrorContains(t, err, "10.0.0.0/16") + require.ErrorContains(t, err, "10.0.0.0/8") +} + +func TestValidatePodCIDRNoOverlap(t *testing.T) { + // 192.168.0.0/16 does not overlap with 10.0.0.0/8. + p := makePhaseWithPodCIDR("192.168.0.0/16", "") + require.NoError(t, p.validatePodCIDR()) +} + +func TestValidatePodCIDRAbsent(t *testing.T) { + // No --pod-cidr flag — validation must be a no-op. + p := makePhaseWithPodCIDR("", "") + require.NoError(t, p.validatePodCIDR()) +} + +func TestValidatePodCIDRCustomSwarmPool(t *testing.T) { + // --default-addr-pool overrides the compiled-in default. + // 10.0.0.0/16 would overlap 10.0.0.0/8 but not 172.16.0.0/12. + p := makePhaseWithPodCIDR("10.0.0.0/16", "172.16.0.0/12") + require.NoError(t, p.validatePodCIDR()) +} + +func TestValidatePodCIDRCustomSwarmPoolOverlap(t *testing.T) { + // Custom pool 192.168.0.0/16 overlaps with pod-cidr 192.168.1.0/24. + p := makePhaseWithPodCIDR("192.168.1.0/24", "192.168.0.0/16") + err := p.validatePodCIDR() + require.Error(t, err) + require.ErrorIs(t, err, errInvalidPodCIDR) +}