From 35f71ba905eec3268d7ca578e176f9cf349d1f3c Mon Sep 17 00:00:00 2001
From: Dave Woodruff <david.l.woodruff@gmail.com>
Date: Sat, 23 May 2026 19:05:15 -0700
Subject: [PATCH 1/9] flex-ranks phase 1: rank apportionment by target ratio

Add apportion_ranks(ratios, n_proc): largest-remainder (Hare quota)
apportionment of a fixed rank pool across cylinders by target ratio,
with a floor of one rank per cylinder. Counts sum to exactly n_proc and
every cylinder gets at least one rank. Raises when there are more
cylinders than ranks. Pure function; not yet wired into WheelSpinner.

Per doc/designs/flexible_rank_assignments.md, the User Interface
apportionment algorithm.

Tests cover the worked design-doc example, tie-breaking by declaration
order, scale invariance, the floor-of-one rescue, C==np and C>np, and a
property sweep (sum==n_proc, all >=1) across many cases. Wired into
run_coverage.bash and test_pr_and_main.yml.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/test_pr_and_main.yml   |   3 +-
 mpisppy/tests/test_rank_apportionment.py | 110 +++++++++++++++++++++++
 mpisppy/utils/rank_apportionment.py      |  84 +++++++++++++++++
 run_coverage.bash                        |   3 +
 4 files changed, 199 insertions(+), 1 deletion(-)
 create mode 100644 mpisppy/tests/test_rank_apportionment.py
 create mode 100644 mpisppy/utils/rank_apportionment.py

diff --git a/.github/workflows/test_pr_and_main.yml b/.github/workflows/test_pr_and_main.yml
index f8ea33ac2..9b292ffd3 100644
--- a/.github/workflows/test_pr_and_main.yml
+++ b/.github/workflows/test_pr_and_main.yml
@@ -1127,7 +1127,8 @@ jobs:
             mpisppy/tests/test_reduced_costs_rho_bundles.py \
             mpisppy/tests/test_rho_deprecations.py \
             mpisppy/tests/test_buffer_inspect.py \
-            mpisppy/tests/test_comm_lor_check.py
+            mpisppy/tests/test_comm_lor_check.py \
+            mpisppy/tests/test_rank_apportionment.py
 
       - name: Upload coverage data
         if: always()
diff --git a/mpisppy/tests/test_rank_apportionment.py b/mpisppy/tests/test_rank_apportionment.py
new file mode 100644
index 000000000..1bc49e565
--- /dev/null
+++ b/mpisppy/tests/test_rank_apportionment.py
@@ -0,0 +1,110 @@
+###############################################################################
+# mpi-sppy: MPI-based Stochastic Programming in PYthon
+#
+# Copyright (c) 2024, Lawrence Livermore National Security, LLC, Alliance for
+# Sustainable Energy, LLC, The Regents of the University of California, et al.
+# All rights reserved. Please see the files COPYRIGHT.md and LICENSE.md for
+# full copyright and license information.
+###############################################################################
+"""Unit tests for mpisppy.utils.rank_apportionment.apportion_ranks."""
+
+import unittest
+
+from mpisppy.utils.rank_apportionment import apportion_ranks
+
+
+class TestApportionRanks(unittest.TestCase):
+
+    def test_equal_ratios_divisible(self):
+        # Clean split when ranks divide evenly among equal cylinders.
+        self.assertEqual(apportion_ranks([1.0, 1.0, 1.0], 9), [3, 3, 3])
+
+    def test_equal_ratios_indivisible_breaks_ties_by_order(self):
+        # 10 ranks, 3 equal cylinders: each floors to 3 (sum 9), and the
+        # single leftover goes to the lowest-index cylinder on the tie.
+        self.assertEqual(apportion_ranks([1.0, 1.0, 1.0], 10), [4, 3, 3])
+
+    def test_two_equal_cylinders_tie(self):
+        # 3 ranks, 2 equal cylinders: shares 1.5 / 1.5, leftover -> idx 0.
+        self.assertEqual(apportion_ranks([1.0, 1.0], 3), [2, 1])
+
+    def test_design_doc_example(self):
+        # The worked example from the design doc: hub 1.0, lagrangian 0.5,
+        # xhat 0.25, np 14  ->  8 / 4 / 2 (exact, no remainder).
+        self.assertEqual(apportion_ranks([1.0, 0.5, 0.25], 14), [8, 4, 2])
+
+    def test_ratio_scale_invariance(self):
+        # Only relative magnitudes matter.
+        self.assertEqual(
+            apportion_ranks([4.0, 2.0, 1.0], 14),
+            apportion_ranks([1.0, 0.5, 0.25], 14),
+        )
+
+    def test_floor_of_one_enforced(self):
+        # A tiny ratio would round to zero; the floor-of-one pass rescues it
+        # by taking a rank from the largest cylinder.
+        counts = apportion_ranks([10.0, 0.01], 3)
+        self.assertEqual(counts, [2, 1])
+
+    def test_floor_of_one_many_small(self):
+        # Several near-zero cylinders all get rescued to 1.
+        counts = apportion_ranks([100.0, 0.001, 0.001, 0.001], 7)
+        self.assertEqual(sum(counts), 7)
+        self.assertTrue(all(c >= 1 for c in counts))
+        self.assertEqual(counts[0], 4)  # big cylinder keeps the rest
+
+    def test_cylinders_equal_ranks(self):
+        # C == n_proc: everyone gets exactly one.
+        self.assertEqual(apportion_ranks([1.0, 2.0, 3.0], 3), [1, 1, 1])
+
+    def test_single_cylinder_gets_all(self):
+        self.assertEqual(apportion_ranks([1.0], 5), [5])
+
+    def test_more_cylinders_than_ranks_raises(self):
+        with self.assertRaises(ValueError):
+            apportion_ranks([1.0, 1.0, 1.0, 1.0], 3)
+
+    def test_invalid_inputs_raise(self):
+        with self.assertRaises(ValueError):
+            apportion_ranks([], 4)
+        with self.assertRaises(ValueError):
+            apportion_ranks([1.0, 1.0], 0)
+        with self.assertRaises(ValueError):
+            apportion_ranks([1.0, -1.0], 4)
+        with self.assertRaises(ValueError):
+            apportion_ranks([1.0, 0.0], 4)
+
+    def test_invariants_over_many_cases(self):
+        # Property sweep: counts always sum to n_proc, are all >= 1, and
+        # match the cylinder count.
+        ratio_pools = [
+            [1.0, 1.0, 1.0],
+            [1.0, 0.5, 0.25],
+            [3.0, 1.0],
+            [5.0, 2.0, 2.0, 1.0],
+            [1.0, 1.0, 1.0, 1.0, 1.0],
+        ]
+        for ratios in ratio_pools:
+            for n_proc in range(len(ratios), len(ratios) + 25):
+                counts = apportion_ranks(ratios, n_proc)
+                self.assertEqual(len(counts), len(ratios))
+                self.assertEqual(sum(counts), n_proc)
+                self.assertTrue(all(c >= 1 for c in counts),
+                                msg=f"{ratios=} {n_proc=} -> {counts=}")
+
+    def test_monotone_in_ratio(self):
+        # With enough ranks, a larger ratio never gets fewer ranks than a
+        # smaller one (sanity on the apportionment ordering).
+        counts = apportion_ranks([3.0, 2.0, 1.0], 30)
+        self.assertGreaterEqual(counts[0], counts[1])
+        self.assertGreaterEqual(counts[1], counts[2])
+
+    def test_order_preserved(self):
+        # Output position i corresponds to input ratio i.
+        a = apportion_ranks([1.0, 0.5, 0.25], 14)
+        b = apportion_ranks([0.25, 0.5, 1.0], 14)
+        self.assertEqual(a, list(reversed(b)))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/mpisppy/utils/rank_apportionment.py b/mpisppy/utils/rank_apportionment.py
new file mode 100644
index 000000000..df374ad15
--- /dev/null
+++ b/mpisppy/utils/rank_apportionment.py
@@ -0,0 +1,84 @@
+###############################################################################
+# mpi-sppy: MPI-based Stochastic Programming in PYthon
+#
+# Copyright (c) 2024, Lawrence Livermore National Security, LLC, Alliance for
+# Sustainable Energy, LLC, The Regents of the University of California, et al.
+# All rights reserved. Please see the files COPYRIGHT.md and LICENSE.md for
+# full copyright and license information.
+###############################################################################
+"""Apportion a fixed pool of MPI ranks across cylinders by target ratio.
+
+Supports flexible (per-cylinder) rank counts for WheelSpinner.  See
+``doc/designs/flexible_rank_assignments.md`` (the "User Interface"
+section) for the algorithm and its rationale.
+"""
+
+import math
+
+
+def apportion_ranks(ratios, n_proc):
+    """Apportion ``n_proc`` ranks across cylinders by target ratio.
+
+    Uses the largest-remainder (Hare quota) method, then enforces a
+    floor of one rank per cylinder.  The returned counts sum to exactly
+    ``n_proc`` and every entry is at least 1, so every cylinder runs.
+
+    Args:
+        ratios: ordered sequence of positive target ratios, one per
+            cylinder, in declaration order (the hub is conventionally
+            first with ratio 1.0).  Only relative magnitudes matter.
+        n_proc: total number of MPI ranks to distribute (> 0).
+
+    Returns:
+        list[int]: rank count per cylinder, in the same order as
+        ``ratios``, summing to ``n_proc``, each at least 1.
+
+    Raises:
+        ValueError: if there are more cylinders than ranks (a floor of
+            one each is then infeasible), or if any ratio is
+            non-positive, or if ``n_proc`` is non-positive, or if no
+            cylinders are given.
+    """
+    n_cyl = len(ratios)
+    if n_cyl == 0:
+        raise ValueError("apportion_ranks: need at least one cylinder")
+    if n_proc <= 0:
+        raise ValueError(f"apportion_ranks: n_proc must be positive (got {n_proc})")
+    if any(r <= 0 for r in ratios):
+        raise ValueError(f"apportion_ranks: ratios must be positive (got {list(ratios)})")
+    if n_cyl > n_proc:
+        raise ValueError(
+            f"apportion_ranks: {n_cyl} cylinders cannot each receive at least "
+            f"one rank from only {n_proc} ranks"
+        )
+
+    total = math.fsum(ratios)
+    real_share = [r / total * n_proc for r in ratios]
+    counts = [math.floor(x) for x in real_share]
+
+    # Largest-remainder: hand out the leftover ranks to the biggest
+    # fractional remainders, breaking ties by declaration order (lowest
+    # index first).
+    leftover = n_proc - sum(counts)
+    if leftover > 0:
+        by_remainder = sorted(
+            range(n_cyl),
+            key=lambda i: (real_share[i] - counts[i], -i),
+            reverse=True,
+        )
+        for i in by_remainder[:leftover]:
+            counts[i] += 1
+
+    # Floor of one: move a rank from the currently-largest cylinder to a
+    # cylinder stuck at zero, until none remain at zero.  This is always
+    # feasible because n_cyl <= n_proc, so whenever a zero exists some
+    # other cylinder holds at least two.
+    while True:
+        zeros = [i for i, c in enumerate(counts) if c == 0]
+        if not zeros:
+            break
+        donor = max(range(n_cyl), key=lambda i: counts[i])
+        counts[donor] -= 1
+        counts[zeros[0]] += 1
+
+    return counts
diff --git a/run_coverage.bash b/run_coverage.bash
index 1cfb7a1cf..996a6c295 100755
--- a/run_coverage.bash
+++ b/run_coverage.bash
@@ -121,6 +121,9 @@ run_phase "test_reduced_costs_rho_bundles (serial)" \
 run_phase "test_rho_deprecations (serial)" \
     coverage run --rcfile=.coveragerc -m pytest mpisppy/tests/test_rho_deprecations.py -v
 
+run_phase "test_rank_apportionment (serial)" \
+    coverage run --rcfile=.coveragerc -m pytest mpisppy/tests/test_rank_apportionment.py -v
+
 run_phase "test_xhat_from_file (serial)" \
     coverage run --rcfile=.coveragerc -m pytest mpisppy/tests/test_xhat_from_file.py -v
 

From fc63b39b67f49893d95d9a9be910adee008a9d33 Mon Sep 17 00:00:00 2001
From: Dave Woodruff <david.l.woodruff@gmail.com>
Date: Sat, 23 May 2026 19:07:57 -0700
Subject: [PATCH 2/9] flex-ranks phase 1: overlap-map computation

Add OverlapSegment and compute_overlap_segments(): given a local rank's
scenarios, a peer cylinder's contiguous scenario partition (the slices
from scen_names_to_ranks), and a per-scenario item count, produce the
list of segments that back the local field buffer from the peer's
buffers. Contiguous same-rank runs are coalesced; equal rank counts
degenerate to a single identity segment per rank.

Pure (no MPI/Pyomo), so it unit tests directly. Not yet wired into the
communicator. Per doc/designs/flexible_rank_assignments.md, Overlap Maps.

Tests: equal-rank identity (single + multi-item), fewer/more remote
ranks, straddling the peer split, run coalescing, variable per-scenario
item counts, and a tiling invariant. Wired into run_coverage.bash and
test_pr_and_main.yml.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/test_pr_and_main.yml |   3 +-
 mpisppy/cylinders/overlap_map.py       |  93 ++++++++++++++++++++
 mpisppy/tests/test_overlap_map.py      | 115 +++++++++++++++++++++++++
 run_coverage.bash                      |   3 +
 4 files changed, 213 insertions(+), 1 deletion(-)
 create mode 100644 mpisppy/cylinders/overlap_map.py
 create mode 100644 mpisppy/tests/test_overlap_map.py

diff --git a/.github/workflows/test_pr_and_main.yml b/.github/workflows/test_pr_and_main.yml
index 9b292ffd3..5a25c600d 100644
--- a/.github/workflows/test_pr_and_main.yml
+++ b/.github/workflows/test_pr_and_main.yml
@@ -1128,7 +1128,8 @@ jobs:
             mpisppy/tests/test_rho_deprecations.py \
             mpisppy/tests/test_buffer_inspect.py \
             mpisppy/tests/test_comm_lor_check.py \
-            mpisppy/tests/test_rank_apportionment.py
+            mpisppy/tests/test_rank_apportionment.py \
+            mpisppy/tests/test_overlap_map.py
 
       - name: Upload coverage data
         if: always()
diff --git a/mpisppy/cylinders/overlap_map.py b/mpisppy/cylinders/overlap_map.py
new file mode 100644
index 000000000..794d4179e
--- /dev/null
+++ b/mpisppy/cylinders/overlap_map.py
@@ -0,0 +1,93 @@
+###############################################################################
+# mpi-sppy: MPI-based Stochastic Programming in PYthon
+#
+# Copyright (c) 2024, Lawrence Livermore National Security, LLC, Alliance for
+# Sustainable Energy, LLC, The Regents of the University of California, et al.
+# All rights reserved. Please see the files COPYRIGHT.md and LICENSE.md for
+# full copyright and license information.
+###############################################################################
+"""Overlap maps for reading a local-sized field across cylinders that have
+different MPI rank counts.
+
+When two cylinders hold the same scenarios but split them over a different
+number of ranks, a reader rank's local buffer for a per-scenario field is
+backed by one or more *segments* living in remote ranks' buffers.  An
+``OverlapSegment`` records, for one such segment, which remote rank holds
+it and the matching offsets/length (in field *items*, e.g. nonants) in the
+remote and local buffers.
+
+This module is deliberately free of MPI and Pyomo: it consumes the
+contiguous per-rank scenario partition (the ``slices`` produced by
+``_ScenTree.scen_names_to_ranks``) plus a per-scenario item count, so it
+can be unit tested directly.  See
+``doc/designs/flexible_rank_assignments.md`` (the "Overlap Maps" section).
+"""
+
+from dataclasses import dataclass
+
+
+@dataclass
+class OverlapSegment:
+    """One contiguous run of a local field buffer sourced from a single
+    remote rank's buffer.  All offsets/lengths are in field items."""
+    remote_rank: int     # rank in the peer cylinder to read from
+    remote_offset: int   # item offset of this run within the remote buffer
+    local_offset: int    # item offset of this run within the local buffer
+    count: int           # number of items in this run
+
+
+def compute_overlap_segments(local_scen_idxs, remote_slices, items_per_scen):
+    """Segments mapping a local rank's field buffer onto remote buffers.
+
+    Args:
+        local_scen_idxs: ordered global scenario indices held by this
+            (local) rank -- i.e. ``local_slices[local_rank]``.
+        remote_slices: the peer cylinder's partition, ``rank -> ordered
+            list of global scenario indices`` (the ``slices`` output of
+            ``scen_names_to_ranks`` for that cylinder).  Determines both
+            which remote rank holds each scenario and the scenario's
+            offset within that rank's buffer.
+        items_per_scen: sequence indexed by global scenario index giving
+            the number of field items each scenario contributes (e.g.
+            ``len(nonant_indices)``).  Uniform two-stage problems pass a
+            constant for every scenario; multistage problems may vary.
+
+    Returns:
+        list[OverlapSegment]: segments covering the local buffer in order,
+        with contiguous same-rank runs coalesced.  When both cylinders
+        have the same rank count this degenerates to a single identity
+        segment ``(remote_rank=local_rank, remote_offset=0,
+        local_offset=0, count=<all items>)``.
+    """
+    # Invert remote_slices: global scenario index -> (remote rank, item
+    # offset of that scenario within the remote rank's buffer).
+    remote_rank_of_scen = {}
+    remote_offset_of_scen = {}
+    for rank, scen_idxs in enumerate(remote_slices):
+        offset = 0
+        for scen in scen_idxs:
+            remote_rank_of_scen[scen] = rank
+            remote_offset_of_scen[scen] = offset
+            offset += items_per_scen[scen]
+
+    segments = []
+    cur = None
+    local_offset = 0
+    for scen in local_scen_idxs:
+        rank = remote_rank_of_scen[scen]
+        r_off = remote_offset_of_scen[scen]
+        n = items_per_scen[scen]
+        # Coalesce only while we stay on the same remote rank AND the
+        # remote items remain contiguous with the run so far.
+        if cur is not None and cur.remote_rank == rank \
+                and cur.remote_offset + cur.count == r_off:
+            cur.count += n
+        else:
+            if cur is not None:
+                segments.append(cur)
+            cur = OverlapSegment(remote_rank=rank, remote_offset=r_off,
+                                 local_offset=local_offset, count=n)
+        local_offset += n
+    if cur is not None:
+        segments.append(cur)
+    return segments
diff --git a/mpisppy/tests/test_overlap_map.py b/mpisppy/tests/test_overlap_map.py
new file mode 100644
index 000000000..ed6a718f8
--- /dev/null
+++ b/mpisppy/tests/test_overlap_map.py
@@ -0,0 +1,115 @@
+###############################################################################
+# mpi-sppy: MPI-based Stochastic Programming in PYthon
+#
+# Copyright (c) 2024, Lawrence Livermore National Security, LLC, Alliance for
+# Sustainable Energy, LLC, The Regents of the University of California, et al.
+# All rights reserved. Please see the files COPYRIGHT.md and LICENSE.md for
+# full copyright and license information.
+###############################################################################
+"""Unit tests for mpisppy.cylinders.overlap_map.compute_overlap_segments."""
+
+import unittest
+
+from mpisppy.cylinders.overlap_map import OverlapSegment, compute_overlap_segments
+
+
+def _seg(remote_rank, remote_offset, local_offset, count):
+    return OverlapSegment(remote_rank, remote_offset, local_offset, count)
+
+
+class TestOverlapMap(unittest.TestCase):
+
+    def _assert_tiles_local(self, segments, local_scen_idxs, items_per_scen):
+        # Segments must tile the local buffer contiguously from offset 0,
+        # with total length equal to the local rank's item count.
+        expected_local = sum(items_per_scen[s] for s in local_scen_idxs)
+        running = 0
+        for seg in segments:
+            self.assertEqual(seg.local_offset, running)
+            running += seg.count
+        self.assertEqual(running, expected_local)
+
+    def test_equal_ranks_is_identity_single_segment(self):
+        # Same rank count on both cylinders -> one identity segment per
+        # local rank (the no-op-at-equal-ranks backbone).
+        remote_slices = [[0, 1], [2, 3], [4, 5], [6, 7]]
+        items = [1] * 8
+        for rank, local in enumerate(remote_slices):
+            segs = compute_overlap_segments(local, remote_slices, items)
+            self.assertEqual(segs, [_seg(rank, 0, 0, len(local))])
+
+    def test_equal_ranks_identity_multi_item(self):
+        # Identity holds with >1 item per scenario.
+        remote_slices = [[0, 1], [2, 3]]
+        items = [2] * 4
+        segs = compute_overlap_segments([2, 3], remote_slices, items)
+        self.assertEqual(segs, [_seg(1, 0, 0, 4)])
+
+    def test_fewer_remote_ranks_doc_example(self):
+        # 4-rank hub reading a 2-rank spoke, 10 scenarios, 1 item each.
+        # Spoke split is contiguous: rank0 = scen0-4, rank1 = scen5-9.
+        remote_slices = [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+        items = [1] * 10
+
+        # Hub rank holding scen0,1: wholly inside spoke rank 0.
+        self.assertEqual(
+            compute_overlap_segments([0, 1], remote_slices, items),
+            [_seg(0, 0, 0, 2)],
+        )
+        # Hub rank holding scen4,5: straddles the spoke split -> two
+        # segments (scen4 from spoke rank0 @offset4, scen5 from spoke
+        # rank1 @offset0).
+        self.assertEqual(
+            compute_overlap_segments([4, 5], remote_slices, items),
+            [_seg(0, 4, 0, 1), _seg(1, 0, 1, 1)],
+        )
+        # Hub rank holding scen7,8,9: wholly inside spoke rank1, coalesced.
+        self.assertEqual(
+            compute_overlap_segments([7, 8, 9], remote_slices, items),
+            [_seg(1, 2, 0, 3)],
+        )
+
+    def test_more_remote_ranks(self):
+        # 2-rank local cylinder reading a 4-rank remote cylinder.
+        remote_slices = [[0, 1], [2, 3, 4], [5, 6], [7, 8, 9]]
+        items = [1] * 10
+        segs = compute_overlap_segments([0, 1, 2, 3, 4], remote_slices, items)
+        self.assertEqual(segs, [_seg(0, 0, 0, 2), _seg(1, 0, 2, 3)])
+        self._assert_tiles_local(segs, [0, 1, 2, 3, 4], items)
+
+    def test_coalesces_contiguous_run_within_remote_rank(self):
+        remote_slices = [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+        items = [1] * 10
+        segs = compute_overlap_segments([2, 3, 4], remote_slices, items)
+        self.assertEqual(segs, [_seg(0, 2, 0, 3)])
+
+    def test_variable_items_per_scenario(self):
+        # Multistage-style: scenarios contribute different item counts.
+        items = [1, 2, 3, 1, 2]
+        remote_slices = [[0, 1, 2], [3, 4]]
+        # Local rank holds scen1,2 (both in remote rank0). scen1 starts at
+        # remote offset items[0]=1; coalesced count = 2+3 = 5.
+        segs = compute_overlap_segments([1, 2], remote_slices, items)
+        self.assertEqual(segs, [_seg(0, 1, 0, 5)])
+        self._assert_tiles_local(segs, [1, 2], items)
+
+    def test_variable_items_across_remote_ranks(self):
+        items = [1, 2, 3, 1, 2]
+        remote_slices = [[0, 1], [2, 3, 4]]
+        # Local holds scen1 (remote rank0 @offset1, count2) then scen2
+        # (remote rank1 @offset0, count3).
+        segs = compute_overlap_segments([1, 2], remote_slices, items)
+        self.assertEqual(segs, [_seg(0, 1, 0, 2), _seg(1, 0, 2, 3)])
+        self._assert_tiles_local(segs, [1, 2], items)
+
+    def test_single_scenario(self):
+        remote_slices = [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+        items = [1] * 10
+        self.assertEqual(
+            compute_overlap_segments([6], remote_slices, items),
+            [_seg(1, 1, 0, 1)],
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/run_coverage.bash b/run_coverage.bash
index 996a6c295..5fba670f4 100755
--- a/run_coverage.bash
+++ b/run_coverage.bash
@@ -124,6 +124,9 @@ run_phase "test_rho_deprecations (serial)" \
 run_phase "test_rank_apportionment (serial)" \
     coverage run --rcfile=.coveragerc -m pytest mpisppy/tests/test_rank_apportionment.py -v
 
+run_phase "test_overlap_map (serial)" \
+    coverage run --rcfile=.coveragerc -m pytest mpisppy/tests/test_overlap_map.py -v
+
 run_phase "test_xhat_from_file (serial)" \
     coverage run --rcfile=.coveragerc -m pytest mpisppy/tests/test_xhat_from_file.py -v
 

From bf01a038e5fa22c49d9f2ac4f545dad695a0364a Mon Sep 17 00:00:00 2001
From: Dave Woodruff <david.l.woodruff@gmail.com>
Date: Sat, 23 May 2026 19:46:15 -0700
Subject: [PATCH 3/9] doc(flex-ranks): fix the overlap-map worked example

Two errors in the 4-hub/2-spoke example, both verified against
compute_overlap_segments:
- Hub rank 2 (scen4,scen5) straddles the spoke's scen5 split, so it
  reads two segments (scen4 from spoke rank 0, scen5 from spoke rank 1),
  not one segment of count 2 from spoke rank 0.
- Hub rank 3 (scen6-9) sits at offsets 1-4 within spoke rank 1's buffer
  (which begins at scen5), so remote_offset is 1, not 0.

Also notes the split is illustrative (chosen to show a straddle) and
states the one-nonant-per-scenario assumption.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 doc/designs/flexible_rank_assignments.md | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/doc/designs/flexible_rank_assignments.md b/doc/designs/flexible_rank_assignments.md
index 54b91feca..ff38a29d0 100644
--- a/doc/designs/flexible_rank_assignments.md
+++ b/doc/designs/flexible_rank_assignments.md
@@ -243,7 +243,9 @@ count to get each cylinder's distribution, then compute pairwise
 overlaps.
 
 For example, with a 4-rank hub and a 2-rank spoke, both handling 10
-scenarios:
+scenarios (the per-rank split below is illustrative — the real split is
+whatever `scen_names_to_ranks` produces — and is chosen so that a hub
+rank straddles the spoke's boundary):
 
 ========  ==================  ==================
           Hub (4 ranks)       Spoke (2 ranks)
@@ -254,23 +256,28 @@ Rank 2    scen4, scen5
 Rank 3    scen6--scen9
 ========  ==================  ==================
 
-Hub rank 0 needs to read from spoke rank 0 (which has scen0--scen4),
-extracting only the portion for scen0--scen1.  Hub rank 2 also reads
-from spoke rank 0, extracting scen4--scen5.
+Hub rank 0 reads from spoke rank 0 (which holds scen0--scen4),
+extracting only the portion for scen0--scen1.  Hub rank 2 holds
+scen4--scen5, which *straddles* the spoke's split: scen4 lives in spoke
+rank 0 (at offset 4) and scen5 in spoke rank 1 (at offset 0), so hub
+rank 2 reads one segment from each.  Hub rank 3's scen6--scen9 sit at
+offsets 1--4 within spoke rank 1's buffer (which starts at scen5).
 
-The overlap map for hub rank 0 reading from the spoke would be:
+Taking one nonant per scenario for simplicity, the overlap map (in
+nonant units) for hub rank 0 reading from the spoke would be:
 ```
 [(spoke_rank=0, remote_offset=0, local_offset=0, count=2)]
 ```
 
-For hub rank 2:
+For hub rank 2 (the straddle — two segments):
 ```
-[(spoke_rank=0, remote_offset=4, local_offset=0, count=2)]
+[(spoke_rank=0, remote_offset=4, local_offset=0, count=1),
+ (spoke_rank=1, remote_offset=0, local_offset=1, count=1)]
 ```
 
 For hub rank 3:
 ```
-[(spoke_rank=1, remote_offset=0, local_offset=0, count=4)]
+[(spoke_rank=1, remote_offset=1, local_offset=0, count=4)]
 ```
 
 These maps are computed once at startup and reused for every

From 203e39ff786bfbceb8d696ac6a578b95ba96c15e Mon Sep 17 00:00:00 2001
From: Dave Woodruff <david.l.woodruff@gmail.com>
Date: Sat, 23 May 2026 19:50:00 -0700
Subject: [PATCH 4/9] flex-ranks phase 1: partial-read support in
 SPWindow.get()

Add optional item_offset / item_count to SPWindow.get() so a reader can
fetch a sub-range of a field via MPI_Get displacement. Defaults
(item_count=None) reproduce the original whole-padded-field transfer
exactly, so existing callers are unaffected. Partial reads are the
primitive multi-source assembly will use under unequal rank counts
(see overlap_map.py).

Tests exercise a real single-process RMA window (MPI.COMM_SELF):
full read unchanged, prefix/middle/suffix/single-item partial reads,
full-via-count, and out-of-range / dest-size-mismatch assertions. Wired
into run_coverage.bash and test_pr_and_main.yml.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/test_pr_and_main.yml     |  3 +-
 mpisppy/cylinders/spwindow.py              | 26 ++++++-
 mpisppy/tests/test_spwindow_partial_get.py | 82 ++++++++++++++++++++++
 run_coverage.bash                          |  3 +
 4 files changed, 110 insertions(+), 4 deletions(-)
 create mode 100644 mpisppy/tests/test_spwindow_partial_get.py

diff --git a/.github/workflows/test_pr_and_main.yml b/.github/workflows/test_pr_and_main.yml
index 5a25c600d..575523d71 100644
--- a/.github/workflows/test_pr_and_main.yml
+++ b/.github/workflows/test_pr_and_main.yml
@@ -1129,7 +1129,8 @@ jobs:
             mpisppy/tests/test_buffer_inspect.py \
             mpisppy/tests/test_comm_lor_check.py \
             mpisppy/tests/test_rank_apportionment.py \
-            mpisppy/tests/test_overlap_map.py
+            mpisppy/tests/test_overlap_map.py \
+            mpisppy/tests/test_spwindow_partial_get.py
 
       - name: Upload coverage data
         if: always()
diff --git a/mpisppy/cylinders/spwindow.py b/mpisppy/cylinders/spwindow.py
index 547bdcf6a..003694262 100644
--- a/mpisppy/cylinders/spwindow.py
+++ b/mpisppy/cylinders/spwindow.py
@@ -178,18 +178,38 @@ def free(self):
         return
 
     #### Functions ####
-    def get(self, dest: nptyping.ArrayLike, strata_rank: int, field: Field):
+    def get(self, dest: nptyping.ArrayLike, strata_rank: int, field: Field,
+            item_offset: int = 0, item_count: int = None):
+        """Read a remote rank's buffer for ``field`` into ``dest``.
+
+        By default the whole padded field is transferred (``dest`` must be
+        ``padded_len`` long), preserving the original behavior.  When
+        ``item_count`` is given, only that many doubles are read, starting
+        ``item_offset`` doubles into the field -- a partial read used for
+        multi-source assembly across cylinders with different rank counts
+        (see ``overlap_map.py``).  ``dest`` must then be ``item_count`` long.
+        """
         assert (0 <= strata_rank < len(self.strata_buffer_layouts))
 
         that_layout = self.strata_buffer_layouts[strata_rank]
         assert field in that_layout
 
         (offset, logical_len, padded_len) = that_layout[field]
-        assert np.size(dest) == padded_len
+
+        if item_count is None:
+            count = padded_len
+            disp = offset
+        else:
+            assert item_offset >= 0 and item_count >= 0
+            assert item_offset + item_count <= padded_len, \
+                f"{field=} partial get {item_offset=}+{item_count=} exceeds {padded_len=}"
+            count = item_count
+            disp = offset + item_offset
+        assert np.size(dest) == count
 
         window = self.window
         window.Lock(strata_rank, MPI.LOCK_SHARED)
-        window.Get((dest, padded_len, MPI.DOUBLE), strata_rank, offset)
+        window.Get((dest, count, MPI.DOUBLE), strata_rank, disp)
         window.Unlock(strata_rank)
         return
 
diff --git a/mpisppy/tests/test_spwindow_partial_get.py b/mpisppy/tests/test_spwindow_partial_get.py
new file mode 100644
index 000000000..05487aaea
--- /dev/null
+++ b/mpisppy/tests/test_spwindow_partial_get.py
@@ -0,0 +1,82 @@
+###############################################################################
+# mpi-sppy: MPI-based Stochastic Programming in PYthon
+#
+# Copyright (c) 2024, Lawrence Livermore National Security, LLC, Alliance for
+# Sustainable Energy, LLC, The Regents of the University of California, et al.
+# All rights reserved. Please see the files COPYRIGHT.md and LICENSE.md for
+# full copyright and license information.
+###############################################################################
+"""Partial-read support in SPWindow.get(), exercised on a single-process
+RMA window (MPI.COMM_SELF). Verifies the default whole-field read is
+unchanged and that item_offset/item_count read the right sub-range."""
+
+import unittest
+
+import numpy as np
+
+from mpisppy import MPI
+from mpisppy.cylinders.spwindow import Field, SPWindow, padded_len_n_doubles
+
+
+class TestPartialGet(unittest.TestCase):
+
+    def setUp(self):
+        self.logical = 10
+        self.padded = padded_len_n_doubles(self.logical)
+        my_fields = {Field.NONANTS_VALS: (self.logical, self.padded)}
+        self.win = SPWindow(my_fields, MPI.COMM_SELF)
+        # Known, distinct values across the whole padded field.
+        self.data = np.arange(100, 100 + self.padded, dtype="d")
+        self.win.put(self.data, Field.NONANTS_VALS)
+
+    def tearDown(self):
+        self.win.free()
+
+    def test_full_read_unchanged(self):
+        # Default args: whole padded field, as before.
+        dest = np.empty(self.padded, dtype="d")
+        self.win.get(dest, 0, Field.NONANTS_VALS)
+        np.testing.assert_array_equal(dest, self.data)
+
+    def test_partial_prefix(self):
+        dest = np.empty(3, dtype="d")
+        self.win.get(dest, 0, Field.NONANTS_VALS, item_offset=0, item_count=3)
+        np.testing.assert_array_equal(dest, self.data[0:3])
+
+    def test_partial_middle(self):
+        dest = np.empty(4, dtype="d")
+        self.win.get(dest, 0, Field.NONANTS_VALS, item_offset=2, item_count=4)
+        np.testing.assert_array_equal(dest, self.data[2:6])
+
+    def test_partial_suffix(self):
+        dest = np.empty(5, dtype="d")
+        self.win.get(dest, 0, Field.NONANTS_VALS,
+                     item_offset=self.padded - 5, item_count=5)
+        np.testing.assert_array_equal(dest, self.data[self.padded - 5:])
+
+    def test_partial_full_via_count(self):
+        # item_count equal to padded length reproduces the full read.
+        dest = np.empty(self.padded, dtype="d")
+        self.win.get(dest, 0, Field.NONANTS_VALS,
+                     item_offset=0, item_count=self.padded)
+        np.testing.assert_array_equal(dest, self.data)
+
+    def test_single_item(self):
+        dest = np.empty(1, dtype="d")
+        self.win.get(dest, 0, Field.NONANTS_VALS, item_offset=7, item_count=1)
+        self.assertEqual(dest[0], self.data[7])
+
+    def test_out_of_range_raises(self):
+        dest = np.empty(2, dtype="d")
+        with self.assertRaises(AssertionError):
+            self.win.get(dest, 0, Field.NONANTS_VALS,
+                         item_offset=self.padded - 1, item_count=2)
+
+    def test_dest_size_mismatch_raises(self):
+        dest = np.empty(5, dtype="d")  # wrong size for count=3
+        with self.assertRaises(AssertionError):
+            self.win.get(dest, 0, Field.NONANTS_VALS, item_offset=0, item_count=3)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/run_coverage.bash b/run_coverage.bash
index 5fba670f4..49a1a7d38 100755
--- a/run_coverage.bash
+++ b/run_coverage.bash
@@ -127,6 +127,9 @@ run_phase "test_rank_apportionment (serial)" \
 run_phase "test_overlap_map (serial)" \
     coverage run --rcfile=.coveragerc -m pytest mpisppy/tests/test_overlap_map.py -v
 
+run_phase "test_spwindow_partial_get (serial)" \
+    coverage run --rcfile=.coveragerc -m pytest mpisppy/tests/test_spwindow_partial_get.py -v
+
 run_phase "test_xhat_from_file (serial)" \
     coverage run --rcfile=.coveragerc -m pytest mpisppy/tests/test_xhat_from_file.py -v
 

From 109e5710aca5a80de194629a2c0f714678770020 Mon Sep 17 00:00:00 2001
From: Dave Woodruff <david.l.woodruff@gmail.com>
Date: Sat, 23 May 2026 19:54:56 -0700
Subject: [PATCH 5/9] flex-ranks phase 1: read rank_ratio dict keys; gate the
 unequal path

WheelSpinner.run() now reads a per-cylinder "rank_ratio" key from each
hub/spoke dict (default 1.0). When all ratios are 1.0 the path is exactly
as before. When any differs, it apportions the rank pool
(largest-remainder, floor of one), reports the allocation via global_toc
on rank 0, then raises NotImplementedError -- building communicators for
unequal per-cylinder counts is not yet wired into the comm/window layer.

This lands the apportionment wiring and the dict-key plumbing while
keeping the uniform (default) behavior unchanged; verified by running
test_with_cylinders (PH hub + xhatshuffle spoke) end to end.

Tests use a stub comm to drive the gate without MPI: a non-uniform ratio
raises NotImplementedError; an infeasible floor-of-one raises ValueError
first; explicit all-1.0 ratios skip the gate. Wired into
run_coverage.bash and test_pr_and_main.yml.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/test_pr_and_main.yml   |  3 +-
 mpisppy/spin_the_wheel.py                | 22 +++++++
 mpisppy/tests/test_flexible_rank_gate.py | 75 ++++++++++++++++++++++++
 run_coverage.bash                        |  3 +
 4 files changed, 102 insertions(+), 1 deletion(-)
 create mode 100644 mpisppy/tests/test_flexible_rank_gate.py

diff --git a/.github/workflows/test_pr_and_main.yml b/.github/workflows/test_pr_and_main.yml
index 575523d71..904ed4b71 100644
--- a/.github/workflows/test_pr_and_main.yml
+++ b/.github/workflows/test_pr_and_main.yml
@@ -1130,7 +1130,8 @@ jobs:
             mpisppy/tests/test_comm_lor_check.py \
             mpisppy/tests/test_rank_apportionment.py \
             mpisppy/tests/test_overlap_map.py \
-            mpisppy/tests/test_spwindow_partial_get.py
+            mpisppy/tests/test_spwindow_partial_get.py \
+            mpisppy/tests/test_flexible_rank_gate.py
 
       - name: Upload coverage data
         if: always()
diff --git a/mpisppy/spin_the_wheel.py b/mpisppy/spin_the_wheel.py
index 2b3d4b8fa..1f41f32a0 100644
--- a/mpisppy/spin_the_wheel.py
+++ b/mpisppy/spin_the_wheel.py
@@ -12,6 +12,7 @@
 
 from mpisppy.utils import nice_join
 from mpisppy.utils.sputils import first_stage_nonant_writer, scenario_tree_solution_writer
+from mpisppy.utils.rank_apportionment import apportion_ranks
 
 class WheelSpinner:
 
@@ -103,6 +104,27 @@ def run(self, comm_world=None):
 
         # Create the necessary communicators
         fullcomm = comm_world
+
+        # Flexible rank assignments: each cylinder may request a rank ratio
+        # relative to the hub (default 1.0) via a "rank_ratio" dict key. The
+        # uniform case (all ratios 1.0) is handled exactly as before. A
+        # non-uniform request is apportioned (largest-remainder, floor of one)
+        # and reported, but building the communicators for unequal per-cylinder
+        # counts is not yet wired into the communicator/window layer.
+        rank_ratios = [d.get("rank_ratio", 1.0) for d in communicator_list]
+        if any(r != 1.0 for r in rank_ratios):
+            rank_counts = apportion_ranks(rank_ratios, fullcomm.Get_size())
+            global_toc(
+                f"Requested per-cylinder rank ratios {rank_ratios} -> "
+                f"rank counts {rank_counts}",
+                fullcomm.Get_rank() == 0,
+            )
+            raise NotImplementedError(
+                "Per-cylinder rank counts (flexible rank assignments) are not "
+                f"yet wired into the communicator layer; computed allocation "
+                f"{rank_counts}. Run with all rank_ratio == 1.0 for now."
+            )
+
         strata_comm, cylinder_comm = _make_comms(n_spcomms, fullcomm=fullcomm)
         strata_rank = strata_comm.Get_rank()
         cylinder_rank = cylinder_comm.Get_rank()
diff --git a/mpisppy/tests/test_flexible_rank_gate.py b/mpisppy/tests/test_flexible_rank_gate.py
new file mode 100644
index 000000000..a94fc44c2
--- /dev/null
+++ b/mpisppy/tests/test_flexible_rank_gate.py
@@ -0,0 +1,75 @@
+###############################################################################
+# mpi-sppy: MPI-based Stochastic Programming in PYthon
+#
+# Copyright (c) 2024, Lawrence Livermore National Security, LLC, Alliance for
+# Sustainable Energy, LLC, The Regents of the University of California, et al.
+# All rights reserved. Please see the files COPYRIGHT.md and LICENSE.md for
+# full copyright and license information.
+###############################################################################
+"""WheelSpinner reads per-cylinder rank_ratio dict keys and, when any is
+non-default, apportions and reports the allocation then raises (the live
+unequal-rank communicator path is not yet wired in). The uniform default
+path is exercised by the cylinder test suite, not here.
+
+The gate fires before any communicator is built, so a stub comm with just
+Get_size/Get_rank is enough to drive it without MPI."""
+
+import unittest
+
+from mpisppy.spin_the_wheel import WheelSpinner
+
+
+class _StubComm:
+    """Minimal comm: the gate only needs size and rank."""
+    def __init__(self, size, rank=0):
+        self._size = size
+        self._rank = rank
+
+    def Get_size(self):
+        return self._size
+
+    def Get_rank(self):
+        return self._rank
+
+
+def _min_dict(role, **extra):
+    # Dicts only need the keys WheelSpinner validates before the gate;
+    # the classes are never instantiated on this path.
+    d = {f"{role}_class": object, "opt_class": object}
+    d.update(extra)
+    return d
+
+
+class TestFlexibleRankGate(unittest.TestCase):
+
+    def test_nonuniform_ratio_raises_not_implemented(self):
+        hub = _min_dict("hub", rank_ratio=1.0)
+        spoke = _min_dict("spoke", rank_ratio=0.5)
+        ws = WheelSpinner(hub, [spoke])
+        with self.assertRaises(NotImplementedError):
+            ws.run(comm_world=_StubComm(size=4))
+
+    def test_nonuniform_with_too_few_ranks_raises_value_error(self):
+        # When even the floor-of-one is infeasible, apportion_ranks raises
+        # ValueError first (before the NotImplementedError gate).
+        hub = _min_dict("hub", rank_ratio=1.0)
+        spoke_a = _min_dict("spoke", rank_ratio=0.5)
+        spoke_b = _min_dict("spoke", rank_ratio=0.5)
+        ws = WheelSpinner(hub, [spoke_a, spoke_b])
+        with self.assertRaises(ValueError):
+            ws.run(comm_world=_StubComm(size=2))
+
+    def test_explicit_uniform_ratios_pass_the_gate(self):
+        # All ratios 1.0 -> gate is skipped; the next step (_make_comms) is
+        # reached and fails on the stub (no Split). Reaching that point at
+        # all proves the NotImplementedError gate did not fire.
+        hub = _min_dict("hub", rank_ratio=1.0)
+        spoke = _min_dict("spoke", rank_ratio=1.0)
+        ws = WheelSpinner(hub, [spoke])
+        with self.assertRaises(Exception) as ctx:
+            ws.run(comm_world=_StubComm(size=2))
+        self.assertNotIsInstance(ctx.exception, NotImplementedError)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/run_coverage.bash b/run_coverage.bash
index 49a1a7d38..7d96639a6 100755
--- a/run_coverage.bash
+++ b/run_coverage.bash
@@ -130,6 +130,9 @@ run_phase "test_overlap_map (serial)" \
 run_phase "test_spwindow_partial_get (serial)" \
     coverage run --rcfile=.coveragerc -m pytest mpisppy/tests/test_spwindow_partial_get.py -v
 
+run_phase "test_flexible_rank_gate (serial)" \
+    coverage run --rcfile=.coveragerc -m pytest mpisppy/tests/test_flexible_rank_gate.py -v
+
 run_phase "test_xhat_from_file (serial)" \
     coverage run --rcfile=.coveragerc -m pytest mpisppy/tests/test_xhat_from_file.py -v
 

From bb2e63eaa0a0cea145cd03bceff5e58d8e7c864d Mon Sep 17 00:00:00 2001
From: Dave Woodruff <david.l.woodruff@gmail.com>
Date: Sun, 24 May 2026 06:49:49 -0700
Subject: [PATCH 6/9] doc(flex-ranks): add a development and rollout strategy
 section

Records the procedure decisions for landing this multi-phase, low-level
MPI/RMA work safely:

- Sequence phases by blast radius: inert, gated phases are safe to land
  incrementally; the live communicator rewrite is where risk concentrates.
- Chosen workflow: stacked PRs (each phase branch targets the one below,
  merged bottom-up) -- keeps work moving despite slow reviews while every
  change stays small, reviewable, and bisectable, and main stays
  shippable; preferred over a long-lived branch / fork big-bang merge.
- Live-path rewrite: branch by abstraction with a runtime fallback (new
  fullcomm/Option-D path alongside strata_comm, old path default until
  proven).
- Gate the default flip on an MPI CI matrix (multiple MPI implementations
  and mpi4py versions).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 doc/designs/flexible_rank_assignments.md | 54 ++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/doc/designs/flexible_rank_assignments.md b/doc/designs/flexible_rank_assignments.md
index ff38a29d0..f45d2240a 100644
--- a/doc/designs/flexible_rank_assignments.md
+++ b/doc/designs/flexible_rank_assignments.md
@@ -616,6 +616,60 @@ the first pass bundled into "Phase 0" has already landed separately.
   they should compose with async senders; verify and add tests.
 
 
+### Development and rollout strategy
+
+These changes touch a low-level, implementation-sensitive corner of the
+stack -- MPI one-sided (RMA) communication through mpi4py -- where
+behavior varies across MPI implementations (OpenMPI, MPICH, vendor
+builds) and versions, and where passive-target RMA (`Lock` / `Get` /
+`Put`) is the flakiest part of the spec.  Reviews of this material are
+necessarily slow, and a regression in the communicator layer would
+affect *every* run, not just flexible-rank runs.  The phases above are
+therefore sequenced by *blast radius*:
+
+- **Inert phases** (the infrastructure phase) add only gated, unreachable
+  code: at equal rank ratios the existing code path is unchanged.  These
+  can land on `main` incrementally and safely -- each is small,
+  reviewable on its own, and cannot change behavior for current users.
+- **Live-path phases** (the communicator rework that replaces
+  `strata_comm`) rewrite code that every run exercises, equal-rank or
+  not.  This cannot hide behind the rank-ratio gate, and is where the
+  real risk concentrates.
+
+**Chosen workflow: stacked PRs.**  Because reviews of this material are
+slow, phases are developed as a *stack* of pull requests rather than
+waiting for each to merge before the next begins.  Each phase lives on
+its own short-lived branch whose PR targets the branch *below* it (phase
+N+1 onto phase N, phase 1 onto `main`); the stack is reviewed and merged
+bottom-up, each PR being retargeted to `main` as the one beneath it
+lands.  This keeps work moving while earlier PRs sit in review, yet --
+unlike a long-lived feature branch or a development fork -- every change
+stays small, individually reviewable, and revertible/bisectable, and
+`main` stays continuously shippable.  (A big-bang integration merge of an
+entire multi-phase feature is the opposite: a large, hard-to-review,
+hard-to-bisect surface, which is *more* dangerous for code whose failures
+are subtle.  If isolation is ever genuinely wanted, prefer a branch in
+the upstream repository over a separate fork -- same isolation, far less
+CI and merge friction.)
+
+**Live-path rework: branch by abstraction.**  Stacked PRs govern *how*
+phases are reviewed and merged; the communicator rework that replaces
+`strata_comm` additionally needs care in *how the code is structured*,
+because it changes the path every run exercises.  Implement the new
+fullcomm / Option-D path *alongside* the existing `strata_comm` path,
+selected by a flag, defaulting to the **old** path.  Land it with the old
+path still default, so current users are unaffected; exercise the new
+path opt-in; make it the default only once it is proven; remove the old
+path afterward.  This yields a **runtime fallback** a feature branch or
+fork does not: if a cluster's MPI has buggy RMA, users can switch back to
+the proven path.  For an unstable dependency we do not control, a toggle
+available in production is worth more than isolation during development.
+
+**Gate the default flip on an MPI CI matrix.**  Exercise the new path on
+at least two MPI implementations (e.g. OpenMPI and MPICH) and more than
+one mpi4py / MPI version before it becomes the default.
+
+
 ### Possible future optimization (out of scope)
 
 The first-stage portion of `BEST_XHAT` / `RECENT_XHATS` is genuinely

From b620f3364309f22d549abca4a130a2ccb6e1f4fc Mon Sep 17 00:00:00 2001
From: Dave Woodruff <david.l.woodruff@gmail.com>
Date: Sun, 24 May 2026 07:23:08 -0700
Subject: [PATCH 7/9] doc(flex-ranks): adopt gated-additive rollout, not a
 live-path rework

Rewrite the rollout strategy: flexible ranks are added as a
gated-additive feature keyed on the rank-ratio config, rather than
replacing strata_comm behind a runtime flag. At equal ratios the
existing per-strata_comm windows and single-source reader run
unchanged; the fullcomm + overlap-map path is reached only when a ratio
differs from 1.0. Disabling the feature is the production fallback, so
no second implementation of the equal-rank path is needed.

Reconcile the sections that assumed Option D replaced strata_comm
globally (Window Topology recommendation, Impact bullets for
spin_the_wheel/spwindow/spcommunicator, Phase 2, Backward Compatibility)
and add a key-decision pointer at the top.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 doc/designs/flexible_rank_assignments.md | 202 +++++++++++++++--------
 1 file changed, 135 insertions(+), 67 deletions(-)

diff --git a/doc/designs/flexible_rank_assignments.md b/doc/designs/flexible_rank_assignments.md
index f45d2240a..8d9a8deac 100644
--- a/doc/designs/flexible_rank_assignments.md
+++ b/doc/designs/flexible_rank_assignments.md
@@ -3,6 +3,14 @@
 Status: Design Document (Draft — second pass)
 Date: 2026-05-24
 
+**Key decision up front.**  Flexible ranks are added as a
+*gated-additive* feature, not a rework of the path every run exercises.
+At equal rank ratios (today's only case) the existing `strata_comm`
+windows and single-source reader run unchanged; the new `fullcomm` +
+overlap-map machinery is reached only when a rank ratio differs from
+1.0.  The rationale, trade-offs, and rollout workflow are in
+§Development and rollout strategy.
+
 ### Terminology
 
 - **Window:** An MPI one-sided ("RMA") addressing scope, created
@@ -301,7 +309,10 @@ multi-stage problems (though for two-stage problems they are uniform).
 
 #### Window Topology Options
 
-Spoiler alert: we are going to recommend option D, which is similar to A.
+Of the options below we settle on Option D, though only as the topology
+for the unequal-rank path; it is similar to Option A.  See §Development
+and rollout strategy for why it is added alongside the existing
+`strata_comm` rather than replacing it.
 
 **Option A: Single global window on `fullcomm`**
 
@@ -361,9 +372,16 @@ window creation.  Cons:
   those pairs) is an incremental fix that does not require abandoning
   Option D.
 
-**Recommendation:** Option D.  The window is on `fullcomm` (or a
-subset), and each rank knows which global ranks to read from via the
-overlap maps.
+**Recommendation:** Option D, *as the topology for the unequal-rank
+path only*.  When any rank ratio differs from 1.0, the window is on
+`fullcomm` (or a subset) and each rank knows which global ranks to read
+from via the overlap maps.  `strata_comm` is **not** removed: the
+equal-rank case keeps its existing per-`strata_comm` windows untouched,
+and the `fullcomm` topology is built additively, reached only when the
+feature is engaged (see §Development and rollout strategy).  "Abandon
+`strata_comm`" above describes Option D as a global replacement, which
+this design does *not* adopt; it borrows only Option D's addressing
+scheme for the new path.
 
 
 #### Multi-Source Read Assembly
@@ -503,9 +521,15 @@ unnecessary given the per-field analysis).
   algorithm (§User Interface) that respects per-cylinder rank counts.
 - `cylinder_comm` creation still uses `MPI_Comm_split` (all ranks in
   the same cylinder get the same color).
-- `strata_comm` is removed (Option D); the window moves to `fullcomm`.
-- Indexing that assumes a fixed `strata_rank` meaning must change;
-  each rank needs to know its cylinder index directly.
+- `strata_comm` is **retained** for the equal-rank path.  When ranks
+  are equal across cylinders, `strata_comm` and its per-strata windows
+  are used exactly as today.  When ratios differ, the window is built
+  on `fullcomm` instead (Option D's addressing), selected once at
+  startup; see §Development and rollout strategy.
+- In the unequal-rank path, indexing that assumes a fixed `strata_rank`
+  meaning does not apply; each rank addresses peers by global rank via
+  the overlap maps.  The equal-rank path's `strata_rank` indexing is
+  unchanged.
 
 #### `spbase.py`
 
@@ -514,25 +538,30 @@ unnecessary given the per-field analysis).
 
 #### `spwindow.py`
 
-- `SPWindow.get()` must support partial reads (offset + count).
-- The buffer layout exchange must use a communicator that spans all
-  cylinders (not `strata_comm`), or be computed locally from the
-  static rank-count/scenario-map data (preferred).
-- Buffer validation must account for asymmetric sizes: a receiver's
-  expected size may differ from the sender's, and that is fine as long
-  as each requested segment fits within the sender's buffer.
+- `SPWindow.get()` must support partial reads (offset + count) -- an
+  additive capability the equal-rank path does not exercise.
+- For the unequal-rank path, the buffer layout must be exchanged over a
+  communicator that spans all cylinders, or (preferred) computed
+  locally from the static rank-count/scenario-map data.  The
+  equal-rank path keeps its existing `strata_comm` allgather.
+- Buffer validation must account for asymmetric sizes on the
+  multi-source path: a receiver's expected size may differ from the
+  sender's, and that is fine as long as each requested segment fits
+  within the sender's buffer.
 - No field-length or layout changes: every field keeps its current
   per-scenario / global / scalar sizing.  (Canonicalization is *not*
   part of this design.)
 
 #### `spcommunicator.py`
 
-- `register_receive_fields()` builds overlap maps instead of assuming
-  1-to-1 rank correspondence.
-- `get_receive_buffer()` supports multi-source assembly, with the
-  per-field coherence policy applied.
+- `register_receive_fields()` builds overlap maps for the unequal-rank
+  path; the equal-rank path retains the 1-to-1 rank correspondence.
+- `get_receive_buffer()` gains a multi-source assembly path (with the
+  per-field coherence policy applied), selected when ratios differ; the
+  single-source path is unchanged at equal ranks.
 - `put_send_buffer()` is unchanged (each rank writes its own data).
-- `_validate_recv_field()` is relaxed to per-segment validation.
+- `_validate_recv_field()` gains per-segment validation for the
+  multi-source path; equal-rank validation is unchanged.
 - The `synchronize` parameter's `cylinder_comm.Barrier()` / `Allreduce`
   still work within a cylinder.
 
@@ -578,12 +607,17 @@ the first pass bundled into "Phase 0" has already landed separately.
   various rank-count combinations (including the equal-rank identity
   case).
 
-**Phase 2: Communication layer**
-
-- Replace the `strata_comm`-based layout exchange with a
-  `fullcomm`-based or locally-computed layout (Option D).
-- Implement multi-source `get_receive_buffer()` using overlap maps.
-- Relax `_validate_recv_field()` to per-segment validation.
+**Phase 2: Communication layer** (additive; reached only when a ratio
+differs from 1.0)
+
+- Add a `fullcomm`-based or locally-computed layout exchange for the
+  unequal-rank path (Option D's addressing), *alongside* the existing
+  `strata_comm`-based exchange, which the equal-rank path keeps using.
+- Implement multi-source `get_receive_buffer()` using overlap maps, as
+  a path taken only under non-default ratios; the single-source reader
+  is unchanged for the equal-rank case.
+- Add per-segment validation for the multi-source path; leave
+  `_validate_recv_field()` as-is on the equal-rank path.
 - Test the relaxed-coherence per-scenario fields first (`NONANTS_VALS`)
   with a simple case: hub with 4 ranks, Lagrangian spoke with 2 ranks.
 
@@ -624,23 +658,65 @@ behavior varies across MPI implementations (OpenMPI, MPICH, vendor
 builds) and versions, and where passive-target RMA (`Lock` / `Get` /
 `Put`) is the flakiest part of the spec.  Reviews of this material are
 necessarily slow, and a regression in the communicator layer would
-affect *every* run, not just flexible-rank runs.  The phases above are
-therefore sequenced by *blast radius*:
-
-- **Inert phases** (the infrastructure phase) add only gated, unreachable
-  code: at equal rank ratios the existing code path is unchanged.  These
-  can land on `main` incrementally and safely -- each is small,
-  reviewable on its own, and cannot change behavior for current users.
-- **Live-path phases** (the communicator rework that replaces
-  `strata_comm`) rewrite code that every run exercises, equal-rank or
-  not.  This cannot hide behind the rank-ratio gate, and is where the
-  real risk concentrates.
-
-**Chosen workflow: stacked PRs.**  Because reviews of this material are
-slow, phases are developed as a *stack* of pull requests rather than
-waiting for each to merge before the next begins.  Each phase lives on
-its own short-lived branch whose PR targets the branch *below* it (phase
-N+1 onto phase N, phase 1 onto `main`); the stack is reviewed and merged
+affect *every* run, not just flexible-rank runs.  The guiding principle
+is therefore to **keep the equal-rank path off the table entirely**: the
+common case that every current user exercises must execute the same code
+after this feature lands as before it.
+
+**Gated-additive, not a live-path rework.**  An earlier draft proposed
+replacing `strata_comm` with a single `fullcomm` window (Option D) for
+*all* runs, introduced via branch-by-abstraction behind a runtime flag
+that defaulted to the old path and was to be removed once the new path
+proved out.  We reject that here.  A replacement -- even one staged
+behind a flag -- rewrites the path every run hits, and the temporary
+"two implementations behind a flag, delete the old one later" state is
+exactly the kind of clutter that becomes permanent: removing the proven
+path is always scarier than keeping it, the two paths drift because tests
+rarely exercise both equally, and the toggle outlives its purpose.
+
+Instead, the new `fullcomm` + overlap-map machinery is **purely
+additive and gated by the feature itself**.  The rank-ratio
+configuration *is* the gate -- no separate runtime flag:
+
+- When all ratios are 1.0 (today's only possibility), the system builds
+  the existing per-`strata_comm` windows and uses the existing
+  single-source reader, unchanged.  Current users execute the same bytes
+  they execute now.
+- When any ratio differs from 1.0, `make_windows()` builds the
+  `fullcomm` window (Option D's topology) and the multi-source,
+  overlap-map reader is used.  A run uses exactly one topology
+  throughout, chosen once at startup; the two never coexist within a
+  single run.
+
+This makes *every* phase inert with respect to the equal-rank path --
+there is no live-path phase to sequence around.  Each phase adds gated
+code that is unreachable until a user opts in with a non-default ratio,
+so each can land on `main` incrementally and safely.
+
+**The production fallback comes for free.**  The branch-by-abstraction
+flag was justified as a runtime escape hatch for clusters with buggy
+RMA.  Gated-additive preserves that property without a second
+implementation of the equal-rank path: if the `fullcomm` path
+misbehaves on some MPI build, the user sets the ratios back to 1.0 and is
+on the proven path.  Disabling the feature *is* the escape hatch.
+
+**Cost we accept.**  Two window topologies live in the tree
+permanently -- per-`strata_comm` windows for the equal-rank case and the
+`fullcomm` window for the unequal case -- and we do *not* plan a later
+unification onto a single communicator.  This is benign clutter: the two
+are not competing implementations of the same behavior (which would
+drift), but the right tool for two different cases, partitioned by a
+condition decided once at startup.  The thing given up is the aesthetic
+of a single communicator; if a future benchmark ever shows `fullcomm` is
+strictly better even at equal ranks, the cutover remains available as
+separate work, but it is not undertaken speculatively for an unstable
+dependency we do not control.
+
+**Workflow: stacked PRs.**  Because reviews of this material are slow,
+phases are developed as a *stack* of pull requests rather than waiting
+for each to merge before the next begins.  Each phase lives on its own
+short-lived branch whose PR targets the branch *below* it (phase N+1 onto
+phase N, phase 1 onto `main`); the stack is reviewed and merged
 bottom-up, each PR being retargeted to `main` as the one beneath it
 lands.  This keeps work moving while earlier PRs sit in review, yet --
 unlike a long-lived feature branch or a development fork -- every change
@@ -652,22 +728,11 @@ are subtle.  If isolation is ever genuinely wanted, prefer a branch in
 the upstream repository over a separate fork -- same isolation, far less
 CI and merge friction.)
 
-**Live-path rework: branch by abstraction.**  Stacked PRs govern *how*
-phases are reviewed and merged; the communicator rework that replaces
-`strata_comm` additionally needs care in *how the code is structured*,
-because it changes the path every run exercises.  Implement the new
-fullcomm / Option-D path *alongside* the existing `strata_comm` path,
-selected by a flag, defaulting to the **old** path.  Land it with the old
-path still default, so current users are unaffected; exercise the new
-path opt-in; make it the default only once it is proven; remove the old
-path afterward.  This yields a **runtime fallback** a feature branch or
-fork does not: if a cluster's MPI has buggy RMA, users can switch back to
-the proven path.  For an unstable dependency we do not control, a toggle
-available in production is worth more than isolation during development.
-
-**Gate the default flip on an MPI CI matrix.**  Exercise the new path on
-at least two MPI implementations (e.g. OpenMPI and MPICH) and more than
-one mpi4py / MPI version before it becomes the default.
+**Gate reliance on the feature with an MPI CI matrix.**  There is no
+default to flip, but before the `fullcomm` path is advertised as
+supported, exercise it on at least two MPI implementations (e.g. OpenMPI
+and MPICH) and more than one mpi4py / MPI version, since that path is
+where the RMA-portability risk lives.
 
 
 ### Possible future optimization (out of scope)
@@ -684,15 +749,18 @@ work.
 
 ### Backward Compatibility
 
-When all rank ratios are 1.0 (the default), the system must behave
-identically to the current implementation.  The overlap maps degenerate
-to single-segment identity mappings, multi-source reads become
-single-source reads, and the strict/relaxed coherence checks are
-satisfied trivially (one source per field).  Because this design makes
-**no field-layout changes**, the on-the-wire format is unchanged at
-equal ranks — there is no analogue of the first pass's "Phase 0 changes
-the wire format even at equal ranks" caveat.  Verify by running the
-full existing test suite with the new code and default ratios.
+When all rank ratios are 1.0 (the default), the system behaves
+identically to the current implementation — not via a degenerate case of
+the new machinery, but because the new machinery is **not reached at
+all**.  At equal ratios the existing per-`strata_comm` windows,
+single-source reader, and `strata_rank` addressing run verbatim; the
+overlap maps, `fullcomm` window, and multi-source reader are gated behind
+a non-default ratio (see §Development and rollout strategy).  Because
+this design also makes **no field-layout changes**, the on-the-wire
+format is unchanged at equal ranks — there is no analogue of the first
+pass's "Phase 0 changes the wire format even at equal ranks" caveat.
+Verify by running the full existing test suite with the new code and
+default ratios.
 
 
 ---

From b6f17fd564ab6fe8e62165eac60f71b2626a3803 Mon Sep 17 00:00:00 2001
From: Dave Woodruff <david.l.woodruff@gmail.com>
Date: Sun, 24 May 2026 07:27:32 -0700
Subject: [PATCH 8/9] doc(flex-ranks): scope spin_the_wheel impact bullets to
 the unequal path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The divisibility check and uniform MPI_Comm_split are kept on the
equal-rank path (the gate runs before _make_comms, which still enforces
n_proc % n_spcomms == 0); only the unequal-rank path bypasses the check
and uses apportionment. Reword the two §Impact bullets that read as an
unconditional remove/replace, consistent with the gated-additive
rollout.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 doc/designs/flexible_rank_assignments.md | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/doc/designs/flexible_rank_assignments.md b/doc/designs/flexible_rank_assignments.md
index 8d9a8deac..0777edbb9 100644
--- a/doc/designs/flexible_rank_assignments.md
+++ b/doc/designs/flexible_rank_assignments.md
@@ -516,9 +516,13 @@ unnecessary given the per-field analysis).
 
 #### `spin_the_wheel.py`
 
-- Remove the `n_proc % n_spcomms == 0` check.
-- Replace the uniform `MPI_Comm_split` with the apportionment
-  algorithm (§User Interface) that respects per-cylinder rank counts.
+- Keep the `n_proc % n_spcomms == 0` check on the equal-rank path; the
+  unequal-rank path bypasses it, since apportionment (§User Interface)
+  removes the requirement that `n_proc` divide evenly by cylinder count.
+- On the unequal-rank path, replace the uniform `MPI_Comm_split` with
+  the apportionment algorithm (§User Interface) that respects
+  per-cylinder rank counts.  The equal-rank path keeps the uniform
+  split unchanged.
 - `cylinder_comm` creation still uses `MPI_Comm_split` (all ranks in
   the same cylinder get the same color).
 - `strata_comm` is **retained** for the equal-rank path.  When ranks

From d0983640861b11a78d263d1ccb909ef054657d62 Mon Sep 17 00:00:00 2001
From: Dave Woodruff <david.l.woodruff@gmail.com>
Date: Sun, 24 May 2026 07:36:38 -0700
Subject: [PATCH 9/9] doc(flex-ranks): clarify strata_comm is not created in an
 unequal run

"Retained" was ambiguous: it meant the strata_comm code path stays in
the codebase for equal-rank runs, but read as if strata_comm coexists
with the fullcomm window inside an unequal-rank run. It does not -- the
strata grouping (color = global_rank // n_spcomms) is undefined when
cylinders have different rank counts, which is the reason for Option D.
State the per-run rule explicitly: an equal-rank run creates strata_comm
and windows on it; an unequal-rank run does not create strata_comm and
puts the window on fullcomm; cylinder_comm is created in both.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 doc/designs/flexible_rank_assignments.md | 50 ++++++++++++++++--------
 1 file changed, 34 insertions(+), 16 deletions(-)

diff --git a/doc/designs/flexible_rank_assignments.md b/doc/designs/flexible_rank_assignments.md
index 0777edbb9..db0dd5d6b 100644
--- a/doc/designs/flexible_rank_assignments.md
+++ b/doc/designs/flexible_rank_assignments.md
@@ -373,15 +373,29 @@ window creation.  Cons:
   Option D.
 
 **Recommendation:** Option D, *as the topology for the unequal-rank
-path only*.  When any rank ratio differs from 1.0, the window is on
-`fullcomm` (or a subset) and each rank knows which global ranks to read
-from via the overlap maps.  `strata_comm` is **not** removed: the
-equal-rank case keeps its existing per-`strata_comm` windows untouched,
-and the `fullcomm` topology is built additively, reached only when the
-feature is engaged (see §Development and rollout strategy).  "Abandon
-`strata_comm`" above describes Option D as a global replacement, which
-this design does *not* adopt; it borrows only Option D's addressing
-scheme for the new path.
+path only*.  A given run uses exactly one topology, chosen at startup
+from the rank ratios:
+
+- *Equal-rank run* (all ratios 1.0): create `strata_comm` and
+  `cylinder_comm`, and put the window on `strata_comm` -- exactly as
+  today.
+- *Unequal-rank run* (any ratio differs from 1.0): **`strata_comm` is
+  not created at all.**  The strata grouping (`color =
+  global_rank // n_spcomms`, "rank *i* of every cylinder") is undefined
+  when cylinders have different rank counts -- that is the whole reason
+  for Option D.  Create `cylinder_comm` only (each cylinder is a
+  contiguous block of apportioned ranks, so the intra-cylinder grouping
+  is still well-defined), put the window on `fullcomm`, and have each
+  rank address peers by global rank via the overlap maps.  Each rank
+  takes its cylinder index from the apportionment rather than from
+  `strata_rank`.
+
+So `strata_comm` is *retained in the codebase* -- the equal-rank path
+still uses it unchanged -- but it is *not created in an unequal-rank
+run*.  This is additive, not a replacement: the textbook "abandon
+`strata_comm`" framing in the Option D description above would remove it
+for *all* runs, which this design does not do.  See §Development and
+rollout strategy.
 
 
 #### Multi-Source Read Assembly
@@ -525,15 +539,19 @@ unnecessary given the per-field analysis).
   split unchanged.
 - `cylinder_comm` creation still uses `MPI_Comm_split` (all ranks in
   the same cylinder get the same color).
-- `strata_comm` is **retained** for the equal-rank path.  When ranks
-  are equal across cylinders, `strata_comm` and its per-strata windows
-  are used exactly as today.  When ratios differ, the window is built
-  on `fullcomm` instead (Option D's addressing), selected once at
-  startup; see §Development and rollout strategy.
+- `strata_comm` is **retained in the codebase** for the equal-rank
+  path: when ranks are equal across cylinders, `strata_comm` and its
+  per-strata windows are created and used exactly as today.  In an
+  unequal-rank run `strata_comm` is **not created** -- the strata
+  grouping is undefined for cylinders of different sizes -- and the
+  window is built on `fullcomm` instead (Option D's addressing).  The
+  choice is made once at startup; a run creates one or the other, never
+  both.  `cylinder_comm` is created in both cases.  See §Development and
+  rollout strategy.
 - In the unequal-rank path, indexing that assumes a fixed `strata_rank`
   meaning does not apply; each rank addresses peers by global rank via
-  the overlap maps.  The equal-rank path's `strata_rank` indexing is
-  unchanged.
+  the overlap maps, taking its cylinder index from the apportionment.
+  The equal-rank path's `strata_rank` indexing is unchanged.
 
 #### `spbase.py`