From 35f71ba905eec3268d7ca578e176f9cf349d1f3c Mon Sep 17 00:00:00 2001 From: Dave Woodruff Date: Sat, 23 May 2026 19:05:15 -0700 Subject: [PATCH 1/9] flex-ranks phase 1: rank apportionment by target ratio Add apportion_ranks(ratios, n_proc): largest-remainder (Hare quota) apportionment of a fixed rank pool across cylinders by target ratio, with a floor of one rank per cylinder. Counts sum to exactly n_proc and every cylinder gets at least one rank. Raises when there are more cylinders than ranks. Pure function; not yet wired into WheelSpinner. Per doc/designs/flexible_rank_assignments.md, the User Interface apportionment algorithm. Tests cover the worked design-doc example, tie-breaking by declaration order, scale invariance, the floor-of-one rescue, C==np and C>np, and a property sweep (sum==n_proc, all >=1) across many cases. Wired into run_coverage.bash and test_pr_and_main.yml. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/test_pr_and_main.yml | 3 +- mpisppy/tests/test_rank_apportionment.py | 110 +++++++++++++++++++++++ mpisppy/utils/rank_apportionment.py | 84 +++++++++++++++++ run_coverage.bash | 3 + 4 files changed, 199 insertions(+), 1 deletion(-) create mode 100644 mpisppy/tests/test_rank_apportionment.py create mode 100644 mpisppy/utils/rank_apportionment.py diff --git a/.github/workflows/test_pr_and_main.yml b/.github/workflows/test_pr_and_main.yml index f8ea33ac2..9b292ffd3 100644 --- a/.github/workflows/test_pr_and_main.yml +++ b/.github/workflows/test_pr_and_main.yml @@ -1127,7 +1127,8 @@ jobs: mpisppy/tests/test_reduced_costs_rho_bundles.py \ mpisppy/tests/test_rho_deprecations.py \ mpisppy/tests/test_buffer_inspect.py \ - mpisppy/tests/test_comm_lor_check.py + mpisppy/tests/test_comm_lor_check.py \ + mpisppy/tests/test_rank_apportionment.py - name: Upload coverage data if: always() diff --git a/mpisppy/tests/test_rank_apportionment.py b/mpisppy/tests/test_rank_apportionment.py new file mode 100644 index 000000000..1bc49e565 --- /dev/null +++ b/mpisppy/tests/test_rank_apportionment.py @@ -0,0 +1,110 @@ +############################################################################### +# mpi-sppy: MPI-based Stochastic Programming in PYthon +# +# Copyright (c) 2024, Lawrence Livermore National Security, LLC, Alliance for +# Sustainable Energy, LLC, The Regents of the University of California, et al. +# All rights reserved. Please see the files COPYRIGHT.md and LICENSE.md for +# full copyright and license information. +############################################################################### +"""Unit tests for mpisppy.utils.rank_apportionment.apportion_ranks.""" + +import unittest + +from mpisppy.utils.rank_apportionment import apportion_ranks + + +class TestApportionRanks(unittest.TestCase): + + def test_equal_ratios_divisible(self): + # Clean split when ranks divide evenly among equal cylinders. + self.assertEqual(apportion_ranks([1.0, 1.0, 1.0], 9), [3, 3, 3]) + + def test_equal_ratios_indivisible_breaks_ties_by_order(self): + # 10 ranks, 3 equal cylinders: each floors to 3 (sum 9), and the + # single leftover goes to the lowest-index cylinder on the tie. + self.assertEqual(apportion_ranks([1.0, 1.0, 1.0], 10), [4, 3, 3]) + + def test_two_equal_cylinders_tie(self): + # 3 ranks, 2 equal cylinders: shares 1.5 / 1.5, leftover -> idx 0. + self.assertEqual(apportion_ranks([1.0, 1.0], 3), [2, 1]) + + def test_design_doc_example(self): + # The worked example from the design doc: hub 1.0, lagrangian 0.5, + # xhat 0.25, np 14 -> 8 / 4 / 2 (exact, no remainder). + self.assertEqual(apportion_ranks([1.0, 0.5, 0.25], 14), [8, 4, 2]) + + def test_ratio_scale_invariance(self): + # Only relative magnitudes matter. + self.assertEqual( + apportion_ranks([4.0, 2.0, 1.0], 14), + apportion_ranks([1.0, 0.5, 0.25], 14), + ) + + def test_floor_of_one_enforced(self): + # A tiny ratio would round to zero; the floor-of-one pass rescues it + # by taking a rank from the largest cylinder. + counts = apportion_ranks([10.0, 0.01], 3) + self.assertEqual(counts, [2, 1]) + + def test_floor_of_one_many_small(self): + # Several near-zero cylinders all get rescued to 1. + counts = apportion_ranks([100.0, 0.001, 0.001, 0.001], 7) + self.assertEqual(sum(counts), 7) + self.assertTrue(all(c >= 1 for c in counts)) + self.assertEqual(counts[0], 4) # big cylinder keeps the rest + + def test_cylinders_equal_ranks(self): + # C == n_proc: everyone gets exactly one. + self.assertEqual(apportion_ranks([1.0, 2.0, 3.0], 3), [1, 1, 1]) + + def test_single_cylinder_gets_all(self): + self.assertEqual(apportion_ranks([1.0], 5), [5]) + + def test_more_cylinders_than_ranks_raises(self): + with self.assertRaises(ValueError): + apportion_ranks([1.0, 1.0, 1.0, 1.0], 3) + + def test_invalid_inputs_raise(self): + with self.assertRaises(ValueError): + apportion_ranks([], 4) + with self.assertRaises(ValueError): + apportion_ranks([1.0, 1.0], 0) + with self.assertRaises(ValueError): + apportion_ranks([1.0, -1.0], 4) + with self.assertRaises(ValueError): + apportion_ranks([1.0, 0.0], 4) + + def test_invariants_over_many_cases(self): + # Property sweep: counts always sum to n_proc, are all >= 1, and + # match the cylinder count. + ratio_pools = [ + [1.0, 1.0, 1.0], + [1.0, 0.5, 0.25], + [3.0, 1.0], + [5.0, 2.0, 2.0, 1.0], + [1.0, 1.0, 1.0, 1.0, 1.0], + ] + for ratios in ratio_pools: + for n_proc in range(len(ratios), len(ratios) + 25): + counts = apportion_ranks(ratios, n_proc) + self.assertEqual(len(counts), len(ratios)) + self.assertEqual(sum(counts), n_proc) + self.assertTrue(all(c >= 1 for c in counts), + msg=f"{ratios=} {n_proc=} -> {counts=}") + + def test_monotone_in_ratio(self): + # With enough ranks, a larger ratio never gets fewer ranks than a + # smaller one (sanity on the apportionment ordering). + counts = apportion_ranks([3.0, 2.0, 1.0], 30) + self.assertGreaterEqual(counts[0], counts[1]) + self.assertGreaterEqual(counts[1], counts[2]) + + def test_order_preserved(self): + # Output position i corresponds to input ratio i. + a = apportion_ranks([1.0, 0.5, 0.25], 14) + b = apportion_ranks([0.25, 0.5, 1.0], 14) + self.assertEqual(a, list(reversed(b))) + + +if __name__ == "__main__": + unittest.main() diff --git a/mpisppy/utils/rank_apportionment.py b/mpisppy/utils/rank_apportionment.py new file mode 100644 index 000000000..df374ad15 --- /dev/null +++ b/mpisppy/utils/rank_apportionment.py @@ -0,0 +1,84 @@ +############################################################################### +# mpi-sppy: MPI-based Stochastic Programming in PYthon +# +# Copyright (c) 2024, Lawrence Livermore National Security, LLC, Alliance for +# Sustainable Energy, LLC, The Regents of the University of California, et al. +# All rights reserved. Please see the files COPYRIGHT.md and LICENSE.md for +# full copyright and license information. +############################################################################### +"""Apportion a fixed pool of MPI ranks across cylinders by target ratio. + +Supports flexible (per-cylinder) rank counts for WheelSpinner. See +``doc/designs/flexible_rank_assignments.md`` (the "User Interface" +section) for the algorithm and its rationale. +""" + +import math + + +def apportion_ranks(ratios, n_proc): + """Apportion ``n_proc`` ranks across cylinders by target ratio. + + Uses the largest-remainder (Hare quota) method, then enforces a + floor of one rank per cylinder. The returned counts sum to exactly + ``n_proc`` and every entry is at least 1, so every cylinder runs. + + Args: + ratios: ordered sequence of positive target ratios, one per + cylinder, in declaration order (the hub is conventionally + first with ratio 1.0). Only relative magnitudes matter. + n_proc: total number of MPI ranks to distribute (> 0). + + Returns: + list[int]: rank count per cylinder, in the same order as + ``ratios``, summing to ``n_proc``, each at least 1. + + Raises: + ValueError: if there are more cylinders than ranks (a floor of + one each is then infeasible), or if any ratio is + non-positive, or if ``n_proc`` is non-positive, or if no + cylinders are given. + """ + n_cyl = len(ratios) + if n_cyl == 0: + raise ValueError("apportion_ranks: need at least one cylinder") + if n_proc <= 0: + raise ValueError(f"apportion_ranks: n_proc must be positive (got {n_proc})") + if any(r <= 0 for r in ratios): + raise ValueError(f"apportion_ranks: ratios must be positive (got {list(ratios)})") + if n_cyl > n_proc: + raise ValueError( + f"apportion_ranks: {n_cyl} cylinders cannot each receive at least " + f"one rank from only {n_proc} ranks" + ) + + total = math.fsum(ratios) + real_share = [r / total * n_proc for r in ratios] + counts = [math.floor(x) for x in real_share] + + # Largest-remainder: hand out the leftover ranks to the biggest + # fractional remainders, breaking ties by declaration order (lowest + # index first). + leftover = n_proc - sum(counts) + if leftover > 0: + by_remainder = sorted( + range(n_cyl), + key=lambda i: (real_share[i] - counts[i], -i), + reverse=True, + ) + for i in by_remainder[:leftover]: + counts[i] += 1 + + # Floor of one: move a rank from the currently-largest cylinder to a + # cylinder stuck at zero, until none remain at zero. This is always + # feasible because n_cyl <= n_proc, so whenever a zero exists some + # other cylinder holds at least two. + while True: + zeros = [i for i, c in enumerate(counts) if c == 0] + if not zeros: + break + donor = max(range(n_cyl), key=lambda i: counts[i]) + counts[donor] -= 1 + counts[zeros[0]] += 1 + + return counts diff --git a/run_coverage.bash b/run_coverage.bash index 1cfb7a1cf..996a6c295 100755 --- a/run_coverage.bash +++ b/run_coverage.bash @@ -121,6 +121,9 @@ run_phase "test_reduced_costs_rho_bundles (serial)" \ run_phase "test_rho_deprecations (serial)" \ coverage run --rcfile=.coveragerc -m pytest mpisppy/tests/test_rho_deprecations.py -v +run_phase "test_rank_apportionment (serial)" \ + coverage run --rcfile=.coveragerc -m pytest mpisppy/tests/test_rank_apportionment.py -v + run_phase "test_xhat_from_file (serial)" \ coverage run --rcfile=.coveragerc -m pytest mpisppy/tests/test_xhat_from_file.py -v From fc63b39b67f49893d95d9a9be910adee008a9d33 Mon Sep 17 00:00:00 2001 From: Dave Woodruff Date: Sat, 23 May 2026 19:07:57 -0700 Subject: [PATCH 2/9] flex-ranks phase 1: overlap-map computation Add OverlapSegment and compute_overlap_segments(): given a local rank's scenarios, a peer cylinder's contiguous scenario partition (the slices from scen_names_to_ranks), and a per-scenario item count, produce the list of segments that back the local field buffer from the peer's buffers. Contiguous same-rank runs are coalesced; equal rank counts degenerate to a single identity segment per rank. Pure (no MPI/Pyomo), so it unit tests directly. Not yet wired into the communicator. Per doc/designs/flexible_rank_assignments.md, Overlap Maps. Tests: equal-rank identity (single + multi-item), fewer/more remote ranks, straddling the peer split, run coalescing, variable per-scenario item counts, and a tiling invariant. Wired into run_coverage.bash and test_pr_and_main.yml. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/test_pr_and_main.yml | 3 +- mpisppy/cylinders/overlap_map.py | 93 ++++++++++++++++++++ mpisppy/tests/test_overlap_map.py | 115 +++++++++++++++++++++++++ run_coverage.bash | 3 + 4 files changed, 213 insertions(+), 1 deletion(-) create mode 100644 mpisppy/cylinders/overlap_map.py create mode 100644 mpisppy/tests/test_overlap_map.py diff --git a/.github/workflows/test_pr_and_main.yml b/.github/workflows/test_pr_and_main.yml index 9b292ffd3..5a25c600d 100644 --- a/.github/workflows/test_pr_and_main.yml +++ b/.github/workflows/test_pr_and_main.yml @@ -1128,7 +1128,8 @@ jobs: mpisppy/tests/test_rho_deprecations.py \ mpisppy/tests/test_buffer_inspect.py \ mpisppy/tests/test_comm_lor_check.py \ - mpisppy/tests/test_rank_apportionment.py + mpisppy/tests/test_rank_apportionment.py \ + mpisppy/tests/test_overlap_map.py - name: Upload coverage data if: always() diff --git a/mpisppy/cylinders/overlap_map.py b/mpisppy/cylinders/overlap_map.py new file mode 100644 index 000000000..794d4179e --- /dev/null +++ b/mpisppy/cylinders/overlap_map.py @@ -0,0 +1,93 @@ +############################################################################### +# mpi-sppy: MPI-based Stochastic Programming in PYthon +# +# Copyright (c) 2024, Lawrence Livermore National Security, LLC, Alliance for +# Sustainable Energy, LLC, The Regents of the University of California, et al. +# All rights reserved. Please see the files COPYRIGHT.md and LICENSE.md for +# full copyright and license information. +############################################################################### +"""Overlap maps for reading a local-sized field across cylinders that have +different MPI rank counts. + +When two cylinders hold the same scenarios but split them over a different +number of ranks, a reader rank's local buffer for a per-scenario field is +backed by one or more *segments* living in remote ranks' buffers. An +``OverlapSegment`` records, for one such segment, which remote rank holds +it and the matching offsets/length (in field *items*, e.g. nonants) in the +remote and local buffers. + +This module is deliberately free of MPI and Pyomo: it consumes the +contiguous per-rank scenario partition (the ``slices`` produced by +``_ScenTree.scen_names_to_ranks``) plus a per-scenario item count, so it +can be unit tested directly. See +``doc/designs/flexible_rank_assignments.md`` (the "Overlap Maps" section). +""" + +from dataclasses import dataclass + + +@dataclass +class OverlapSegment: + """One contiguous run of a local field buffer sourced from a single + remote rank's buffer. All offsets/lengths are in field items.""" + remote_rank: int # rank in the peer cylinder to read from + remote_offset: int # item offset of this run within the remote buffer + local_offset: int # item offset of this run within the local buffer + count: int # number of items in this run + + +def compute_overlap_segments(local_scen_idxs, remote_slices, items_per_scen): + """Segments mapping a local rank's field buffer onto remote buffers. + + Args: + local_scen_idxs: ordered global scenario indices held by this + (local) rank -- i.e. ``local_slices[local_rank]``. + remote_slices: the peer cylinder's partition, ``rank -> ordered + list of global scenario indices`` (the ``slices`` output of + ``scen_names_to_ranks`` for that cylinder). Determines both + which remote rank holds each scenario and the scenario's + offset within that rank's buffer. + items_per_scen: sequence indexed by global scenario index giving + the number of field items each scenario contributes (e.g. + ``len(nonant_indices)``). Uniform two-stage problems pass a + constant for every scenario; multistage problems may vary. + + Returns: + list[OverlapSegment]: segments covering the local buffer in order, + with contiguous same-rank runs coalesced. When both cylinders + have the same rank count this degenerates to a single identity + segment ``(remote_rank=local_rank, remote_offset=0, + local_offset=0, count=)``. + """ + # Invert remote_slices: global scenario index -> (remote rank, item + # offset of that scenario within the remote rank's buffer). + remote_rank_of_scen = {} + remote_offset_of_scen = {} + for rank, scen_idxs in enumerate(remote_slices): + offset = 0 + for scen in scen_idxs: + remote_rank_of_scen[scen] = rank + remote_offset_of_scen[scen] = offset + offset += items_per_scen[scen] + + segments = [] + cur = None + local_offset = 0 + for scen in local_scen_idxs: + rank = remote_rank_of_scen[scen] + r_off = remote_offset_of_scen[scen] + n = items_per_scen[scen] + # Coalesce only while we stay on the same remote rank AND the + # remote items remain contiguous with the run so far. + if cur is not None and cur.remote_rank == rank \ + and cur.remote_offset + cur.count == r_off: + cur.count += n + else: + if cur is not None: + segments.append(cur) + cur = OverlapSegment(remote_rank=rank, remote_offset=r_off, + local_offset=local_offset, count=n) + local_offset += n + if cur is not None: + segments.append(cur) + return segments diff --git a/mpisppy/tests/test_overlap_map.py b/mpisppy/tests/test_overlap_map.py new file mode 100644 index 000000000..ed6a718f8 --- /dev/null +++ b/mpisppy/tests/test_overlap_map.py @@ -0,0 +1,115 @@ +############################################################################### +# mpi-sppy: MPI-based Stochastic Programming in PYthon +# +# Copyright (c) 2024, Lawrence Livermore National Security, LLC, Alliance for +# Sustainable Energy, LLC, The Regents of the University of California, et al. +# All rights reserved. Please see the files COPYRIGHT.md and LICENSE.md for +# full copyright and license information. +############################################################################### +"""Unit tests for mpisppy.cylinders.overlap_map.compute_overlap_segments.""" + +import unittest + +from mpisppy.cylinders.overlap_map import OverlapSegment, compute_overlap_segments + + +def _seg(remote_rank, remote_offset, local_offset, count): + return OverlapSegment(remote_rank, remote_offset, local_offset, count) + + +class TestOverlapMap(unittest.TestCase): + + def _assert_tiles_local(self, segments, local_scen_idxs, items_per_scen): + # Segments must tile the local buffer contiguously from offset 0, + # with total length equal to the local rank's item count. + expected_local = sum(items_per_scen[s] for s in local_scen_idxs) + running = 0 + for seg in segments: + self.assertEqual(seg.local_offset, running) + running += seg.count + self.assertEqual(running, expected_local) + + def test_equal_ranks_is_identity_single_segment(self): + # Same rank count on both cylinders -> one identity segment per + # local rank (the no-op-at-equal-ranks backbone). + remote_slices = [[0, 1], [2, 3], [4, 5], [6, 7]] + items = [1] * 8 + for rank, local in enumerate(remote_slices): + segs = compute_overlap_segments(local, remote_slices, items) + self.assertEqual(segs, [_seg(rank, 0, 0, len(local))]) + + def test_equal_ranks_identity_multi_item(self): + # Identity holds with >1 item per scenario. + remote_slices = [[0, 1], [2, 3]] + items = [2] * 4 + segs = compute_overlap_segments([2, 3], remote_slices, items) + self.assertEqual(segs, [_seg(1, 0, 0, 4)]) + + def test_fewer_remote_ranks_doc_example(self): + # 4-rank hub reading a 2-rank spoke, 10 scenarios, 1 item each. + # Spoke split is contiguous: rank0 = scen0-4, rank1 = scen5-9. + remote_slices = [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]] + items = [1] * 10 + + # Hub rank holding scen0,1: wholly inside spoke rank 0. + self.assertEqual( + compute_overlap_segments([0, 1], remote_slices, items), + [_seg(0, 0, 0, 2)], + ) + # Hub rank holding scen4,5: straddles the spoke split -> two + # segments (scen4 from spoke rank0 @offset4, scen5 from spoke + # rank1 @offset0). + self.assertEqual( + compute_overlap_segments([4, 5], remote_slices, items), + [_seg(0, 4, 0, 1), _seg(1, 0, 1, 1)], + ) + # Hub rank holding scen7,8,9: wholly inside spoke rank1, coalesced. + self.assertEqual( + compute_overlap_segments([7, 8, 9], remote_slices, items), + [_seg(1, 2, 0, 3)], + ) + + def test_more_remote_ranks(self): + # 2-rank local cylinder reading a 4-rank remote cylinder. + remote_slices = [[0, 1], [2, 3, 4], [5, 6], [7, 8, 9]] + items = [1] * 10 + segs = compute_overlap_segments([0, 1, 2, 3, 4], remote_slices, items) + self.assertEqual(segs, [_seg(0, 0, 0, 2), _seg(1, 0, 2, 3)]) + self._assert_tiles_local(segs, [0, 1, 2, 3, 4], items) + + def test_coalesces_contiguous_run_within_remote_rank(self): + remote_slices = [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]] + items = [1] * 10 + segs = compute_overlap_segments([2, 3, 4], remote_slices, items) + self.assertEqual(segs, [_seg(0, 2, 0, 3)]) + + def test_variable_items_per_scenario(self): + # Multistage-style: scenarios contribute different item counts. + items = [1, 2, 3, 1, 2] + remote_slices = [[0, 1, 2], [3, 4]] + # Local rank holds scen1,2 (both in remote rank0). scen1 starts at + # remote offset items[0]=1; coalesced count = 2+3 = 5. + segs = compute_overlap_segments([1, 2], remote_slices, items) + self.assertEqual(segs, [_seg(0, 1, 0, 5)]) + self._assert_tiles_local(segs, [1, 2], items) + + def test_variable_items_across_remote_ranks(self): + items = [1, 2, 3, 1, 2] + remote_slices = [[0, 1], [2, 3, 4]] + # Local holds scen1 (remote rank0 @offset1, count2) then scen2 + # (remote rank1 @offset0, count3). + segs = compute_overlap_segments([1, 2], remote_slices, items) + self.assertEqual(segs, [_seg(0, 1, 0, 2), _seg(1, 0, 2, 3)]) + self._assert_tiles_local(segs, [1, 2], items) + + def test_single_scenario(self): + remote_slices = [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]] + items = [1] * 10 + self.assertEqual( + compute_overlap_segments([6], remote_slices, items), + [_seg(1, 1, 0, 1)], + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/run_coverage.bash b/run_coverage.bash index 996a6c295..5fba670f4 100755 --- a/run_coverage.bash +++ b/run_coverage.bash @@ -124,6 +124,9 @@ run_phase "test_rho_deprecations (serial)" \ run_phase "test_rank_apportionment (serial)" \ coverage run --rcfile=.coveragerc -m pytest mpisppy/tests/test_rank_apportionment.py -v +run_phase "test_overlap_map (serial)" \ + coverage run --rcfile=.coveragerc -m pytest mpisppy/tests/test_overlap_map.py -v + run_phase "test_xhat_from_file (serial)" \ coverage run --rcfile=.coveragerc -m pytest mpisppy/tests/test_xhat_from_file.py -v From bf01a038e5fa22c49d9f2ac4f545dad695a0364a Mon Sep 17 00:00:00 2001 From: Dave Woodruff Date: Sat, 23 May 2026 19:46:15 -0700 Subject: [PATCH 3/9] doc(flex-ranks): fix the overlap-map worked example Two errors in the 4-hub/2-spoke example, both verified against compute_overlap_segments: - Hub rank 2 (scen4,scen5) straddles the spoke's scen5 split, so it reads two segments (scen4 from spoke rank 0, scen5 from spoke rank 1), not one segment of count 2 from spoke rank 0. - Hub rank 3 (scen6-9) sits at offsets 1-4 within spoke rank 1's buffer (which begins at scen5), so remote_offset is 1, not 0. Also notes the split is illustrative (chosen to show a straddle) and states the one-nonant-per-scenario assumption. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/designs/flexible_rank_assignments.md | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/doc/designs/flexible_rank_assignments.md b/doc/designs/flexible_rank_assignments.md index 54b91feca..ff38a29d0 100644 --- a/doc/designs/flexible_rank_assignments.md +++ b/doc/designs/flexible_rank_assignments.md @@ -243,7 +243,9 @@ count to get each cylinder's distribution, then compute pairwise overlaps. For example, with a 4-rank hub and a 2-rank spoke, both handling 10 -scenarios: +scenarios (the per-rank split below is illustrative — the real split is +whatever `scen_names_to_ranks` produces — and is chosen so that a hub +rank straddles the spoke's boundary): ======== ================== ================== Hub (4 ranks) Spoke (2 ranks) @@ -254,23 +256,28 @@ Rank 2 scen4, scen5 Rank 3 scen6--scen9 ======== ================== ================== -Hub rank 0 needs to read from spoke rank 0 (which has scen0--scen4), -extracting only the portion for scen0--scen1. Hub rank 2 also reads -from spoke rank 0, extracting scen4--scen5. +Hub rank 0 reads from spoke rank 0 (which holds scen0--scen4), +extracting only the portion for scen0--scen1. Hub rank 2 holds +scen4--scen5, which *straddles* the spoke's split: scen4 lives in spoke +rank 0 (at offset 4) and scen5 in spoke rank 1 (at offset 0), so hub +rank 2 reads one segment from each. Hub rank 3's scen6--scen9 sit at +offsets 1--4 within spoke rank 1's buffer (which starts at scen5). -The overlap map for hub rank 0 reading from the spoke would be: +Taking one nonant per scenario for simplicity, the overlap map (in +nonant units) for hub rank 0 reading from the spoke would be: ``` [(spoke_rank=0, remote_offset=0, local_offset=0, count=2)] ``` -For hub rank 2: +For hub rank 2 (the straddle — two segments): ``` -[(spoke_rank=0, remote_offset=4, local_offset=0, count=2)] +[(spoke_rank=0, remote_offset=4, local_offset=0, count=1), + (spoke_rank=1, remote_offset=0, local_offset=1, count=1)] ``` For hub rank 3: ``` -[(spoke_rank=1, remote_offset=0, local_offset=0, count=4)] +[(spoke_rank=1, remote_offset=1, local_offset=0, count=4)] ``` These maps are computed once at startup and reused for every From 203e39ff786bfbceb8d696ac6a578b95ba96c15e Mon Sep 17 00:00:00 2001 From: Dave Woodruff Date: Sat, 23 May 2026 19:50:00 -0700 Subject: [PATCH 4/9] flex-ranks phase 1: partial-read support in SPWindow.get() Add optional item_offset / item_count to SPWindow.get() so a reader can fetch a sub-range of a field via MPI_Get displacement. Defaults (item_count=None) reproduce the original whole-padded-field transfer exactly, so existing callers are unaffected. Partial reads are the primitive multi-source assembly will use under unequal rank counts (see overlap_map.py). Tests exercise a real single-process RMA window (MPI.COMM_SELF): full read unchanged, prefix/middle/suffix/single-item partial reads, full-via-count, and out-of-range / dest-size-mismatch assertions. Wired into run_coverage.bash and test_pr_and_main.yml. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/test_pr_and_main.yml | 3 +- mpisppy/cylinders/spwindow.py | 26 ++++++- mpisppy/tests/test_spwindow_partial_get.py | 82 ++++++++++++++++++++++ run_coverage.bash | 3 + 4 files changed, 110 insertions(+), 4 deletions(-) create mode 100644 mpisppy/tests/test_spwindow_partial_get.py diff --git a/.github/workflows/test_pr_and_main.yml b/.github/workflows/test_pr_and_main.yml index 5a25c600d..575523d71 100644 --- a/.github/workflows/test_pr_and_main.yml +++ b/.github/workflows/test_pr_and_main.yml @@ -1129,7 +1129,8 @@ jobs: mpisppy/tests/test_buffer_inspect.py \ mpisppy/tests/test_comm_lor_check.py \ mpisppy/tests/test_rank_apportionment.py \ - mpisppy/tests/test_overlap_map.py + mpisppy/tests/test_overlap_map.py \ + mpisppy/tests/test_spwindow_partial_get.py - name: Upload coverage data if: always() diff --git a/mpisppy/cylinders/spwindow.py b/mpisppy/cylinders/spwindow.py index 547bdcf6a..003694262 100644 --- a/mpisppy/cylinders/spwindow.py +++ b/mpisppy/cylinders/spwindow.py @@ -178,18 +178,38 @@ def free(self): return #### Functions #### - def get(self, dest: nptyping.ArrayLike, strata_rank: int, field: Field): + def get(self, dest: nptyping.ArrayLike, strata_rank: int, field: Field, + item_offset: int = 0, item_count: int = None): + """Read a remote rank's buffer for ``field`` into ``dest``. + + By default the whole padded field is transferred (``dest`` must be + ``padded_len`` long), preserving the original behavior. When + ``item_count`` is given, only that many doubles are read, starting + ``item_offset`` doubles into the field -- a partial read used for + multi-source assembly across cylinders with different rank counts + (see ``overlap_map.py``). ``dest`` must then be ``item_count`` long. + """ assert (0 <= strata_rank < len(self.strata_buffer_layouts)) that_layout = self.strata_buffer_layouts[strata_rank] assert field in that_layout (offset, logical_len, padded_len) = that_layout[field] - assert np.size(dest) == padded_len + + if item_count is None: + count = padded_len + disp = offset + else: + assert item_offset >= 0 and item_count >= 0 + assert item_offset + item_count <= padded_len, \ + f"{field=} partial get {item_offset=}+{item_count=} exceeds {padded_len=}" + count = item_count + disp = offset + item_offset + assert np.size(dest) == count window = self.window window.Lock(strata_rank, MPI.LOCK_SHARED) - window.Get((dest, padded_len, MPI.DOUBLE), strata_rank, offset) + window.Get((dest, count, MPI.DOUBLE), strata_rank, disp) window.Unlock(strata_rank) return diff --git a/mpisppy/tests/test_spwindow_partial_get.py b/mpisppy/tests/test_spwindow_partial_get.py new file mode 100644 index 000000000..05487aaea --- /dev/null +++ b/mpisppy/tests/test_spwindow_partial_get.py @@ -0,0 +1,82 @@ +############################################################################### +# mpi-sppy: MPI-based Stochastic Programming in PYthon +# +# Copyright (c) 2024, Lawrence Livermore National Security, LLC, Alliance for +# Sustainable Energy, LLC, The Regents of the University of California, et al. +# All rights reserved. Please see the files COPYRIGHT.md and LICENSE.md for +# full copyright and license information. +############################################################################### +"""Partial-read support in SPWindow.get(), exercised on a single-process +RMA window (MPI.COMM_SELF). Verifies the default whole-field read is +unchanged and that item_offset/item_count read the right sub-range.""" + +import unittest + +import numpy as np + +from mpisppy import MPI +from mpisppy.cylinders.spwindow import Field, SPWindow, padded_len_n_doubles + + +class TestPartialGet(unittest.TestCase): + + def setUp(self): + self.logical = 10 + self.padded = padded_len_n_doubles(self.logical) + my_fields = {Field.NONANTS_VALS: (self.logical, self.padded)} + self.win = SPWindow(my_fields, MPI.COMM_SELF) + # Known, distinct values across the whole padded field. + self.data = np.arange(100, 100 + self.padded, dtype="d") + self.win.put(self.data, Field.NONANTS_VALS) + + def tearDown(self): + self.win.free() + + def test_full_read_unchanged(self): + # Default args: whole padded field, as before. + dest = np.empty(self.padded, dtype="d") + self.win.get(dest, 0, Field.NONANTS_VALS) + np.testing.assert_array_equal(dest, self.data) + + def test_partial_prefix(self): + dest = np.empty(3, dtype="d") + self.win.get(dest, 0, Field.NONANTS_VALS, item_offset=0, item_count=3) + np.testing.assert_array_equal(dest, self.data[0:3]) + + def test_partial_middle(self): + dest = np.empty(4, dtype="d") + self.win.get(dest, 0, Field.NONANTS_VALS, item_offset=2, item_count=4) + np.testing.assert_array_equal(dest, self.data[2:6]) + + def test_partial_suffix(self): + dest = np.empty(5, dtype="d") + self.win.get(dest, 0, Field.NONANTS_VALS, + item_offset=self.padded - 5, item_count=5) + np.testing.assert_array_equal(dest, self.data[self.padded - 5:]) + + def test_partial_full_via_count(self): + # item_count equal to padded length reproduces the full read. + dest = np.empty(self.padded, dtype="d") + self.win.get(dest, 0, Field.NONANTS_VALS, + item_offset=0, item_count=self.padded) + np.testing.assert_array_equal(dest, self.data) + + def test_single_item(self): + dest = np.empty(1, dtype="d") + self.win.get(dest, 0, Field.NONANTS_VALS, item_offset=7, item_count=1) + self.assertEqual(dest[0], self.data[7]) + + def test_out_of_range_raises(self): + dest = np.empty(2, dtype="d") + with self.assertRaises(AssertionError): + self.win.get(dest, 0, Field.NONANTS_VALS, + item_offset=self.padded - 1, item_count=2) + + def test_dest_size_mismatch_raises(self): + dest = np.empty(5, dtype="d") # wrong size for count=3 + with self.assertRaises(AssertionError): + self.win.get(dest, 0, Field.NONANTS_VALS, item_offset=0, item_count=3) + + +if __name__ == "__main__": + unittest.main() diff --git a/run_coverage.bash b/run_coverage.bash index 5fba670f4..49a1a7d38 100755 --- a/run_coverage.bash +++ b/run_coverage.bash @@ -127,6 +127,9 @@ run_phase "test_rank_apportionment (serial)" \ run_phase "test_overlap_map (serial)" \ coverage run --rcfile=.coveragerc -m pytest mpisppy/tests/test_overlap_map.py -v +run_phase "test_spwindow_partial_get (serial)" \ + coverage run --rcfile=.coveragerc -m pytest mpisppy/tests/test_spwindow_partial_get.py -v + run_phase "test_xhat_from_file (serial)" \ coverage run --rcfile=.coveragerc -m pytest mpisppy/tests/test_xhat_from_file.py -v From 109e5710aca5a80de194629a2c0f714678770020 Mon Sep 17 00:00:00 2001 From: Dave Woodruff Date: Sat, 23 May 2026 19:54:56 -0700 Subject: [PATCH 5/9] flex-ranks phase 1: read rank_ratio dict keys; gate the unequal path WheelSpinner.run() now reads a per-cylinder "rank_ratio" key from each hub/spoke dict (default 1.0). When all ratios are 1.0 the path is exactly as before. When any differs, it apportions the rank pool (largest-remainder, floor of one), reports the allocation via global_toc on rank 0, then raises NotImplementedError -- building communicators for unequal per-cylinder counts is not yet wired into the comm/window layer. This lands the apportionment wiring and the dict-key plumbing while keeping the uniform (default) behavior unchanged; verified by running test_with_cylinders (PH hub + xhatshuffle spoke) end to end. Tests use a stub comm to drive the gate without MPI: a non-uniform ratio raises NotImplementedError; an infeasible floor-of-one raises ValueError first; explicit all-1.0 ratios skip the gate. Wired into run_coverage.bash and test_pr_and_main.yml. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/test_pr_and_main.yml | 3 +- mpisppy/spin_the_wheel.py | 22 +++++++ mpisppy/tests/test_flexible_rank_gate.py | 75 ++++++++++++++++++++++++ run_coverage.bash | 3 + 4 files changed, 102 insertions(+), 1 deletion(-) create mode 100644 mpisppy/tests/test_flexible_rank_gate.py diff --git a/.github/workflows/test_pr_and_main.yml b/.github/workflows/test_pr_and_main.yml index 575523d71..904ed4b71 100644 --- a/.github/workflows/test_pr_and_main.yml +++ b/.github/workflows/test_pr_and_main.yml @@ -1130,7 +1130,8 @@ jobs: mpisppy/tests/test_comm_lor_check.py \ mpisppy/tests/test_rank_apportionment.py \ mpisppy/tests/test_overlap_map.py \ - mpisppy/tests/test_spwindow_partial_get.py + mpisppy/tests/test_spwindow_partial_get.py \ + mpisppy/tests/test_flexible_rank_gate.py - name: Upload coverage data if: always() diff --git a/mpisppy/spin_the_wheel.py b/mpisppy/spin_the_wheel.py index 2b3d4b8fa..1f41f32a0 100644 --- a/mpisppy/spin_the_wheel.py +++ b/mpisppy/spin_the_wheel.py @@ -12,6 +12,7 @@ from mpisppy.utils import nice_join from mpisppy.utils.sputils import first_stage_nonant_writer, scenario_tree_solution_writer +from mpisppy.utils.rank_apportionment import apportion_ranks class WheelSpinner: @@ -103,6 +104,27 @@ def run(self, comm_world=None): # Create the necessary communicators fullcomm = comm_world + + # Flexible rank assignments: each cylinder may request a rank ratio + # relative to the hub (default 1.0) via a "rank_ratio" dict key. The + # uniform case (all ratios 1.0) is handled exactly as before. A + # non-uniform request is apportioned (largest-remainder, floor of one) + # and reported, but building the communicators for unequal per-cylinder + # counts is not yet wired into the communicator/window layer. + rank_ratios = [d.get("rank_ratio", 1.0) for d in communicator_list] + if any(r != 1.0 for r in rank_ratios): + rank_counts = apportion_ranks(rank_ratios, fullcomm.Get_size()) + global_toc( + f"Requested per-cylinder rank ratios {rank_ratios} -> " + f"rank counts {rank_counts}", + fullcomm.Get_rank() == 0, + ) + raise NotImplementedError( + "Per-cylinder rank counts (flexible rank assignments) are not " + f"yet wired into the communicator layer; computed allocation " + f"{rank_counts}. Run with all rank_ratio == 1.0 for now." + ) + strata_comm, cylinder_comm = _make_comms(n_spcomms, fullcomm=fullcomm) strata_rank = strata_comm.Get_rank() cylinder_rank = cylinder_comm.Get_rank() diff --git a/mpisppy/tests/test_flexible_rank_gate.py b/mpisppy/tests/test_flexible_rank_gate.py new file mode 100644 index 000000000..a94fc44c2 --- /dev/null +++ b/mpisppy/tests/test_flexible_rank_gate.py @@ -0,0 +1,75 @@ +############################################################################### +# mpi-sppy: MPI-based Stochastic Programming in PYthon +# +# Copyright (c) 2024, Lawrence Livermore National Security, LLC, Alliance for +# Sustainable Energy, LLC, The Regents of the University of California, et al. +# All rights reserved. Please see the files COPYRIGHT.md and LICENSE.md for +# full copyright and license information. +############################################################################### +"""WheelSpinner reads per-cylinder rank_ratio dict keys and, when any is +non-default, apportions and reports the allocation then raises (the live +unequal-rank communicator path is not yet wired in). The uniform default +path is exercised by the cylinder test suite, not here. + +The gate fires before any communicator is built, so a stub comm with just +Get_size/Get_rank is enough to drive it without MPI.""" + +import unittest + +from mpisppy.spin_the_wheel import WheelSpinner + + +class _StubComm: + """Minimal comm: the gate only needs size and rank.""" + def __init__(self, size, rank=0): + self._size = size + self._rank = rank + + def Get_size(self): + return self._size + + def Get_rank(self): + return self._rank + + +def _min_dict(role, **extra): + # Dicts only need the keys WheelSpinner validates before the gate; + # the classes are never instantiated on this path. + d = {f"{role}_class": object, "opt_class": object} + d.update(extra) + return d + + +class TestFlexibleRankGate(unittest.TestCase): + + def test_nonuniform_ratio_raises_not_implemented(self): + hub = _min_dict("hub", rank_ratio=1.0) + spoke = _min_dict("spoke", rank_ratio=0.5) + ws = WheelSpinner(hub, [spoke]) + with self.assertRaises(NotImplementedError): + ws.run(comm_world=_StubComm(size=4)) + + def test_nonuniform_with_too_few_ranks_raises_value_error(self): + # When even the floor-of-one is infeasible, apportion_ranks raises + # ValueError first (before the NotImplementedError gate). + hub = _min_dict("hub", rank_ratio=1.0) + spoke_a = _min_dict("spoke", rank_ratio=0.5) + spoke_b = _min_dict("spoke", rank_ratio=0.5) + ws = WheelSpinner(hub, [spoke_a, spoke_b]) + with self.assertRaises(ValueError): + ws.run(comm_world=_StubComm(size=2)) + + def test_explicit_uniform_ratios_pass_the_gate(self): + # All ratios 1.0 -> gate is skipped; the next step (_make_comms) is + # reached and fails on the stub (no Split). Reaching that point at + # all proves the NotImplementedError gate did not fire. + hub = _min_dict("hub", rank_ratio=1.0) + spoke = _min_dict("spoke", rank_ratio=1.0) + ws = WheelSpinner(hub, [spoke]) + with self.assertRaises(Exception) as ctx: + ws.run(comm_world=_StubComm(size=2)) + self.assertNotIsInstance(ctx.exception, NotImplementedError) + + +if __name__ == "__main__": + unittest.main() diff --git a/run_coverage.bash b/run_coverage.bash index 49a1a7d38..7d96639a6 100755 --- a/run_coverage.bash +++ b/run_coverage.bash @@ -130,6 +130,9 @@ run_phase "test_overlap_map (serial)" \ run_phase "test_spwindow_partial_get (serial)" \ coverage run --rcfile=.coveragerc -m pytest mpisppy/tests/test_spwindow_partial_get.py -v +run_phase "test_flexible_rank_gate (serial)" \ + coverage run --rcfile=.coveragerc -m pytest mpisppy/tests/test_flexible_rank_gate.py -v + run_phase "test_xhat_from_file (serial)" \ coverage run --rcfile=.coveragerc -m pytest mpisppy/tests/test_xhat_from_file.py -v From bb2e63eaa0a0cea145cd03bceff5e58d8e7c864d Mon Sep 17 00:00:00 2001 From: Dave Woodruff Date: Sun, 24 May 2026 06:49:49 -0700 Subject: [PATCH 6/9] doc(flex-ranks): add a development and rollout strategy section Records the procedure decisions for landing this multi-phase, low-level MPI/RMA work safely: - Sequence phases by blast radius: inert, gated phases are safe to land incrementally; the live communicator rewrite is where risk concentrates. - Chosen workflow: stacked PRs (each phase branch targets the one below, merged bottom-up) -- keeps work moving despite slow reviews while every change stays small, reviewable, and bisectable, and main stays shippable; preferred over a long-lived branch / fork big-bang merge. - Live-path rewrite: branch by abstraction with a runtime fallback (new fullcomm/Option-D path alongside strata_comm, old path default until proven). - Gate the default flip on an MPI CI matrix (multiple MPI implementations and mpi4py versions). Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/designs/flexible_rank_assignments.md | 54 ++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/doc/designs/flexible_rank_assignments.md b/doc/designs/flexible_rank_assignments.md index ff38a29d0..f45d2240a 100644 --- a/doc/designs/flexible_rank_assignments.md +++ b/doc/designs/flexible_rank_assignments.md @@ -616,6 +616,60 @@ the first pass bundled into "Phase 0" has already landed separately. they should compose with async senders; verify and add tests. +### Development and rollout strategy + +These changes touch a low-level, implementation-sensitive corner of the +stack -- MPI one-sided (RMA) communication through mpi4py -- where +behavior varies across MPI implementations (OpenMPI, MPICH, vendor +builds) and versions, and where passive-target RMA (`Lock` / `Get` / +`Put`) is the flakiest part of the spec. Reviews of this material are +necessarily slow, and a regression in the communicator layer would +affect *every* run, not just flexible-rank runs. The phases above are +therefore sequenced by *blast radius*: + +- **Inert phases** (the infrastructure phase) add only gated, unreachable + code: at equal rank ratios the existing code path is unchanged. These + can land on `main` incrementally and safely -- each is small, + reviewable on its own, and cannot change behavior for current users. +- **Live-path phases** (the communicator rework that replaces + `strata_comm`) rewrite code that every run exercises, equal-rank or + not. This cannot hide behind the rank-ratio gate, and is where the + real risk concentrates. + +**Chosen workflow: stacked PRs.** Because reviews of this material are +slow, phases are developed as a *stack* of pull requests rather than +waiting for each to merge before the next begins. Each phase lives on +its own short-lived branch whose PR targets the branch *below* it (phase +N+1 onto phase N, phase 1 onto `main`); the stack is reviewed and merged +bottom-up, each PR being retargeted to `main` as the one beneath it +lands. This keeps work moving while earlier PRs sit in review, yet -- +unlike a long-lived feature branch or a development fork -- every change +stays small, individually reviewable, and revertible/bisectable, and +`main` stays continuously shippable. (A big-bang integration merge of an +entire multi-phase feature is the opposite: a large, hard-to-review, +hard-to-bisect surface, which is *more* dangerous for code whose failures +are subtle. If isolation is ever genuinely wanted, prefer a branch in +the upstream repository over a separate fork -- same isolation, far less +CI and merge friction.) + +**Live-path rework: branch by abstraction.** Stacked PRs govern *how* +phases are reviewed and merged; the communicator rework that replaces +`strata_comm` additionally needs care in *how the code is structured*, +because it changes the path every run exercises. Implement the new +fullcomm / Option-D path *alongside* the existing `strata_comm` path, +selected by a flag, defaulting to the **old** path. Land it with the old +path still default, so current users are unaffected; exercise the new +path opt-in; make it the default only once it is proven; remove the old +path afterward. This yields a **runtime fallback** a feature branch or +fork does not: if a cluster's MPI has buggy RMA, users can switch back to +the proven path. For an unstable dependency we do not control, a toggle +available in production is worth more than isolation during development. + +**Gate the default flip on an MPI CI matrix.** Exercise the new path on +at least two MPI implementations (e.g. OpenMPI and MPICH) and more than +one mpi4py / MPI version before it becomes the default. + + ### Possible future optimization (out of scope) The first-stage portion of `BEST_XHAT` / `RECENT_XHATS` is genuinely From b620f3364309f22d549abca4a130a2ccb6e1f4fc Mon Sep 17 00:00:00 2001 From: Dave Woodruff Date: Sun, 24 May 2026 07:23:08 -0700 Subject: [PATCH 7/9] doc(flex-ranks): adopt gated-additive rollout, not a live-path rework Rewrite the rollout strategy: flexible ranks are added as a gated-additive feature keyed on the rank-ratio config, rather than replacing strata_comm behind a runtime flag. At equal ratios the existing per-strata_comm windows and single-source reader run unchanged; the fullcomm + overlap-map path is reached only when a ratio differs from 1.0. Disabling the feature is the production fallback, so no second implementation of the equal-rank path is needed. Reconcile the sections that assumed Option D replaced strata_comm globally (Window Topology recommendation, Impact bullets for spin_the_wheel/spwindow/spcommunicator, Phase 2, Backward Compatibility) and add a key-decision pointer at the top. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/designs/flexible_rank_assignments.md | 202 +++++++++++++++-------- 1 file changed, 135 insertions(+), 67 deletions(-) diff --git a/doc/designs/flexible_rank_assignments.md b/doc/designs/flexible_rank_assignments.md index f45d2240a..8d9a8deac 100644 --- a/doc/designs/flexible_rank_assignments.md +++ b/doc/designs/flexible_rank_assignments.md @@ -3,6 +3,14 @@ Status: Design Document (Draft — second pass) Date: 2026-05-24 +**Key decision up front.** Flexible ranks are added as a +*gated-additive* feature, not a rework of the path every run exercises. +At equal rank ratios (today's only case) the existing `strata_comm` +windows and single-source reader run unchanged; the new `fullcomm` + +overlap-map machinery is reached only when a rank ratio differs from +1.0. The rationale, trade-offs, and rollout workflow are in +§Development and rollout strategy. + ### Terminology - **Window:** An MPI one-sided ("RMA") addressing scope, created @@ -301,7 +309,10 @@ multi-stage problems (though for two-stage problems they are uniform). #### Window Topology Options -Spoiler alert: we are going to recommend option D, which is similar to A. +Of the options below we settle on Option D, though only as the topology +for the unequal-rank path; it is similar to Option A. See §Development +and rollout strategy for why it is added alongside the existing +`strata_comm` rather than replacing it. **Option A: Single global window on `fullcomm`** @@ -361,9 +372,16 @@ window creation. Cons: those pairs) is an incremental fix that does not require abandoning Option D. -**Recommendation:** Option D. The window is on `fullcomm` (or a -subset), and each rank knows which global ranks to read from via the -overlap maps. +**Recommendation:** Option D, *as the topology for the unequal-rank +path only*. When any rank ratio differs from 1.0, the window is on +`fullcomm` (or a subset) and each rank knows which global ranks to read +from via the overlap maps. `strata_comm` is **not** removed: the +equal-rank case keeps its existing per-`strata_comm` windows untouched, +and the `fullcomm` topology is built additively, reached only when the +feature is engaged (see §Development and rollout strategy). "Abandon +`strata_comm`" above describes Option D as a global replacement, which +this design does *not* adopt; it borrows only Option D's addressing +scheme for the new path. #### Multi-Source Read Assembly @@ -503,9 +521,15 @@ unnecessary given the per-field analysis). algorithm (§User Interface) that respects per-cylinder rank counts. - `cylinder_comm` creation still uses `MPI_Comm_split` (all ranks in the same cylinder get the same color). -- `strata_comm` is removed (Option D); the window moves to `fullcomm`. -- Indexing that assumes a fixed `strata_rank` meaning must change; - each rank needs to know its cylinder index directly. +- `strata_comm` is **retained** for the equal-rank path. When ranks + are equal across cylinders, `strata_comm` and its per-strata windows + are used exactly as today. When ratios differ, the window is built + on `fullcomm` instead (Option D's addressing), selected once at + startup; see §Development and rollout strategy. +- In the unequal-rank path, indexing that assumes a fixed `strata_rank` + meaning does not apply; each rank addresses peers by global rank via + the overlap maps. The equal-rank path's `strata_rank` indexing is + unchanged. #### `spbase.py` @@ -514,25 +538,30 @@ unnecessary given the per-field analysis). #### `spwindow.py` -- `SPWindow.get()` must support partial reads (offset + count). -- The buffer layout exchange must use a communicator that spans all - cylinders (not `strata_comm`), or be computed locally from the - static rank-count/scenario-map data (preferred). -- Buffer validation must account for asymmetric sizes: a receiver's - expected size may differ from the sender's, and that is fine as long - as each requested segment fits within the sender's buffer. +- `SPWindow.get()` must support partial reads (offset + count) -- an + additive capability the equal-rank path does not exercise. +- For the unequal-rank path, the buffer layout must be exchanged over a + communicator that spans all cylinders, or (preferred) computed + locally from the static rank-count/scenario-map data. The + equal-rank path keeps its existing `strata_comm` allgather. +- Buffer validation must account for asymmetric sizes on the + multi-source path: a receiver's expected size may differ from the + sender's, and that is fine as long as each requested segment fits + within the sender's buffer. - No field-length or layout changes: every field keeps its current per-scenario / global / scalar sizing. (Canonicalization is *not* part of this design.) #### `spcommunicator.py` -- `register_receive_fields()` builds overlap maps instead of assuming - 1-to-1 rank correspondence. -- `get_receive_buffer()` supports multi-source assembly, with the - per-field coherence policy applied. +- `register_receive_fields()` builds overlap maps for the unequal-rank + path; the equal-rank path retains the 1-to-1 rank correspondence. +- `get_receive_buffer()` gains a multi-source assembly path (with the + per-field coherence policy applied), selected when ratios differ; the + single-source path is unchanged at equal ranks. - `put_send_buffer()` is unchanged (each rank writes its own data). -- `_validate_recv_field()` is relaxed to per-segment validation. +- `_validate_recv_field()` gains per-segment validation for the + multi-source path; equal-rank validation is unchanged. - The `synchronize` parameter's `cylinder_comm.Barrier()` / `Allreduce` still work within a cylinder. @@ -578,12 +607,17 @@ the first pass bundled into "Phase 0" has already landed separately. various rank-count combinations (including the equal-rank identity case). -**Phase 2: Communication layer** - -- Replace the `strata_comm`-based layout exchange with a - `fullcomm`-based or locally-computed layout (Option D). -- Implement multi-source `get_receive_buffer()` using overlap maps. -- Relax `_validate_recv_field()` to per-segment validation. +**Phase 2: Communication layer** (additive; reached only when a ratio +differs from 1.0) + +- Add a `fullcomm`-based or locally-computed layout exchange for the + unequal-rank path (Option D's addressing), *alongside* the existing + `strata_comm`-based exchange, which the equal-rank path keeps using. +- Implement multi-source `get_receive_buffer()` using overlap maps, as + a path taken only under non-default ratios; the single-source reader + is unchanged for the equal-rank case. +- Add per-segment validation for the multi-source path; leave + `_validate_recv_field()` as-is on the equal-rank path. - Test the relaxed-coherence per-scenario fields first (`NONANTS_VALS`) with a simple case: hub with 4 ranks, Lagrangian spoke with 2 ranks. @@ -624,23 +658,65 @@ behavior varies across MPI implementations (OpenMPI, MPICH, vendor builds) and versions, and where passive-target RMA (`Lock` / `Get` / `Put`) is the flakiest part of the spec. Reviews of this material are necessarily slow, and a regression in the communicator layer would -affect *every* run, not just flexible-rank runs. The phases above are -therefore sequenced by *blast radius*: - -- **Inert phases** (the infrastructure phase) add only gated, unreachable - code: at equal rank ratios the existing code path is unchanged. These - can land on `main` incrementally and safely -- each is small, - reviewable on its own, and cannot change behavior for current users. -- **Live-path phases** (the communicator rework that replaces - `strata_comm`) rewrite code that every run exercises, equal-rank or - not. This cannot hide behind the rank-ratio gate, and is where the - real risk concentrates. - -**Chosen workflow: stacked PRs.** Because reviews of this material are -slow, phases are developed as a *stack* of pull requests rather than -waiting for each to merge before the next begins. Each phase lives on -its own short-lived branch whose PR targets the branch *below* it (phase -N+1 onto phase N, phase 1 onto `main`); the stack is reviewed and merged +affect *every* run, not just flexible-rank runs. The guiding principle +is therefore to **keep the equal-rank path off the table entirely**: the +common case that every current user exercises must execute the same code +after this feature lands as before it. + +**Gated-additive, not a live-path rework.** An earlier draft proposed +replacing `strata_comm` with a single `fullcomm` window (Option D) for +*all* runs, introduced via branch-by-abstraction behind a runtime flag +that defaulted to the old path and was to be removed once the new path +proved out. We reject that here. A replacement -- even one staged +behind a flag -- rewrites the path every run hits, and the temporary +"two implementations behind a flag, delete the old one later" state is +exactly the kind of clutter that becomes permanent: removing the proven +path is always scarier than keeping it, the two paths drift because tests +rarely exercise both equally, and the toggle outlives its purpose. + +Instead, the new `fullcomm` + overlap-map machinery is **purely +additive and gated by the feature itself**. The rank-ratio +configuration *is* the gate -- no separate runtime flag: + +- When all ratios are 1.0 (today's only possibility), the system builds + the existing per-`strata_comm` windows and uses the existing + single-source reader, unchanged. Current users execute the same bytes + they execute now. +- When any ratio differs from 1.0, `make_windows()` builds the + `fullcomm` window (Option D's topology) and the multi-source, + overlap-map reader is used. A run uses exactly one topology + throughout, chosen once at startup; the two never coexist within a + single run. + +This makes *every* phase inert with respect to the equal-rank path -- +there is no live-path phase to sequence around. Each phase adds gated +code that is unreachable until a user opts in with a non-default ratio, +so each can land on `main` incrementally and safely. + +**The production fallback comes for free.** The branch-by-abstraction +flag was justified as a runtime escape hatch for clusters with buggy +RMA. Gated-additive preserves that property without a second +implementation of the equal-rank path: if the `fullcomm` path +misbehaves on some MPI build, the user sets the ratios back to 1.0 and is +on the proven path. Disabling the feature *is* the escape hatch. + +**Cost we accept.** Two window topologies live in the tree +permanently -- per-`strata_comm` windows for the equal-rank case and the +`fullcomm` window for the unequal case -- and we do *not* plan a later +unification onto a single communicator. This is benign clutter: the two +are not competing implementations of the same behavior (which would +drift), but the right tool for two different cases, partitioned by a +condition decided once at startup. The thing given up is the aesthetic +of a single communicator; if a future benchmark ever shows `fullcomm` is +strictly better even at equal ranks, the cutover remains available as +separate work, but it is not undertaken speculatively for an unstable +dependency we do not control. + +**Workflow: stacked PRs.** Because reviews of this material are slow, +phases are developed as a *stack* of pull requests rather than waiting +for each to merge before the next begins. Each phase lives on its own +short-lived branch whose PR targets the branch *below* it (phase N+1 onto +phase N, phase 1 onto `main`); the stack is reviewed and merged bottom-up, each PR being retargeted to `main` as the one beneath it lands. This keeps work moving while earlier PRs sit in review, yet -- unlike a long-lived feature branch or a development fork -- every change @@ -652,22 +728,11 @@ are subtle. If isolation is ever genuinely wanted, prefer a branch in the upstream repository over a separate fork -- same isolation, far less CI and merge friction.) -**Live-path rework: branch by abstraction.** Stacked PRs govern *how* -phases are reviewed and merged; the communicator rework that replaces -`strata_comm` additionally needs care in *how the code is structured*, -because it changes the path every run exercises. Implement the new -fullcomm / Option-D path *alongside* the existing `strata_comm` path, -selected by a flag, defaulting to the **old** path. Land it with the old -path still default, so current users are unaffected; exercise the new -path opt-in; make it the default only once it is proven; remove the old -path afterward. This yields a **runtime fallback** a feature branch or -fork does not: if a cluster's MPI has buggy RMA, users can switch back to -the proven path. For an unstable dependency we do not control, a toggle -available in production is worth more than isolation during development. - -**Gate the default flip on an MPI CI matrix.** Exercise the new path on -at least two MPI implementations (e.g. OpenMPI and MPICH) and more than -one mpi4py / MPI version before it becomes the default. +**Gate reliance on the feature with an MPI CI matrix.** There is no +default to flip, but before the `fullcomm` path is advertised as +supported, exercise it on at least two MPI implementations (e.g. OpenMPI +and MPICH) and more than one mpi4py / MPI version, since that path is +where the RMA-portability risk lives. ### Possible future optimization (out of scope) @@ -684,15 +749,18 @@ work. ### Backward Compatibility -When all rank ratios are 1.0 (the default), the system must behave -identically to the current implementation. The overlap maps degenerate -to single-segment identity mappings, multi-source reads become -single-source reads, and the strict/relaxed coherence checks are -satisfied trivially (one source per field). Because this design makes -**no field-layout changes**, the on-the-wire format is unchanged at -equal ranks — there is no analogue of the first pass's "Phase 0 changes -the wire format even at equal ranks" caveat. Verify by running the -full existing test suite with the new code and default ratios. +When all rank ratios are 1.0 (the default), the system behaves +identically to the current implementation — not via a degenerate case of +the new machinery, but because the new machinery is **not reached at +all**. At equal ratios the existing per-`strata_comm` windows, +single-source reader, and `strata_rank` addressing run verbatim; the +overlap maps, `fullcomm` window, and multi-source reader are gated behind +a non-default ratio (see §Development and rollout strategy). Because +this design also makes **no field-layout changes**, the on-the-wire +format is unchanged at equal ranks — there is no analogue of the first +pass's "Phase 0 changes the wire format even at equal ranks" caveat. +Verify by running the full existing test suite with the new code and +default ratios. --- From b6f17fd564ab6fe8e62165eac60f71b2626a3803 Mon Sep 17 00:00:00 2001 From: Dave Woodruff Date: Sun, 24 May 2026 07:27:32 -0700 Subject: [PATCH 8/9] doc(flex-ranks): scope spin_the_wheel impact bullets to the unequal path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The divisibility check and uniform MPI_Comm_split are kept on the equal-rank path (the gate runs before _make_comms, which still enforces n_proc % n_spcomms == 0); only the unequal-rank path bypasses the check and uses apportionment. Reword the two §Impact bullets that read as an unconditional remove/replace, consistent with the gated-additive rollout. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/designs/flexible_rank_assignments.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/doc/designs/flexible_rank_assignments.md b/doc/designs/flexible_rank_assignments.md index 8d9a8deac..0777edbb9 100644 --- a/doc/designs/flexible_rank_assignments.md +++ b/doc/designs/flexible_rank_assignments.md @@ -516,9 +516,13 @@ unnecessary given the per-field analysis). #### `spin_the_wheel.py` -- Remove the `n_proc % n_spcomms == 0` check. -- Replace the uniform `MPI_Comm_split` with the apportionment - algorithm (§User Interface) that respects per-cylinder rank counts. +- Keep the `n_proc % n_spcomms == 0` check on the equal-rank path; the + unequal-rank path bypasses it, since apportionment (§User Interface) + removes the requirement that `n_proc` divide evenly by cylinder count. +- On the unequal-rank path, replace the uniform `MPI_Comm_split` with + the apportionment algorithm (§User Interface) that respects + per-cylinder rank counts. The equal-rank path keeps the uniform + split unchanged. - `cylinder_comm` creation still uses `MPI_Comm_split` (all ranks in the same cylinder get the same color). - `strata_comm` is **retained** for the equal-rank path. When ranks From d0983640861b11a78d263d1ccb909ef054657d62 Mon Sep 17 00:00:00 2001 From: Dave Woodruff Date: Sun, 24 May 2026 07:36:38 -0700 Subject: [PATCH 9/9] doc(flex-ranks): clarify strata_comm is not created in an unequal run "Retained" was ambiguous: it meant the strata_comm code path stays in the codebase for equal-rank runs, but read as if strata_comm coexists with the fullcomm window inside an unequal-rank run. It does not -- the strata grouping (color = global_rank // n_spcomms) is undefined when cylinders have different rank counts, which is the reason for Option D. State the per-run rule explicitly: an equal-rank run creates strata_comm and windows on it; an unequal-rank run does not create strata_comm and puts the window on fullcomm; cylinder_comm is created in both. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/designs/flexible_rank_assignments.md | 50 ++++++++++++++++-------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/doc/designs/flexible_rank_assignments.md b/doc/designs/flexible_rank_assignments.md index 0777edbb9..db0dd5d6b 100644 --- a/doc/designs/flexible_rank_assignments.md +++ b/doc/designs/flexible_rank_assignments.md @@ -373,15 +373,29 @@ window creation. Cons: Option D. **Recommendation:** Option D, *as the topology for the unequal-rank -path only*. When any rank ratio differs from 1.0, the window is on -`fullcomm` (or a subset) and each rank knows which global ranks to read -from via the overlap maps. `strata_comm` is **not** removed: the -equal-rank case keeps its existing per-`strata_comm` windows untouched, -and the `fullcomm` topology is built additively, reached only when the -feature is engaged (see §Development and rollout strategy). "Abandon -`strata_comm`" above describes Option D as a global replacement, which -this design does *not* adopt; it borrows only Option D's addressing -scheme for the new path. +path only*. A given run uses exactly one topology, chosen at startup +from the rank ratios: + +- *Equal-rank run* (all ratios 1.0): create `strata_comm` and + `cylinder_comm`, and put the window on `strata_comm` -- exactly as + today. +- *Unequal-rank run* (any ratio differs from 1.0): **`strata_comm` is + not created at all.** The strata grouping (`color = + global_rank // n_spcomms`, "rank *i* of every cylinder") is undefined + when cylinders have different rank counts -- that is the whole reason + for Option D. Create `cylinder_comm` only (each cylinder is a + contiguous block of apportioned ranks, so the intra-cylinder grouping + is still well-defined), put the window on `fullcomm`, and have each + rank address peers by global rank via the overlap maps. Each rank + takes its cylinder index from the apportionment rather than from + `strata_rank`. + +So `strata_comm` is *retained in the codebase* -- the equal-rank path +still uses it unchanged -- but it is *not created in an unequal-rank +run*. This is additive, not a replacement: the textbook "abandon +`strata_comm`" framing in the Option D description above would remove it +for *all* runs, which this design does not do. See §Development and +rollout strategy. #### Multi-Source Read Assembly @@ -525,15 +539,19 @@ unnecessary given the per-field analysis). split unchanged. - `cylinder_comm` creation still uses `MPI_Comm_split` (all ranks in the same cylinder get the same color). -- `strata_comm` is **retained** for the equal-rank path. When ranks - are equal across cylinders, `strata_comm` and its per-strata windows - are used exactly as today. When ratios differ, the window is built - on `fullcomm` instead (Option D's addressing), selected once at - startup; see §Development and rollout strategy. +- `strata_comm` is **retained in the codebase** for the equal-rank + path: when ranks are equal across cylinders, `strata_comm` and its + per-strata windows are created and used exactly as today. In an + unequal-rank run `strata_comm` is **not created** -- the strata + grouping is undefined for cylinders of different sizes -- and the + window is built on `fullcomm` instead (Option D's addressing). The + choice is made once at startup; a run creates one or the other, never + both. `cylinder_comm` is created in both cases. See §Development and + rollout strategy. - In the unequal-rank path, indexing that assumes a fixed `strata_rank` meaning does not apply; each rank addresses peers by global rank via - the overlap maps. The equal-rank path's `strata_rank` indexing is - unchanged. + the overlap maps, taking its cylinder index from the apportionment. + The equal-rank path's `strata_rank` indexing is unchanged. #### `spbase.py`