Skip to content

Commit 36df9ff

Browse files
wz337 and meta-codesync[bot]
authored and committed
sync D77694359 to OSS
Summary: sync D77694359 to OSS. Reviewed By: jialun-zhang. Differential Revision: D86571324. fbshipit-source-id: a81e8868a85f275ecf917f837e659e21e5e6b11a
1 parent f669cbf commit 36df9ff

6 files changed

Lines changed: 251 additions & 27 deletions

File tree

distributed_shampoo/distributed_shampoo.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -549,8 +549,13 @@ def _instantiate_distributor(self) -> None:
549549
param_assignment_strategy=FSDPParamAssignmentStrategy.DEFAULT
550550
):
551551
distributor_cls = FullyShardDistributor
552-
case FullyShardDistributedConfig(
553-
param_assignment_strategy=FSDPParamAssignmentStrategy.REPLICATE
552+
case (
553+
FullyShardDistributedConfig(
554+
param_assignment_strategy=FSDPParamAssignmentStrategy.REPLICATE
555+
)
556+
| FullyShardDistributedConfig(
557+
param_assignment_strategy=FSDPParamAssignmentStrategy.ROUND_ROBIN
558+
)
554559
):
555560
distributor_cls = FullyShardLosslessDistributor
556561
case _:

distributed_shampoo/distributor/_shampoo_fully_shard_lossless_distributor.py

Lines changed: 61 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,10 @@
2121
FullyShardDistributedConfig,
2222
PARAMS,
2323
)
24+
from distributed_shampoo.utils.shampoo_utils import (
25+
prepare_update_param_buffers,
26+
redistribute_and_update_params,
27+
)
2428
from torch import distributed as dist, Tensor
2529

2630
logger: logging.Logger = logging.getLogger(__name__)
@@ -50,14 +54,41 @@ def __init__(self, param_group: dict[str, Any]) -> None:
5054
logger.info(
5155
f"Shampoo FullyShardLosslessDistributor {self._param_assignment_strategy=}",
5256
)
53-
# Stores full parameters (as opposed to DTensors) for the model parameters assigned to this rank.
54-
# For example, when the strategy is REPLICATE, it stores the full parameters on all ranks.
57+
58+
self._group_size: int = dist.get_world_size()
59+
self._dist_group: dist.ProcessGroup = dist.new_subgroups(
60+
group_size=self._group_size
61+
)[0]
62+
self._group_rank: int = dist.get_rank(group=self._dist_group)
63+
64+
should_assign_param_idx = (
65+
lambda i: i % self._group_size == self._group_rank
66+
if self._param_assignment_strategy
67+
== FSDPParamAssignmentStrategy.ROUND_ROBIN
68+
else True
69+
)
70+
self._assigned_params_mask: tuple[bool, ...] = tuple(
71+
should_assign_param_idx(idx) for idx in range(len(param_group[PARAMS]))
72+
)
73+
74+
# Collects and stores the model parameters assigned to this rank.
5575
# Note that we explicitly disable the unnecessary gradient tracking for the all-gather collectives
5676
# used to initialize the full parameters.
5777
with torch.no_grad():
58-
self._assigned_full_params: tuple[torch.Tensor, ...] = tuple(
59-
p.full_tensor() for p in param_group[PARAMS]
60-
)
78+
full_params: list[Tensor] = [p.full_tensor() for p in param_group[PARAMS]]
79+
self._assigned_full_params: list[Tensor] = [
80+
p
81+
for p, assigned in zip(full_params, self._assigned_params_mask)
82+
if assigned
83+
]
84+
85+
# For ROUND_ROBIN strategy, creates a buffer for receiving the updated param shards.
86+
self._update_param_buffers: list[Tensor] | None = (
87+
prepare_update_param_buffers(param_group[PARAMS], self._group_size)
88+
if self._param_assignment_strategy
89+
== FSDPParamAssignmentStrategy.ROUND_ROBIN
90+
else None
91+
)
6192

6293
super().__init__(param_group)
6394

@@ -86,11 +117,18 @@ def _get_params_or_grads(self, get_grad: bool = False) -> Iterable[Tensor | None
86117
if get_grad:
87118
# Getting grads at every optimizer step triggers implicit all-gather. Note that p.numel()
88119
# returns total number of elements in the tensor (as opposed to local shard of DTensor).
89-
return (
120+
full_grads = (
90121
None if p.grad is None else p.grad.full_tensor()
91122
for p in self._param_group[PARAMS]
92-
if p.numel() > 0
93123
)
124+
return (
125+
full_grad
126+
for full_grad, assigned in zip(
127+
full_grads, self._assigned_params_mask, strict=True
128+
)
129+
if assigned and (full_grad is None or full_grad.numel() > 0)
130+
)
131+
94132
else:
95133
return filter(
96134
lambda p: isinstance(p, Tensor) and p.numel() > 0,
@@ -115,7 +153,15 @@ def update_params(
115153
# For example, when the strategy is REPLICATE, we need to take each updated full parameter `full_param`,
116154
# redistribute it according to the device mesh to get the locally assigned slice, and copy the slice to the
117155
# corresponding local parameter `local_param` in the param group.
118-
if self._param_assignment_strategy == FSDPParamAssignmentStrategy.REPLICATE:
156+
if self._param_assignment_strategy == FSDPParamAssignmentStrategy.ROUND_ROBIN:
157+
redistribute_and_update_params(
158+
self._param_group[PARAMS],
159+
self._assigned_full_params,
160+
self._update_param_buffers, # type: ignore
161+
self._dist_group,
162+
)
163+
164+
elif self._param_assignment_strategy == FSDPParamAssignmentStrategy.REPLICATE:
119165
local_params = list(
120166
filter(lambda p: p.numel() > 0, self._param_group[PARAMS])
121167
)
@@ -143,5 +189,11 @@ def update_params(
143189
def _construct_local_block_info_list(self) -> tuple[BlockInfo, ...]:
144190
"""Construct local block info list from param_group."""
145191
return self._construct_local_block_info_list_with_params(
146-
params=filter(lambda p: p.numel() > 0, self._param_group[PARAMS])
192+
params=(
193+
p
194+
for assigned, p in zip(
195+
self._assigned_params_mask, self._param_group[PARAMS], strict=True
196+
)
197+
if assigned and p.numel() > 0
198+
),
147199
)

distributed_shampoo/distributor/gpu_tests/shampoo_fully_shard_lossless_distributor_test.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -127,12 +127,20 @@ def _shampoo_optim_factory(
127127
@with_comms
128128
@skip_if_lt_x_gpu(2)
129129
@parametrize("model_linear_layers_dims", TEST_MODEL_LAYER_DIMS)
130+
@parametrize(
131+
"param_assignment_strategy",
132+
(
133+
FSDPParamAssignmentStrategy.REPLICATE,
134+
FSDPParamAssignmentStrategy.ROUND_ROBIN,
135+
),
136+
)
130137
def test_all_ranks_with_no_grads(
131138
self,
132139
model_linear_layers_dims: tuple[int, ...],
140+
param_assignment_strategy: FSDPParamAssignmentStrategy,
133141
) -> None:
134142
fully_shard_config = FullyShardDistributedConfig(
135-
param_assignment_strategy=FSDPParamAssignmentStrategy.REPLICATE
143+
param_assignment_strategy=param_assignment_strategy
136144
)
137145

138146
steps_without_gradients = 2
@@ -156,13 +164,21 @@ def test_all_ranks_with_no_grads(
156164

157165
@with_comms
158166
@skip_if_lt_x_gpu(2)
167+
@parametrize(
168+
"param_assignment_strategy",
169+
(
170+
FSDPParamAssignmentStrategy.REPLICATE,
171+
FSDPParamAssignmentStrategy.ROUND_ROBIN,
172+
),
173+
)
159174
@parametrize("model_linear_layers_dims", TEST_MODEL_LAYER_DIMS)
160175
def test_fully_shard_shampoo_against_default_shampoo(
161176
self,
177+
param_assignment_strategy: FSDPParamAssignmentStrategy,
162178
model_linear_layers_dims: tuple[int, ...],
163179
) -> None:
164180
fully_shard_config = FullyShardDistributedConfig(
165-
param_assignment_strategy=FSDPParamAssignmentStrategy.REPLICATE
181+
param_assignment_strategy=param_assignment_strategy
166182
)
167183
control_model_factory = partial(
168184
ShampooFullyShardLosslessDistributorTest._construct_model,

distributed_shampoo/tests/distributed_shampoo_test.py

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,6 @@
3636
DistributedConfig,
3737
EigendecomposedShampooPreconditionerConfig,
3838
EigenvalueCorrectedShampooPreconditionerConfig,
39-
FSDPParamAssignmentStrategy,
40-
FullyShardDistributedConfig,
4139
PreconditionerConfig,
4240
RootInvShampooPreconditionerConfig,
4341
ShampooPT2CompileConfig,
@@ -259,16 +257,6 @@ class NotSupportedDistributedConfig(DistributedConfig):
259257
distributed_config=NotSupportedDistributedConfig(),
260258
)
261259

262-
self.assertRaisesRegex(
263-
NotImplementedError,
264-
r"group\[DISTRIBUTED_CONFIG\]=.*FullyShardDistributedConfig\(.*ROUND_ROBIN.*\) not supported!",
265-
DistributedShampoo,
266-
params=self._model.parameters(),
267-
distributed_config=FullyShardDistributedConfig(
268-
param_assignment_strategy=FSDPParamAssignmentStrategy.ROUND_ROBIN
269-
),
270-
)
271-
272260

273261
class DistributedShampooTest(unittest.TestCase):
274262
def setUp(self) -> None:
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
"""
2+
Copyright (c) Meta Platforms, Inc. and affiliates.
3+
All rights reserved.
4+
5+
This source code is licensed under the BSD-style license found in the
6+
LICENSE file in the root directory of this source tree.
7+
8+
"""
9+
10+
#!/usr/bin/env python3
11+
12+
import unittest
13+
14+
import numpy as np
15+
import torch
16+
from distributed_shampoo.utils.shampoo_utils import (
17+
prepare_update_param_buffers,
18+
redistribute_and_update_params,
19+
)
20+
from torch import distributed as dist
21+
from torch.distributed.device_mesh import init_device_mesh
22+
from torch.distributed.tensor import distribute_tensor, Shard
23+
from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
24+
from torch.testing._internal.common_utils import (
25+
instantiate_parametrized_tests,
26+
parametrize,
27+
)
28+
from torch.testing._internal.distributed._tensor.common_dtensor import (
29+
DTensorTestBase,
30+
with_comms,
31+
)
32+
33+
34+
def generate_param_shapes(num_params: int) -> list[tuple[int, ...]]:
35+
"""Generate parameter shapes for testing.
36+
37+
For N parameters, we generate the following shapes:
38+
[(1, 2), (2, 3), (3, 4), ..., (N, N + 1)].
39+
"""
40+
return [(i, i + 1) for i in range(1, num_params + 1)]
41+
42+
43+
@unittest.skipIf(not torch.cuda.is_available(), "Skip when CUDA is not available")
44+
@instantiate_parametrized_tests
45+
class RedistributeAndUpdateParamsTest(DTensorTestBase):
46+
@property
47+
def world_size(self) -> int:
48+
return 4
49+
50+
@with_comms
51+
@skip_if_lt_x_gpu(4)
52+
@parametrize("num_params", (1, 4, 7))
53+
def test_redistribute_and_update_params(self, num_params: int) -> None:
54+
device_mesh = init_device_mesh("cuda", (4,))
55+
shapes = generate_param_shapes(num_params)
56+
params = [torch.zeros(s, device="cuda") for s in shapes]
57+
dtensor_params = tuple(
58+
distribute_tensor(t, device_mesh, [Shard(0)]) for t in params
59+
)
60+
61+
update_buffers = prepare_update_param_buffers(dtensor_params, self.world_size)
62+
self.assertEqual(
63+
len(update_buffers),
64+
int(np.ceil(num_params / self.world_size) * self.world_size),
65+
)
66+
for i, buffer in enumerate(update_buffers):
67+
if i < num_params:
68+
self.assertEqual(buffer.numel(), dtensor_params[i].to_local().numel())
69+
else:
70+
self.assertEqual(buffer.numel(), 0)
71+
72+
rank = dist.get_rank()
73+
dist_group = dist.distributed_c10d._get_default_group()
74+
# Fill the locally assigned parameters with the rank as value.
75+
local_full_params = [
76+
torch.zeros(s, device="cuda").fill_(rank)
77+
for i, s in enumerate(shapes)
78+
if i % self.world_size == rank
79+
]
80+
redistribute_and_update_params(
81+
dtensor_params, local_full_params, update_buffers, dist_group
82+
)
83+
for i, param in enumerate(dtensor_params):
84+
np.testing.assert_allclose(
85+
param.to_local().cpu().numpy(), i % self.world_size
86+
)

distributed_shampoo/utils/shampoo_utils.py

Lines changed: 79 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,13 @@
1616
from types import TracebackType
1717
from typing import Any, TypeVar
1818

19+
import numpy as np
20+
1921
import torch
2022
from distributed_shampoo.shampoo_types import LoadBalancingConfig
2123
from distributed_shampoo.utils.load_balancing_utils import AlignedMemoryCostModel
22-
from torch import Tensor
24+
from torch import distributed as dist, Tensor
25+
from torch.distributed.tensor import DTensor
2326

2427

2528
@cache
@@ -329,4 +332,78 @@ def distribute_buffer_sizes(
329332

330333
buffer_size_ranks = tuple(zip(buffer_sizes_aligned, param_block_ranks, strict=True))
331334

332-
return buffer_size_ranks
335+
return tuple(buffer_size_ranks)
336+
337+
338+
def prepare_update_param_buffers(
339+
params: tuple[DTensor, ...], group_size: int
340+
) -> list[Tensor]:
341+
"""Allocates a persistent shadow copy of updated parameters."""
342+
if any(p.dtype != params[0].dtype for p in params):
343+
raise NotImplementedError(
344+
"When using round-robin assignment in FSDP Shampoo, parameters of "
345+
"different dtypes are not currently supported."
346+
)
347+
348+
param_sizes = [p.to_local().numel() for p in params]
349+
buffer_size = sum(param_sizes)
350+
buffer = params[0].to_local().new_zeros(buffer_size)
351+
buffer_offsets = np.cumsum(param_sizes).tolist()
352+
353+
def round_up_to_multiple_of(x: int, y: int) -> int:
354+
return ((x + y - 1) // y) * y
355+
356+
pad_len = round_up_to_multiple_of(len(buffer_offsets), group_size) - len(
357+
buffer_offsets
358+
)
359+
# Pad the list with empty tensors to ensure each rank participates in all-to-all.
360+
buffer_offsets.extend([buffer_size] * pad_len)
361+
# Drop the last element as torch.tensor_split takes indices as split points.
362+
buffer_offsets = buffer_offsets[:-1]
363+
364+
return list(torch.tensor_split(buffer, buffer_offsets))
365+
366+
367+
def redistribute_and_update_params(
368+
params: tuple[DTensor, ...],
369+
local_full_params: list[Tensor],
370+
update_param_buffers: list[Tensor],
371+
dist_group: torch.distributed.ProcessGroup,
372+
) -> None:
373+
"""Redistributes updated parameters to each parameter's rank."""
374+
group_size = dist_group.size()
375+
376+
# Run all-to-all collectives to exchange the updated parameters across
377+
# ranks in group. This implementation runs multiple rounds of a2a ops
378+
# if the number of parameters is larger than the world size.
379+
for a2a_round in range(len(update_param_buffers) // group_size):
380+
# Send either a valid full parameter, or a padding zero tensor.
381+
send_param = (
382+
local_full_params[a2a_round]
383+
if a2a_round < len(local_full_params)
384+
else params[0].to_local().new_zeros(0)
385+
)
386+
# Chunk the send_param to exactly group_size slices to distribute to
387+
# all ranks. We need to manually pad the result of torch.chunk since
388+
# it does not guarantee that the result has the desired chunks.
389+
send_list = [t.flatten() for t in torch.chunk(send_param, group_size, dim=0)]
390+
if len(send_list) < group_size:
391+
# NOTE: Intentionally use `torch.tensor_split` here to do a trivial
392+
# split to ensure that the padding is in contiguous memory space as
393+
# is required for all-to-all collectives.
394+
append_len = group_size - len(send_list)
395+
last_t = send_list[-1]
396+
split_indices = [send_list[-1].shape[0]] * append_len
397+
send_list.extend(torch.tensor_split(last_t, split_indices, dim=0)[1:])
398+
assert len(send_list) == group_size
399+
400+
# Specify receive list as a range of update_param_buffers.
401+
recv_list = update_param_buffers[
402+
a2a_round * group_size : (a2a_round + 1) * group_size
403+
]
404+
405+
dist.all_to_all(recv_list, send_list, dist_group)
406+
407+
torch._foreach_copy_(
408+
[p.to_local().flatten() for p in params], update_param_buffers[: len(params)]
409+
)

0 commit comments

Comments (0)