Commit 08daf77
Fix init_weights crash with aliased buffers (e.g. rope.cache/freqs_cis)
When a model registers the same tensor under multiple FQNs (e.g. `rope.cache` and `freqs_cis` in torchtitan's Decoder), PyTorch's `named_buffers()` deduplicates them by default, and this deduplication also happens during AOTAutograd tracing. As a result, the AutoParallelModule was missing the alias FQNs entirely.

This must be fixed in autoparallel (not deep in PyTorch) because autoparallel returns an nn.Module that the user should be able to use like their original model: all buffer FQNs from the original model must be present on the returned module.

The fix has two parts: (1) in api.py, capture buffer alias info before `move_to_fake` destroys aliasing, then re-register the aliases on the parallel module after sharding; (2) in init_weights.py, skip hooking buffer FQNs that don't exist on the parallel model (aliases that were deduplicated).

Authored with Claude.

stack-info: PR: #321, branch: xmfan/stack/24
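For readers unfamiliar with the deduplication behavior the message describes, a minimal sketch (toy module, names chosen to mirror the torchtitan pattern; not code from this PR):

```python
import torch
import torch.nn as nn

class RoPE(nn.Module):
    def __init__(self):
        super().__init__()
        self.register_buffer("cache", torch.zeros(4), persistent=False)

class Decoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.rope = RoPE()
        # Alias: the same tensor object registered under a second FQN.
        self.register_buffer("freqs_cis", self.rope.cache, persistent=False)

m = Decoder()
print([n for n, _ in m.named_buffers()])
# ['freqs_cis'] -- remove_duplicate=True (the default) drops the alias
print([n for n, _ in m.named_buffers(remove_duplicate=False)])
# ['freqs_cis', 'rope.cache'] -- both FQNs are visible
```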
1 parent b5a020d commit 08daf77

4 files changed: 132 additions & 1 deletion


CLAUDE.md

Lines changed: 31 additions & 0 deletions
@@ -72,3 +72,34 @@ python -m pytest tests/
 - Leverages DTensor for distributed tensor operations
 - Uses linear programming (PuLP) to solve sharding optimization problems
 - Includes fake tensor mode for shape inference without actual computation
+
+# Commit messages
+
+Don't commit unless the user explicitly asks you to.
+
+When writing a commit message, don't make a bullet list of the individual
+changes. Instead, if the PR is large, explain the order to review changes
+(e.g., the logical progression), or if it's short just omit the bullet list
+entirely.
+
+Disclose that the PR was authored with Claude.
+
+# Coding Style Guidelines
+
+Follow these rules for all code changes in this repository:
+
+- Minimize comments; be concise; code should be self-explanatory and self-documenting.
+- Comments should be useful, for example, comments that remind the reader about
+  some global context that is non-obvious and can't be inferred locally.
+- Don't make trivial (1-2 LOC) helper functions that are only used once unless
+  it significantly improves code readability.
+- Prefer clear abstractions. State management should be explicit.
+  For example, if managing state in a Python class: there should be a clear
+  class definition that has all of the members: don't dynamically `setattr`
+  a field on an object and then dynamically `getattr` the field on the object.
+- Match existing code style and architectural patterns.
+- Assume the reader has familiarity with PyTorch. They may not be the expert
+  on the code that is being read, but they should have some experience in the
+  area.
+
+If uncertain, choose the simpler, more concise implementation.
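A minimal illustration of the explicit-state rule above (hypothetical `Registry` class, not from this repository):

```python
class Registry:
    def __init__(self) -> None:
        # Every member is declared here; no field appears out of thin air.
        self.alias_map: dict[str, str] = {}

reg = Registry()
reg.alias_map["freqs_cis"] = "rope.cache"

# Discouraged: conjuring state dynamically -- a reader scanning the class
# definition would never learn that this field exists.
# setattr(reg, "extra_map", {})
```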

autoparallel/api.py

Lines changed: 27 additions & 0 deletions
@@ -251,6 +251,19 @@ def __init__(
         # in dtype casting and move_to_fake
         model = copy.deepcopy(model)
 
+        # Capture buffer alias info before move_to_fake breaks aliasing.
+        # named_buffers() deduplicates by default, so aliases are dropped.
+        # We record alias_fqn -> canonical_fqn so we can re-register them later.
+        self._buffer_alias_map: dict[str, str] = {}
+        canonical_by_id: dict[int, str] = {}
+        canonical_fqns: set[str] = set()
+        for fqn, buf in model.named_buffers():
+            canonical_by_id[id(buf)] = fqn
+            canonical_fqns.add(fqn)
+        for fqn, buf in model.named_buffers(remove_duplicate=False):
+            if fqn not in canonical_fqns and id(buf) in canonical_by_id:
+                self._buffer_alias_map[fqn] = canonical_by_id[id(buf)]
+
         # keep a separate copy of the fake orig model to customize for supporting init_weights
         self.init_weights_model = move_to_fake(
             copy.deepcopy(model), self.fake_mode, device
@@ -579,6 +592,20 @@ def _register_params_and_init_weights(
                attr_kind=_AttrKind.BUFFER,
            )
 
+        # Register aliased buffers that were deduplicated during tracing.
+        # e.g. if the original model has rope.cache and freqs_cis pointing to
+        # the same tensor, only one survives in sharded_buffer_dict. We register
+        # the missing alias so the parallel model mirrors the original structure.
+        for alias_fqn, canonical_fqn in self._buffer_alias_map.items():
+            if canonical_fqn in sharded_buffer_dict:
+                _assign_attr(
+                    self.parallel_model.get_buffer(canonical_fqn),
+                    self.parallel_model,
+                    self.model,
+                    alias_fqn,
+                    attr_kind=_AttrKind.BUFFER,
+                )
+
         # Right now we require a convention that the user model provides an init_weights method,
         # although we could snoop for other methods too.
         hook_params_setters(self.init_weights_model, self.parallel_model)
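The two-pass scan in the first hunk can also be read as a standalone helper; a sketch (the `buffer_alias_map` name and the toy modules are mine, not part of the diff):

```python
import torch
import torch.nn as nn

def buffer_alias_map(model: nn.Module) -> dict[str, str]:
    # Pass 1: canonical FQNs, as seen by the deduplicating default.
    canonical_by_id: dict[int, str] = {}
    canonical_fqns: set[str] = set()
    for fqn, buf in model.named_buffers():
        canonical_by_id[id(buf)] = fqn
        canonical_fqns.add(fqn)
    # Pass 2: any extra FQN pointing at a known tensor is an alias.
    aliases: dict[str, str] = {}
    for fqn, buf in model.named_buffers(remove_duplicate=False):
        if fqn not in canonical_fqns and id(buf) in canonical_by_id:
            aliases[fqn] = canonical_by_id[id(buf)]
    return aliases

class Sub(nn.Module):
    def __init__(self):
        super().__init__()
        self.register_buffer("cache", torch.zeros(4))

class Top(nn.Module):
    def __init__(self):
        super().__init__()
        self.sub = Sub()
        self.register_buffer("freqs_cis", self.sub.cache)

print(buffer_alias_map(Top()))  # {'sub.cache': 'freqs_cis'}
```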

autoparallel/init_weights.py

Lines changed: 9 additions & 1 deletion
@@ -122,6 +122,8 @@ def hook_params_setters(
     Also wraps init_weights_model.init_weights (if present) with a TorchDispatchMode
     to handle in-place data operations like ``self.weight.data[:] = value``.
     """
+    parallel_buffer_fqns = set(n for n, _ in parallel_model.named_buffers())
+
     for mod_name, mod in sorted(init_weights_model.named_modules()):
         params_dict = dict(mod.named_parameters(recurse=False))
         buffers_dict = dict(mod.named_buffers(recurse=False))
@@ -132,7 +134,13 @@ def hook_params_setters(
             namespace[p_name] = _build_param_property(parallel_model, fqn)
 
         for b_name in buffers_dict:
-            fqn = mod_name + "." + b_name
+            fqn = f"{mod_name}.{b_name}" if mod_name else b_name
+            # Skip buffers not present on the parallel model. This happens when
+            # the original model has aliased buffers (e.g. rope.cache and freqs_cis
+            # point to the same tensor): named_buffers() deduplicates them so only
+            # one FQN is registered on the parallel model.
+            if fqn not in parallel_buffer_fqns:
+                continue
             namespace[b_name] = _build_buffer_property(parallel_model, fqn)
 
         cls = mod.__class__
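The FQN-join change in the second hunk matters because `named_modules()` yields the root module under the empty name, so the old concatenation produced a leading dot for root-level buffers. A quick check of both spellings (toy buffer name is my choice):

```python
import torch
import torch.nn as nn

m = nn.Module()
m.register_buffer("freqs_cis", torch.zeros(4))

for mod_name, mod in m.named_modules():
    for b_name, _ in mod.named_buffers(recurse=False):
        old = mod_name + "." + b_name  # '.freqs_cis' for the root module
        new = f"{mod_name}.{b_name}" if mod_name else b_name  # 'freqs_cis'
        print(repr(old), repr(new))
```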

tests/test_api.py

Lines changed: 65 additions & 0 deletions
@@ -168,6 +168,71 @@ def input_fn():
     )
 
 
+def test_init_aliased_buffers(device_mesh_1d):
+    """Test that init_weights works when a submodule buffer aliases a top-level buffer.
+
+    This mirrors the torchtitan Decoder pattern where rope.cache and freqs_cis
+    are the same tensor. named_buffers(remove_duplicate=True) deduplicates them,
+    so only freqs_cis ends up on the parallel model. The init_weights hook must
+    still correctly propagate values set via the aliased buffer (rope.cache).
+    """
+    dim = 128
+
+    class RoPE(nn.Module):
+        def __init__(self, dim):
+            super().__init__()
+            self.register_buffer("cache", torch.zeros(dim), persistent=False)
+
+        def forward(self, x):
+            return x + self.cache
+
+        def init_weights(self):
+            self.cache = torch.arange(dim).float()
+
+    class Model(nn.Module):
+        def __init__(self, dim):
+            super().__init__()
+            self.linear = nn.Linear(dim, dim)
+            self.rope = RoPE(dim)
+            self.register_buffer("freqs_cis", self.rope.cache, persistent=False)
+
+        def forward(self, x):
+            return self.linear(x) + self.freqs_cis
+
+        def init_weights(self):
+            with torch.no_grad():
+                self.linear.weight.fill_(1.0)
+                self.linear.bias.fill_(0.0)
+            self.rope.init_weights()
+            self.freqs_cis = self.rope.cache
+
+    def input_fn():
+        b = 512
+        inputs = (torch.rand(b, dim, device="cuda"),)
+        return inputs
+
+    with torch.device("meta"):
+        model = Model(dim)
+
+    assert model.freqs_cis is model.rope.cache
+
+    with AutoParallel(
+        model,
+        input_fn,
+        device_mesh_1d,
+    ) as autop:
+        x_sharding = (Shard(0),)
+        autop.add_input_constraints([x_sharding])
+        sharding_placement = autop.optimize_placement()
+        parallel_mod = autop.apply_placement(sharding_placement)
+
+    parallel_mod.to_empty(device="cuda")
+    parallel_mod.init_weights()
+
+    expected = torch.arange(dim).float().cuda()
+    assert torch.equal(parallel_mod.get_buffer("freqs_cis").full_tensor(), expected)
+
+
 def test_fx_graph_annotate(device_mesh_1d):
     dim = 128
 
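As a follow-up to the test, the commit message's invariant (every buffer FQN of the original model must be present on the returned module) could be asserted directly; a sketch reusing `model` and `parallel_mod` from the test above, not part of the PR:

```python
for fqn, _ in model.named_buffers(remove_duplicate=False):
    parallel_mod.get_buffer(fqn)  # raises AttributeError if the alias is missing
```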
