
Commit e465cfd

Use expand/repeat if-else for MPS backward compat in block_utils
The unconditional .repeat(L, 1) on line 431 allocated an O(L^2) tensor on every backend. .expand() is zero-copy but produces a non-contiguous view that torch.where on MPS cannot handle. Guard with an if-else so that CUDA/CPU get back the original zero-copy expand path and MPS keeps the contiguous repeat.

Also run ruff 0.8.3 format on files that had been formatted with a different ruff version.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 2e4f513 commit e465cfd
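As a rough standalone sketch of the pattern the commit message describes (the helper name and call site here are illustrative, not code from block_utils):

import torch

def row_index_grid(L: int, device: torch.device) -> torch.Tensor:
    # Build an (L, L) tensor whose every row is [0, 1, ..., L-1], intended as the
    # "value" argument of a later torch.where. Illustrative helper, not part of the repo.
    row = torch.arange(L, device=device).unsqueeze(0)  # (1, L)
    if device.type == "mps":
        # .repeat materializes a contiguous (L, L) copy; per the commit, MPS torch.where
        # mishandles non-contiguous inputs, so pay the O(L^2) allocation only there.
        return row.repeat(L, 1)
    # .expand broadcasts the single row to (L, L) as a zero-copy, non-contiguous view.
    return row.expand(L, L)

On CUDA and CPU the expand path never allocates the L x L buffer; only MPS pays for the copy.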

4 files changed

Lines changed: 27 additions & 16 deletions


models/rf3/src/rf3/model/layers/pairformer_layers.py

Lines changed: 4 additions & 4 deletions
@@ -202,7 +202,9 @@ def embed_features(C_L, D_LL, V_LL):
         A_I = scatter_mean(
             torch.zeros(A_I_shape, device=Q_L.device, dtype=Q_L.dtype),
             -2,
-            f["atom_to_token_map"].long(),  # [L], mapping from atom index to token index
+            f[
+                "atom_to_token_map"
+            ].long(),  # [L], mapping from atom index to token index
             processed_Q_L,  # (..., L, C_token)
         )  # (..., I, C_token)

@@ -261,9 +263,7 @@ def forward(
         B, L = B_IIH.shape[:2]

         if not self.use_deepspeed_evo or L <= 24:
-            Q_IH = Q_IH / torch.sqrt(
-                torch.tensor(self.c).to(Q_IH.device, Q_IH.dtype)
-            )
+            Q_IH = Q_IH / torch.sqrt(torch.tensor(self.c).to(Q_IH.device, Q_IH.dtype))
         # Attention
         A_IIH = torch.softmax(
             torch.einsum("...ihd,...jhd->...ijh", Q_IH, K_IH) + B_IIH, dim=-2
models/rfd3/src/rfd3/model/layers/block_utils.py

Lines changed: 15 additions & 8 deletions
@@ -173,10 +173,12 @@ def scatter_add_pair_features(P_LK_tgt, P_LK_indices, P_LA_src, P_LA_indices):
     elif not torch.all(matches.sum(dim=-1) <= 1):
         raise ValueError("Did not find a scatter index for every atom")
     k_indices = matches.long().argmax(dim=-1)  # (B, L, a)
-    scatter_indices = k_indices.unsqueeze(-1).expand(
-        -1, -1, -1, P_LK_tgt.shape[-1]
-    ).contiguous()  # (B, L, a, c)
-    P_LK_tgt = P_LK_tgt.scatter_add(dim=2, index=scatter_indices, src=P_LA_src.contiguous())
+    scatter_indices = (
+        k_indices.unsqueeze(-1).expand(-1, -1, -1, P_LK_tgt.shape[-1]).contiguous()
+    )  # (B, L, a, c)
+    P_LK_tgt = P_LK_tgt.scatter_add(
+        dim=2, index=scatter_indices, src=P_LA_src.contiguous()
+    )
     return P_LK_tgt


@@ -428,10 +430,15 @@ def extend_index_mask_with_neighbours(
     inf = torch.tensor(float("inf"), dtype=D_LL.dtype, device=device)

     # 1. Selection of sequence neighbours
-    # Use .repeat() instead of .expand() to produce a contiguous tensor — MPS does
-    # not handle non-contiguous inputs to torch.where correctly.
-    all_idx_row = torch.arange(L, device=device).unsqueeze(0).repeat(L, 1)
-    indices = torch.where(mask.contiguous(), all_idx_row, inf)  # sentinel inf if not-forced
+    # MPS does not handle non-contiguous inputs to torch.where correctly,
+    # so use .repeat() (allocates) there; .expand() (zero-copy view) elsewhere.
+    if device.type == "mps":
+        all_idx_row = torch.arange(L, device=device).unsqueeze(0).repeat(L, 1)
+    else:
+        all_idx_row = torch.arange(L, device=device).unsqueeze(0).expand(L, L)
+    indices = torch.where(
+        mask.contiguous(), all_idx_row, inf
+    )  # sentinel inf if not-forced
     indices = indices.sort(dim=1)[0][:, :k]  # (L, k)

     # 2. Find k-nn excluding forced indices
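For context, the sentinel-inf selection that this hunk feeds into torch.where can be sketched on its own. This is a toy reimplementation for illustration, not the repo's extend_index_mask_with_neighbours.

import torch

def first_k_true_indices(mask: torch.Tensor, k: int) -> torch.Tensor:
    # mask: (L, L) boolean. For each row, return the column indices of the first
    # k True entries; rows with fewer than k True entries are padded with inf.
    L = mask.shape[-1]
    device = mask.device
    inf = torch.tensor(float("inf"), device=device)
    cols = torch.arange(L, device=device, dtype=torch.float32).unsqueeze(0).expand(L, L)
    # Masked-out positions become the inf sentinel, so sorting each row pushes
    # them past the real indices and the first k survive the slice.
    candidates = torch.where(mask, cols, inf)
    return candidates.sort(dim=1)[0][:, :k]  # (L, k)

On MPS the expanded cols view would be swapped for a .repeat copy, exactly as the guarded code in the hunk above does.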

models/rfd3/src/rfd3/model/layers/pairformer_layers.py

Lines changed: 1 addition & 3 deletions
@@ -61,9 +61,7 @@ def forward(
         B, L = B_IIH.shape[:2]

         if not self.use_deepspeed_evo or L <= 24:
-            Q_IH = Q_IH / torch.sqrt(
-                torch.tensor(self.c).to(Q_IH.device, Q_IH.dtype)
-            )
+            Q_IH = Q_IH / torch.sqrt(torch.tensor(self.c).to(Q_IH.device, Q_IH.dtype))
         # Attention
         A_IIH = torch.softmax(
             torch.einsum("...ihd,...jhd->...ijh", Q_IH, K_IH) + B_IIH, dim=-2

src/foundry/utils/torch.py

Lines changed: 7 additions & 1 deletion
@@ -1,6 +1,12 @@
 """General convenience utilities for PyTorch."""

-__all__ = ["map_to", "assert_no_nans", "assert_shape", "assert_same_shape", "scatter_mean"]
+__all__ = [
+    "map_to",
+    "assert_no_nans",
+    "assert_shape",
+    "assert_same_shape",
+    "scatter_mean",
+]

 import time
 import warnings
