19 changes: 10 additions & 9 deletions monai/networks/blocks/transformerblock.py
@@ -80,15 +80,16 @@ def __init__(
         self.norm2 = nn.LayerNorm(hidden_size)
         self.with_cross_attention = with_cross_attention

-        self.norm_cross_attn = nn.LayerNorm(hidden_size)
-        self.cross_attn = CrossAttentionBlock(
-            hidden_size=hidden_size,
-            num_heads=num_heads,
-            dropout_rate=dropout_rate,
-            qkv_bias=qkv_bias,
-            causal=False,
-            use_flash_attention=use_flash_attention,
-        )
+        if with_cross_attention:
+            self.norm_cross_attn = nn.LayerNorm(hidden_size)
+            self.cross_attn = CrossAttentionBlock(
+                hidden_size=hidden_size,
+                num_heads=num_heads,
+                dropout_rate=dropout_rate,
+                qkv_bias=qkv_bias,
+                causal=False,
+                use_flash_attention=use_flash_attention,
+            )
Comment on lines +83 to +92
Contributor
🛠️ Refactor suggestion

Preserve legacy checkpoints when disabling cross-attention.
Old checkpoints created with with_cross_attention=False still contain cross_attn.* and norm_cross_attn.* entries. Now that these submodules are no longer registered, load_state_dict(..., strict=True) raises unexpected-key errors, so this is a breaking change. Please strip those keys during load (or otherwise ensure they are ignored) if the modules are dropped at init.
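The failure mode is easy to reproduce with a pair of toy modules (a sketch only — `OldBlock`, `NewBlock`, and the `nn.Linear` stand-in for `CrossAttentionBlock` are illustrative names, not MONAI code):

```python
import torch.nn as nn

class OldBlock(nn.Module):
    """Mimics the previous __init__: cross-attention submodules created unconditionally."""
    def __init__(self):
        super().__init__()
        self.norm2 = nn.LayerNorm(4)
        self.norm_cross_attn = nn.LayerNorm(4)
        self.cross_attn = nn.Linear(4, 4)  # stand-in for CrossAttentionBlock

class NewBlock(nn.Module):
    """Mimics the new __init__ with with_cross_attention=False: submodules absent."""
    def __init__(self):
        super().__init__()
        self.norm2 = nn.LayerNorm(4)

legacy_sd = OldBlock().state_dict()  # contains cross_attn.* and norm_cross_attn.* keys

try:
    NewBlock().load_state_dict(legacy_sd, strict=True)
except RuntimeError as err:
    # PyTorch reports the orphaned keys, e.g.
    # 'Unexpected key(s) in state_dict: "norm_cross_attn.weight", ...'
    print(type(err).__name__)
```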

@@
         return x
+
+    def _load_from_state_dict(
+        self,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ):
+        if not self.with_cross_attention:
+            keys = [
+                key
+                for key in list(state_dict.keys())
+                if key.startswith(f"{prefix}cross_attn.") or key.startswith(f"{prefix}norm_cross_attn.")
+            ]
+            for key in keys:
+                state_dict.pop(key)
+        super()._load_from_state_dict(
+            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+        )
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
        if with_cross_attention:
            self.norm_cross_attn = nn.LayerNorm(hidden_size)
            self.cross_attn = CrossAttentionBlock(
                hidden_size=hidden_size,
                num_heads=num_heads,
                dropout_rate=dropout_rate,
                qkv_bias=qkv_bias,
                causal=False,
                use_flash_attention=use_flash_attention,
            )
        return x

    def _load_from_state_dict(
        self,
        state_dict,
        prefix,
        local_metadata,
        strict,
        missing_keys,
        unexpected_keys,
        error_msgs,
    ):
        # Strip out any cross-attention params when with_cross_attention=False
        if not self.with_cross_attention:
            keys = [
                key
                for key in list(state_dict.keys())
                if key.startswith(f"{prefix}cross_attn.") or key.startswith(f"{prefix}norm_cross_attn.")
            ]
            for key in keys:
                state_dict.pop(key)
        # Delegate actual loading to the parent implementation
        super()._load_from_state_dict(
            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
        )
🤖 Prompt for AI Agents
In monai/networks/blocks/transformerblock.py around lines 83-92, legacy
checkpoints include parameters under cross_attn.* and norm_cross_attn.* even
when with_cross_attention is False, causing unexpected-key errors; update the
class to either preserve dummy attributes or strip those keys when loading.
Implement a small fix: if you choose to drop the modules at init (keep them
absent), override load_state_dict to detect when with_cross_attention is False
and remove any keys that start with "cross_attn." or "norm_cross_attn." from the
incoming state_dict (also handle optimizer/state dict nested structures if
applicable) before delegating to the parent load_state_dict; alternatively, when
with_cross_attention is False, assign lightweight placeholders (e.g.,
nn.Identity or empty submodules) for self.cross_attn and self.norm_cross_attn so
the parameter names remain present and strict loading succeeds.
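Of the two options, the key-stripping one can be exercised end to end in a standalone sketch (again with toy names — `Block` stands in for `TransformerBlock`, `nn.Linear` for `CrossAttentionBlock`):

```python
import torch.nn as nn

class Block(nn.Module):
    """Toy block carrying the suggested _load_from_state_dict override."""
    def __init__(self, with_cross_attention: bool = False):
        super().__init__()
        self.with_cross_attention = with_cross_attention
        self.norm2 = nn.LayerNorm(4)
        if with_cross_attention:
            self.norm_cross_attn = nn.LayerNorm(4)
            self.cross_attn = nn.Linear(4, 4)  # stand-in for CrossAttentionBlock

    def _load_from_state_dict(
        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
    ):
        # Drop legacy cross-attention entries before the parent flags them as unexpected.
        if not self.with_cross_attention:
            for key in [
                k for k in list(state_dict)
                if k.startswith(f"{prefix}cross_attn.") or k.startswith(f"{prefix}norm_cross_attn.")
            ]:
                state_dict.pop(key)
        super()._load_from_state_dict(
            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
        )

legacy_sd = Block(with_cross_attention=True).state_dict()  # plays the role of an old checkpoint
result = Block(with_cross_attention=False).load_state_dict(legacy_sd, strict=True)
print(result.unexpected_keys)  # → []
```

Note that recent PyTorch versions shallow-copy the incoming mapping inside load_state_dict, so popping keys in the hook should not mutate the caller's checkpoint dict; that behavior is worth confirming against the minimum PyTorch version MONAI supports.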


    def forward(
        self, x: torch.Tensor, context: Optional[torch.Tensor] = None, attn_mask: Optional[torch.Tensor] = None