Skip to content

Commit c68ec31

Browse files
authored
Add fast_set_attr to modules not inheriting from base.py (NVIDIA#2724)
Fix fast_set_attr in other nn modules for FSDP. Signed-off-by: Varun Thumbe <vthumbe@nvidia.com>
1 parent 9dac78e commit c68ec31

5 files changed

Lines changed: 24 additions & 4 deletions

File tree

transformer_engine/pytorch/attention/dot_product_attention/backends.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,10 @@ def mask_func(x, y):
293293
bool(int(os.getenv("NVTE_APPLY_QK_LAYER_SCALING", "0"))) and layer_number is not None
294294
)
295295

296+
def fast_setattr(self, name: str, value: Any) -> None:
297+
"""Fast attribute set for non-parameter fields."""
298+
self.__dict__[name] = value
299+
296300
def forward(
297301
self,
298302
_alibi_cache: Dict[str, Any],

transformer_engine/pytorch/attention/multi_head_attention.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
"""Multi-head Attention."""
66
import os
77
import collections
8-
from typing import Callable, List, Optional, Tuple, Union
8+
from typing import Any, Callable, List, Optional, Tuple, Union
99
import torch
1010

1111
from transformer_engine.pytorch.quantization import FP8GlobalStateManager
@@ -478,6 +478,10 @@ def __init__(
478478
**common_gemm_kwargs,
479479
)
480480

481+
def fast_setattr(self, name: str, value: Any) -> None:
482+
"""Fast attribute set for non-parameter fields."""
483+
self.__dict__[name] = value
484+
481485
def _create_qk_norm_modules(
482486
self,
483487
qk_norm_type: Optional[str],

transformer_engine/pytorch/module/layernorm.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
"""LayerNorm API"""
66
import warnings
7-
from typing import Iterable, Optional, Union
7+
from typing import Any, Iterable, Optional, Union
88

99
import torch
1010

@@ -102,6 +102,10 @@ def __init__(
102102
**kwargs,
103103
)
104104

105+
def fast_setattr(self, name: str, value: Any) -> None:
106+
"""Fast attribute set for non-parameter fields."""
107+
self.__dict__[name] = value
108+
105109
def reset_layer_norm_parameters(self) -> None:
106110
"""Init LN params"""
107111
warnings.warn(

transformer_engine/pytorch/module/rmsnorm.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
"""RMSNorm API"""
66
import warnings
7-
from typing import Iterable, Optional, Union
7+
from typing import Any, Iterable, Optional, Union
88

99
import torch
1010

@@ -106,6 +106,10 @@ def __init__(
106106
**kwargs,
107107
)
108108

109+
def fast_setattr(self, name: str, value: Any) -> None:
110+
"""Fast attribute set for non-parameter fields."""
111+
self.__dict__[name] = value
112+
109113
def reset_rms_norm_parameters(self) -> None:
110114
"""Deprecated"""
111115
warnings.warn(

transformer_engine/pytorch/transformer.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import os
77
import warnings
88
from contextlib import nullcontext
9-
from typing import Callable, List, Optional, Tuple, Union
9+
from typing import Any, Callable, List, Optional, Tuple, Union
1010

1111
import torch
1212

@@ -545,6 +545,10 @@ def __init__(
545545
device=device,
546546
)
547547

548+
def fast_setattr(self, name: str, value: Any) -> None:
549+
"""Fast attribute set for non-parameter fields."""
550+
self.__dict__[name] = value
551+
548552
def set_tensor_parallel_group(self, tp_group: Union[dist_group_type, None]) -> None:
549553
"""
550554
Set the tensor parallel group for the given

0 commit comments

Comments (0)