Commit 4d89e04 (parent 57b5b60)

Add fused_adam, quantized_model_init, and fsdp2 example

Signed-off-by: Peter St. John <pstjohn@nvidia.com>

6 files changed: 841 additions, 6 deletions

README (47 additions, 0 deletions)
# Quantized Model Initialization Examples

## `main.py` -- Single-GPU with Gradient Accumulation Fusion

Trains a single `TransformerLayer` on synthetic data while combining three
Transformer Engine optimizations:

* **`quantized_model_init`** -- model parameters are stored in FP8, saving
  memory by not keeping a high-precision shadow copy.
* **`FusedAdam` with master weights** -- the optimizer maintains FP32 master
  copies of the weights for stable training updates.
* **Gradient accumulation fusion** -- weight gradients are accumulated directly
  in FP32 via Tensor Cores (`fuse_wgrad_accumulation=True` + `main_grad`); a
  condensed sketch follows the command below.

```bash
python main.py
```
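
A minimal sketch of how these three pieces fit together, condensed from
`main.py` (the `te.Linear` layer here is an illustrative stand-in for the
`TransformerLayer` the script actually builds):

```python
import torch
import transformer_engine.pytorch as te

# FP8 parameters only -- no high-precision shadow copy is kept.
with te.quantized_model_init(enabled=True):
    layer = te.Linear(256, 256, fuse_wgrad_accumulation=True, params_dtype=torch.bfloat16)

# Each parameter needs an FP32 main_grad buffer for the fused wgrad GEMM
# to accumulate into during backward.
for param in layer.parameters():
    param.main_grad = torch.zeros(param.shape, dtype=torch.float32, device=param.device)

# With use_decoupled_grad=True, FusedAdam reads param.decoupled_grad
# instead of param.grad, so point it at the FP32 buffer before step().
optimizer = te.optimizers.FusedAdam(
    layer.parameters(), lr=1e-3, master_weights=True, use_decoupled_grad=True
)
for param in layer.parameters():
    param.decoupled_grad = param.main_grad
```
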
## `fully_shard.py` -- Multi-GPU with FSDP2

Extends the single-GPU example to multi-GPU training using PyTorch-native FSDP2
(`fully_shard`). Demonstrates:

* **`quantized_model_init`** -- same FP8 weight initialization as `main.py`.
* **`fully_shard`** -- PyTorch FSDP2 sharding of each `TransformerLayer`.
* **`save_custom_attrs` / `restore_custom_attrs`** -- preserve custom
  Python-level attributes on `QuantizedTensor` parameters that FSDP2's
  `DTensor` wrapping would otherwise drop (sketched after the command below).
* **`FusedAdam` with master weights** -- FP32 master copies maintained by the
  optimizer, with DTensor-aware state initialization.

```bash
torchrun --nproc-per-node 2 fully_shard.py
```
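
The attribute round-trip around sharding, condensed from `fully_shard.py`
(`save_custom_attrs` / `restore_custom_attrs` are the helpers defined in that
script, and `model` / `mesh` are built as shown there):

```python
# Snapshot Python-level attrs before fully_shard swaps params for DTensors.
attrs = save_custom_attrs(model)

# Shard each TransformerLayer individually, then the root module.
for child in model.children():
    fully_shard(child, mesh=mesh)
fully_shard(model, mesh=mesh)

# Re-attach the saved attrs to the new DTensor parameters.
restore_custom_attrs(model, attrs)
```
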
### Why `fuse_wgrad_accumulation` is not used with FSDP2

`fuse_wgrad_accumulation` writes weight gradients directly into a `main_grad`
buffer during the wgrad GEMM and returns `None` to autograd. This bypasses
FSDP2's reduce-scatter, leaving each rank with an unreduced gradient. Correct
distributed training requires the per-rank gradients to be reduced across ranks,
which FSDP2 handles automatically -- but only for gradients that flow through
autograd.

Megatron-Core's FSDP integration solves this by providing `get_main_grad()`,
which returns a buffer wired into its own reduce-scatter machinery. Vanilla
PyTorch FSDP2 does not yet expose an equivalent hook.
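
To make the failure mode concrete, here is a toy autograd function
(illustrative only, not Transformer Engine's actual implementation) that mimics
the fused-wgrad pattern. It assumes `weight.main_grad` has already been
allocated; the weight gradient is written there out-of-band while `None` is
returned to autograd, so no gradient ever reaches the weight through autograd
and FSDP2's post-backward reduce-scatter has nothing to reduce:

```python
import torch

class FusedWgradMatmul(torch.autograd.Function):
    """Toy fused-wgrad: dW bypasses autograd and lands in weight.main_grad."""

    @staticmethod
    def forward(ctx, x, weight):
        ctx.save_for_backward(x, weight)
        return x @ weight.t()

    @staticmethod
    def backward(ctx, grad_out):
        x, weight = ctx.saved_tensors
        # Accumulate the weight gradient out-of-band, in FP32 ...
        weight.main_grad += (grad_out.t() @ x).float()
        # ... and report no weight gradient to autograd. Hooks attached to
        # the weight's gradient path (e.g. FSDP2's reduce-scatter) never
        # fire, so main_grad stays a purely local, unreduced gradient.
        return grad_out @ weight, None
```
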
fully_shard.py (192 additions, 0 deletions)
```python
# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.

"""FSDP2 distributed training with quantized model initialization.

Extends the single-GPU ``main.py`` example to multi-GPU training using
PyTorch-native FSDP2 (``fully_shard``). The script demonstrates:

1. ``quantized_model_init`` -- FP8 weight initialization (same as main.py).
2. ``fully_shard`` -- PyTorch FSDP2 sharding of each TransformerLayer.
3. ``save_custom_attrs`` / ``restore_custom_attrs`` -- Preserve custom
   Python-level attributes on QuantizedTensor parameters that FSDP2's
   DTensor wrapping would otherwise drop.
4. ``FusedAdam`` with FP32 master weights for full-precision training updates.

.. note::
    ``fuse_wgrad_accumulation`` is **not** used here. That feature writes
    weight gradients directly into ``main_grad`` buffers, bypassing the
    autograd gradient flow. FSDP2 requires gradients to go through its
    reduce-scatter, so ``fuse_wgrad_accumulation`` needs Megatron-Core's
    FSDP integration (which provides ``get_main_grad()``).

Usage::

    torchrun --nproc-per-node 2 fully_shard.py
"""

import os

import torch
import torch.distributed as dist
import torch.nn.functional as F
from torch.distributed._composable.fsdp import fully_shard
from torch.distributed.device_mesh import DeviceMesh
from torch.distributed.tensor import DTensor

import transformer_engine.pytorch as te
from transformer_engine.pytorch import QuantizedTensor

# ── Configuration (matches main.py) ──────────────────────────────────
HIDDEN_SIZE = 256
FFN_HIDDEN_SIZE = 1024
NUM_ATTENTION_HEADS = 8
NUM_LAYERS = 3
SEQ_LEN = 32
BATCH_PER_RANK = 2
NUM_STEPS = 5
DTYPE = torch.bfloat16


def dist_print(msg):
    """Print only on rank 0."""
    if int(os.environ.get("RANK", "0")) == 0:
        print(msg)


# ── Save / restore custom attributes across FSDP2 sharding ──────────
# FSDP2's fully_shard replaces parameters with DTensors, which drops any
# custom Python-level attributes. These helpers preserve them.
# (Pattern from tests/pytorch/distributed/run_fsdp2_model.py)


def save_custom_attrs(module):
    """Save custom attributes from all parameters before FSDP2 sharding."""
    custom_attrs = {}
    for name, param in module.named_parameters():
        if isinstance(param, QuantizedTensor):
            ignore_keys = [key for key in param.__dict__.keys() if key.startswith("_")]
        else:
            ignore_keys = []
        attrs = vars(param)
        custom_attrs[name] = {k: v for k, v in attrs.items() if k not in ignore_keys}
    return custom_attrs


def restore_custom_attrs(module, custom_attrs):
    """Restore saved custom attributes after FSDP2 sharding."""
    for name, param in module.named_parameters():
        if name in custom_attrs:
            for attr_name, attr_value in custom_attrs[name].items():
                setattr(param, attr_name, attr_value)


def main():
    # ── 1. Distributed setup ─────────────────────────────────────────
    assert "TORCHELASTIC_RUN_ID" in os.environ, (
        "This script must be launched with torchrun, e.g.:\n"
        "    torchrun --nproc-per-node 2 fully_shard.py"
    )
    world_size = int(os.environ["WORLD_SIZE"])
    local_rank = int(os.environ["LOCAL_RANK"])

    torch.cuda.set_device(local_rank)
    dist.init_process_group(backend="nccl")
    device = torch.device(f"cuda:{local_rank}")

    torch.manual_seed(42)
    torch.cuda.manual_seed(42)

    # ── 2. Create model with quantized (FP8) parameters ──────────────
    with te.quantized_model_init(enabled=True):
        model = torch.nn.Sequential(
            *[
                te.TransformerLayer(
                    HIDDEN_SIZE,
                    FFN_HIDDEN_SIZE,
                    NUM_ATTENTION_HEADS,
                    fuse_qkv_params=True,
                    params_dtype=DTYPE,
                    hidden_dropout=0.0,
                    attention_dropout=0.0,
                )
                for _ in range(NUM_LAYERS)
            ]
        )

    # Pre-shard verification: count QuantizedTensor parameters.
    qt_count = sum(1 for _, p in model.named_parameters() if isinstance(p, QuantizedTensor))
    assert qt_count > 0, "No QuantizedTensor parameters found"
    dist_print(f"Found {qt_count} QuantizedTensor (FP8) weight parameters.")

    # ── 3. FSDP2 sharding ────────────────────────────────────────────
    custom_attrs = save_custom_attrs(model)

    mesh = DeviceMesh("cuda", list(range(world_size)))
    for child in model.children():
        fully_shard(child, mesh=mesh)
    fully_shard(model, mesh=mesh)

    restore_custom_attrs(model, custom_attrs)

    # Post-shard verification: parameters are DTensors wrapping QuantizedTensors.
    for name, param in model.named_parameters():
        assert isinstance(param, DTensor), f"{name} is not a DTensor after sharding"
    dist_print("FSDP2 sharding complete. All parameters are DTensors.")

    # ── 4. Optimizer ─────────────────────────────────────────────────
    optimizer = te.optimizers.FusedAdam(
        model.parameters(),
        lr=1e-3,
        master_weights=True,
        master_weight_dtype=torch.float32,
    )
    dist_print("Using FusedAdam with master_weights=True.")

    # ── 5. Training loop ─────────────────────────────────────────────
    x = torch.randn(SEQ_LEN, BATCH_PER_RANK, HIDDEN_SIZE, dtype=DTYPE, device=device)
    target = torch.randn(SEQ_LEN, BATCH_PER_RANK, HIDDEN_SIZE, dtype=DTYPE, device=device)

    for step in range(NUM_STEPS):
        optimizer.zero_grad(set_to_none=True)

        with te.autocast(enabled=True):
            output = model(x)

        loss = F.mse_loss(output, target)
        loss.backward()
        optimizer.step()
        dist_print(f" Step {step}: loss = {loss.item():.6f}")

    # ── 6. Post-training assertions ──────────────────────────────────
    dist_print("\nVerifying invariants ...")

    qt_after = 0
    for name, param in model.named_parameters():
        assert isinstance(param, DTensor), f"{name} lost DTensor wrapping"
        if isinstance(param._local_tensor, QuantizedTensor):
            qt_after += 1
    assert qt_after > 0, "No QuantizedTensor local tensors after training"
    dist_print(f" {qt_after} params still have QuantizedTensor local tensors.")

    # Optimizer states: master weights and moments should be float32.
    for param in model.parameters():
        state = optimizer.state[param]
        if "master_param" in state:
            assert (
                state["master_param"].dtype == torch.float32
            ), f"Master weight dtype {state['master_param'].dtype}, expected float32"
        assert state["exp_avg"].dtype == torch.float32, "exp_avg should be float32"
        assert state["exp_avg_sq"].dtype == torch.float32, "exp_avg_sq should be float32"

    dist_print("All assertions passed!")
    dist_print(" - Linear weight parameters: QuantizedTensor (FP8) wrapped in DTensor")
    dist_print(" - Optimizer master weights: float32")
    dist_print(" - Optimizer states (exp_avg, exp_avg_sq): float32")

    dist.destroy_process_group()


if __name__ == "__main__":
    main()
```
main.py (151 additions, 0 deletions)
```python
# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.

"""Quantized model initialization with FusedAdam and gradient accumulation fusion.

Demonstrates three Transformer Engine features working together:

1. ``quantized_model_init`` -- Initialize a model with low-precision (FP8)
   parameters, avoiding the memory cost of storing both high-precision and
   quantized copies of every weight.

2. ``FusedAdam`` with master weights -- Maintain FP32 master copies of the
   weights inside the optimizer so that the training update retains full
   precision despite the model parameters being FP8.

3. Gradient accumulation fusion -- Use ``fuse_wgrad_accumulation=True``
   together with per-parameter ``main_grad`` buffers so that weight
   gradients are accumulated directly in FP32 via Tensor Cores, avoiding a
   separate FP8-to-FP32 cast kernel.

Usage::

    python main.py
"""

import torch
import transformer_engine.pytorch as te
from transformer_engine.pytorch.quantized_tensor import QuantizedTensor

# ── Configuration ──────────────────────────────────────────────────────
HIDDEN_SIZE = 256
FFN_HIDDEN_SIZE = 1024
NUM_ATTENTION_HEADS = 8
SEQ_LEN = 32
BATCH_SIZE = 2
NUM_STEPS = 5
DTYPE = torch.bfloat16


def main():
    # ── 1. Create model with quantized parameters ─────────────────────
    #
    # Inside quantized_model_init, TransformerEngine modules store only the
    # FP8 quantized copy of each parameter (a Float8Tensor), eliminating the
    # memory overhead of a high-precision shadow copy.
    with te.quantized_model_init(enabled=True):
        model = te.TransformerLayer(
            HIDDEN_SIZE,
            FFN_HIDDEN_SIZE,
            NUM_ATTENTION_HEADS,
            fuse_wgrad_accumulation=True,
            fuse_qkv_params=True,  # required for fuse_wgrad_accumulation
            params_dtype=DTYPE,
            hidden_dropout=0.0,  # disable dropout for this synthetic example
            attention_dropout=0.0,
        )

    # Verify that linear-layer weight parameters are quantized.
    # Biases and LayerNorm parameters are *not* quantized.
    quantized_count = 0
    for name, param in model.named_parameters():
        if isinstance(param, QuantizedTensor):
            quantized_count += 1
    assert quantized_count > 0, "No QuantizedTensor parameters found"
    print(f"Found {quantized_count} QuantizedTensor (FP8) weight parameters.")

    # ── 2. Allocate main_grad buffers (FP32) ──────────────────────────
    #
    # fuse_wgrad_accumulation causes weight-gradient GEMMs to write directly
    # into ``param.main_grad`` in FP32 (via Tensor Core accumulation).
    # Non-weight parameters (e.g. LayerNorm) still receive gradients through
    # the normal ``param.grad`` path.
    for param in model.parameters():
        param.main_grad = torch.zeros(param.shape, dtype=torch.float32, device=param.device)

    # ── 3. Optimizer with FP32 master weights ─────────────────────────
    #
    # use_decoupled_grad=True tells FusedAdam to read gradients from
    # ``param.decoupled_grad`` instead of ``param.grad``. This avoids
    # the dtype-mismatch error that would occur when assigning FP32
    # gradients to bfloat16 parameters via ``.grad``.
    optimizer = te.optimizers.FusedAdam(
        model.parameters(),
        lr=1e-3,
        master_weights=True,
        master_weight_dtype=torch.float32,
        use_decoupled_grad=True,
    )

    # ── 4. Training loop ──────────────────────────────────────────────
    #
    # Use a fixed synthetic dataset so that loss decreases over steps.
    x = torch.randn(SEQ_LEN, BATCH_SIZE, HIDDEN_SIZE, dtype=DTYPE, device="cuda")
    target = torch.randn(SEQ_LEN, BATCH_SIZE, HIDDEN_SIZE, dtype=DTYPE, device="cuda")

    for step in range(NUM_STEPS):
        optimizer.zero_grad(set_to_none=True)
        for param in model.parameters():
            param.main_grad.zero_()

        # Forward pass inside autocast to enable FP8 compute.
        with te.autocast(enabled=True):
            output = model(x)

        loss = torch.nn.functional.mse_loss(output, target)
        loss.backward()

        # Consolidate gradients into main_grad.
        # * Weight params with fuse_wgrad_accumulation: backward already
        #   accumulated the gradient directly into main_grad (FP32).
        # * Other params (e.g. LayerNorm): autograd set param.grad.
        for param in model.parameters():
            if param.grad is not None:
                param.main_grad.copy_(param.grad)
                param.grad = None

        # Expose main_grad as decoupled_grad so FusedAdam can read it.
        for param in model.parameters():
            param.decoupled_grad = param.main_grad

        optimizer.step()
        print(f" Step {step}: loss = {loss.item():.6f}")

    # ── 5. Post-training assertions ───────────────────────────────────
    print("\nVerifying invariants ...")

    # Optimizer states.
    for param in model.parameters():
        state = optimizer.state[param]
        if "master_param" in state:
            master = state["master_param"]
            assert (
                master.dtype == torch.float32
            ), f"Master weight dtype {master.dtype}, expected float32"
        assert state["exp_avg"].dtype == torch.float32, "exp_avg should be float32"
        assert state["exp_avg_sq"].dtype == torch.float32, "exp_avg_sq should be float32"

    # main_grad buffers.
    for param in model.parameters():
        assert param.main_grad.dtype == torch.float32, "main_grad should be float32"

    print("All assertions passed!")
    print(" - Linear weight parameters: QuantizedTensor (FP8)")
    print(" - Optimizer master weights: float32")
    print(" - Optimizer states (exp_avg, exp_avg_sq): float32")
    print(" - Gradient accumulation buffers (main_grad): float32")


if __name__ == "__main__":
    main()
```
