Skip to content

Commit b97e75a

Browse files
committed
Move loss_fn into model definition in PP example
Instead of passing loss_fn to AutoParallelPP, wrap the model in a ModelWithLoss module that bakes cross-entropy loss into forward(). This makes the example compatible with tracing self.model directly. stack-info: PR: #324, branch: xmfan/stack/25
1 parent f2853db commit b97e75a

3 files changed

Lines changed: 17 additions & 75 deletions

File tree

autoparallel/api.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -273,7 +273,6 @@ def __init__(
273273
self.enable_ac = enable_ac
274274
self.ac_stage_size_in_GiB = ac_stage_size_in_GiB
275275
self.reshard_after_forward = reshard_after_forward
276-
self.loss_fn = None
277276

278277
if dynamic:
279278
self.fake_mode.shape_env = ShapeEnv()

autoparallel/api_pp.py

Lines changed: 0 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -62,78 +62,6 @@ def forward(self, *args):
6262

6363

6464
class AutoParallelPP(AutoParallel):
65-
def __init__(
66-
self,
67-
model,
68-
input_fn,
69-
mesh,
70-
mp_policy=None,
71-
compile: bool = False,
72-
enable_ac: bool = True,
73-
ac_stage_size_in_GiB=None,
74-
reshard_after_forward: bool = True,
75-
dynamic: bool = False,
76-
loss_fn: Optional[Any] = None,
77-
numerics_logger=None,
78-
**kwargs,
79-
):
80-
# Call parent __init__ without loss_fn
81-
super().__init__(
82-
model=model,
83-
input_fn=input_fn,
84-
mesh=mesh,
85-
mp_policy=mp_policy,
86-
compile=compile,
87-
enable_ac=enable_ac,
88-
ac_stage_size_in_GiB=ac_stage_size_in_GiB,
89-
reshard_after_forward=reshard_after_forward,
90-
dynamic=dynamic,
91-
numerics_logger=numerics_logger,
92-
**kwargs,
93-
)
94-
# Set loss_fn after parent initialization
95-
self.loss_fn = loss_fn
96-
97-
def _prepare_model_wrapper_and_inputs(
98-
self, raw_inputs: Any
99-
) -> tuple[Any, tuple[Any, ...]]:
100-
"""
101-
Prepare the model wrapper and formatted inputs for tracing.
102-
103-
Overrides the base class to handle loss_fn when provided.
104-
105-
Args:
106-
raw_inputs: The raw inputs from input_fn()
107-
108-
Returns:
109-
A tuple of (model_wrapper, formatted_inputs) where:
110-
- model_wrapper is a callable that will be traced
111-
- formatted_inputs are the inputs to pass to model_wrapper
112-
"""
113-
if self.loss_fn is not None:
114-
# Expected format: ((inp1, inp2,...), target)
115-
if isinstance(raw_inputs, tuple) and len(raw_inputs) == 2:
116-
model_inputs, target = raw_inputs
117-
# Normalize inputs to always be a tuple
118-
if not isinstance(model_inputs, tuple):
119-
model_inputs = (model_inputs,)
120-
formatted_inputs = (model_inputs, target)
121-
122-
def model_with_loss(model_inputs, target) -> Any:
123-
output = self.model(*model_inputs)
124-
loss = self.loss_fn(output, target) # type: ignore[misc]
125-
return loss
126-
127-
return model_with_loss, formatted_inputs
128-
else:
129-
raise ValueError(
130-
"When loss_fn is provided, input_fn must return (inputs, target) "
131-
"where inputs can be a single tensor or tuple of tensors"
132-
)
133-
else:
134-
# No loss function, use parent implementation
135-
return super()._prepare_model_wrapper_and_inputs(raw_inputs)
136-
13765
def apply_placement_pp(
13866
self, sharding_placement=None, graph_passes: list[str] = []
13967
) -> dict[str, Any]:

examples/example_pp_graph_passes.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,13 +104,12 @@ def _get_pp_module_and_graphs(
104104
dynamic=True,
105105
compile=False,
106106
reshard_after_forward=False,
107-
loss_fn=dsv3_loss_fn if use_loss_fn else None,
108107
) as autop:
109108
autop.add_parameter_memory_constraint(low=None, high=None)
110109

111110
# x_sharding = (Shard(0), Replicate())
112111
x_sharding = (Shard(0), Shard(0))
113-
if autop.loss_fn is not None:
112+
if use_loss_fn:
114113
autop.add_input_constraints([x_sharding, x_sharding])
115114
autop.add_output_constraints([(Replicate(), Replicate())])
116115
else:
@@ -466,6 +465,22 @@ def test_combined(
466465
model = DeepSeekV3Model(config).bfloat16()
467466
model.tok_embeddings = None # type: ignore[assignment]
468467

468+
if use_loss_fn:
469+
470+
class ModelWithLoss(torch.nn.Module):
471+
def __init__(self, model):
472+
super().__init__()
473+
self.model = model
474+
475+
def forward(self, h, labels):
476+
output = self.model(h)
477+
return dsv3_loss_fn(output, labels)
478+
479+
def init_weights(self, *args, **kwargs):
480+
return self.model.init_weights(*args, **kwargs)
481+
482+
model = ModelWithLoss(model)
483+
469484
def make_input_fn(sharded: bool = False, with_target: bool = False):
470485
"""Create input generator. `sharded` uses mesh-adjusted batch size."""
471486

0 commit comments

Comments (0)