@@ -93,12 +93,8 @@ def _moe_permute_index_map_fake(
9393 output_tokens = num_out_tokens if num_out_tokens > 0 else num_tokens * topK
9494
9595 # row_id_map is 1D with size = num_tokens * topK (see permutation.cpp line 59-60)
96- fake_output = torch .empty (
97- (output_tokens , inp .shape [1 ]), dtype = inp .dtype , device = inp .device
98- )
99- fake_row_id_map = torch .empty (
100- (num_tokens * topK ,), dtype = torch .int32 , device = inp .device
101- )
96+ fake_output = torch .empty ((output_tokens , inp .shape [1 ]), dtype = inp .dtype , device = inp .device )
97+ fake_row_id_map = torch .empty ((num_tokens * topK ,), dtype = torch .int32 , device = inp .device )
10298
10399 return fake_output , fake_row_id_map
104100
@@ -167,6 +163,7 @@ def _moe_permute_index_map_backward_wrapper(ctx, grad_permuted_act, grad_row_id_
167163
168164# ---------------------------------- Forward custom op ----------------------------------
169165
166+
170167@torch .library .custom_op ("te_moe::unpermute_index_map_fwd" , mutates_args = [])
171168def moe_unpermute_index_map_forward (
172169 inp : torch .Tensor ,
@@ -190,13 +187,12 @@ def _moe_unpermute_index_map_forward_fake(
190187) -> torch .Tensor :
191188 """Fake implementation for shape inference."""
192189 # Output shape: (num_tokens, hidden_size) — see permutation.cpp line 95-97
193- return torch .empty (
194- (num_tokens , inp .shape [1 ]), dtype = inp .dtype , device = inp .device
195- )
190+ return torch .empty ((num_tokens , inp .shape [1 ]), dtype = inp .dtype , device = inp .device )
196191
197192
198193# ---------------------------------- Backward custom op ----------------------------------
199194
195+
200196@torch .library .custom_op ("te_moe::unpermute_index_map_bwd" , mutates_args = [])
201197def moe_unpermute_index_map_backward (
202198 unpermuted_act_grad : torch .Tensor ,
@@ -237,6 +233,7 @@ def _moe_unpermute_index_map_backward_fake(
237233
238234# ---------------------------------- Autograd glue ----------------------------------
239235
236+
240237def _moe_unpermute_index_map_setup_context (ctx , inputs , output ):
241238 """Save context for backward pass."""
242239 inp , row_id_map , probs , num_tokens , topK = inputs
@@ -272,6 +269,7 @@ def _moe_unpermute_index_map_backward_wrapper(ctx, unpermuted_act_grad):
272269
273270# ===================== _moe_permute_mask_map custom ops =====================
274271
272+
275273@torch .library .custom_op ("te_moe::permute_mask_map_fwd" , mutates_args = [])
276274def moe_permute_mask_map_forward (
277275 inp : torch .Tensor ,
@@ -296,7 +294,9 @@ def moe_permute_mask_map_forward(
296294 if pad_offsets is not None :
297295 assert pad_offsets .is_cuda , "TransformerEngine needs CUDA."
298296 assert inp .size (0 ) == routing_map .size (0 ), "Permute not possible"
299- assert num_out_tokens is not None , "num_out_tokens must be provided to the fused permute function."
297+ assert (
298+ num_out_tokens is not None
299+ ), "num_out_tokens must be provided to the fused permute function."
300300
301301 num_tokens , hidden_size = inp .size ()
302302 num_experts = routing_map .size (1 )
@@ -335,38 +335,58 @@ def moe_permute_mask_map_forward(
335335 scale_hidden_dim = None
336336
337337 output , permuted_scale , permuted_probs = triton_permutation .permute_with_mask_map (
338- inp , row_id_map , probs , fp8_scale , pad_offsets ,
339- num_tokens , num_experts , num_out_tokens , hidden_size , scale_hidden_dim ,
338+ inp ,
339+ row_id_map ,
340+ probs ,
341+ fp8_scale ,
342+ pad_offsets ,
343+ num_tokens ,
344+ num_experts ,
345+ num_out_tokens ,
346+ hidden_size ,
347+ scale_hidden_dim ,
340348 )
341349
342350 if fp8 :
343351 if per_tensor_recipe :
344352 output = Float8Tensor (
345- data = output , fp8_dtype = fp8_dtype , fp8_scale_inv = fp8_scale_inv ,
346- shape = output .shape , dtype = fake_dtype ,
353+ data = output ,
354+ fp8_dtype = fp8_dtype ,
355+ fp8_scale_inv = fp8_scale_inv ,
356+ shape = output .shape ,
357+ dtype = fake_dtype ,
347358 )
348359 elif blockwise_recipe :
349360 output = Float8BlockwiseQTensor (
350- shape = output .shape , dtype = fake_dtype , rowwise_data = output ,
361+ shape = output .shape ,
362+ dtype = fake_dtype ,
363+ rowwise_data = output ,
351364 rowwise_scale_inv = permuted_scale .T .contiguous (),
352- columnwise_data = None , columnwise_scale_inv = None ,
353- fp8_dtype = fp8_dtype , quantizer = None , is_2D_scaled = False ,
365+ columnwise_data = None ,
366+ columnwise_scale_inv = None ,
367+ fp8_dtype = fp8_dtype ,
368+ quantizer = None ,
369+ is_2D_scaled = False ,
354370 requires_grad = output .requires_grad ,
355371 )
356372 elif mxfp8_recipe :
357373 output = MXFP8Tensor (
358- shape = output .shape , dtype = fake_dtype , fp8_dtype = fp8_dtype ,
359- rowwise_data = output , rowwise_scale_inv = permuted_scale .contiguous (),
360- columnwise_data = None , columnwise_scale_inv = None ,
361- quantizer = None , requires_grad = output .requires_grad ,
374+ shape = output .shape ,
375+ dtype = fake_dtype ,
376+ fp8_dtype = fp8_dtype ,
377+ rowwise_data = output ,
378+ rowwise_scale_inv = permuted_scale .contiguous (),
379+ columnwise_data = None ,
380+ columnwise_scale_inv = None ,
381+ quantizer = None ,
382+ requires_grad = output .requires_grad ,
362383 with_gemm_swizzled_scales = False ,
363384 )
364385
365386 # If permuted_probs is None, return empty tensor (custom ops need concrete tensors)
366387 if permuted_probs is None :
367388 permuted_probs = torch .empty (0 , device = inp .device )
368389
369-
370390 return output , row_id_map , permuted_probs
371391
372392
@@ -406,8 +426,14 @@ def moe_permute_mask_map_backward(
406426) -> Tuple [torch .Tensor , torch .Tensor ]:
407427 """Backward pass for MoE permute with mask router map."""
408428 act_grad , probs_grad = triton_permutation .unpermute_with_mask_map (
409- permuted_act_grad , row_id_map , None , permuted_probs_grad , pad_offsets ,
410- num_tokens , num_experts , hidden_size ,
429+ permuted_act_grad ,
430+ row_id_map ,
431+ None ,
432+ permuted_probs_grad ,
433+ pad_offsets ,
434+ num_tokens ,
435+ num_experts ,
436+ hidden_size ,
411437 )
412438 if probs_grad is None :
413439 probs_grad = torch .empty (0 , device = permuted_act_grad .device )
@@ -430,7 +456,8 @@ def _moe_permute_mask_map_backward_fake(
430456 )
431457 if permuted_probs_grad is not None :
432458 probs_grad = torch .empty (
433- (num_tokens , num_experts ), dtype = permuted_probs_grad .dtype ,
459+ (num_tokens , num_experts ),
460+ dtype = permuted_probs_grad .dtype ,
434461 device = permuted_act_grad .device ,
435462 )
436463 else :
@@ -471,8 +498,13 @@ def _moe_permute_mask_map_backward_wrapper(ctx, grad_output, grad_row_id_map, gr
471498 probs_grad_input = grad_permuted_probs if grad_permuted_probs .numel () > 0 else None
472499
473500 act_grad , probs_grad = moe_permute_mask_map_backward (
474- grad_output , probs_grad_input , row_id_map , pad_offsets ,
475- ctx .num_tokens , ctx .num_experts , ctx .hidden_size ,
501+ grad_output ,
502+ probs_grad_input ,
503+ row_id_map ,
504+ pad_offsets ,
505+ ctx .num_tokens ,
506+ ctx .num_experts ,
507+ ctx .hidden_size ,
476508 )
477509
478510 if not ctx .needs_probs_grad or probs_grad .numel () == 0 :
@@ -489,6 +521,7 @@ def _moe_permute_mask_map_backward_wrapper(ctx, grad_output, grad_row_id_map, gr
489521
490522# ===================== _moe_unpermute_mask_map custom ops =====================
491523
524+
492525@torch .library .custom_op ("te_moe::unpermute_mask_map_fwd" , mutates_args = [])
493526def moe_unpermute_mask_map_forward (
494527 inp : torch .Tensor ,
@@ -508,8 +541,14 @@ def moe_unpermute_mask_map_forward(
508541 inp , QuantizedTensor
509542 ), "The forward of moe_unpermute does not support FP8."
510543 unpermuted_output , _ = triton_permutation .unpermute_with_mask_map (
511- inp , row_id_map , merging_probs , None , pad_offsets ,
512- num_tokens , num_experts , hidden_size ,
544+ inp ,
545+ row_id_map ,
546+ merging_probs ,
547+ None ,
548+ pad_offsets ,
549+ num_tokens ,
550+ num_experts ,
551+ hidden_size ,
513552 )
514553 return unpermuted_output
515554
@@ -542,8 +581,15 @@ def moe_unpermute_mask_map_backward_with_probs(
542581) -> Tuple [torch .Tensor , torch .Tensor ]:
543582 """Backward pass for MoE unpermute with merging probs."""
544583 act_grad , probs_grad = triton_permutation .unpermute_with_mask_map_bwd_with_merging_probs (
545- unpermuted_act_grad , row_id_map , fwd_input , merging_probs , pad_offsets ,
546- num_tokens , num_experts , num_permuted_tokens , hidden_size ,
584+ unpermuted_act_grad ,
585+ row_id_map ,
586+ fwd_input ,
587+ merging_probs ,
588+ pad_offsets ,
589+ num_tokens ,
590+ num_experts ,
591+ num_permuted_tokens ,
592+ hidden_size ,
547593 )
548594 return act_grad , probs_grad
549595
@@ -563,11 +609,13 @@ def _moe_unpermute_mask_map_bwd_with_probs_fake(
563609 """Fake for backward shape inference with merging probs."""
564610 act_grad = torch .empty (
565611 (num_permuted_tokens , hidden_size ),
566- dtype = unpermuted_act_grad .dtype , device = unpermuted_act_grad .device ,
612+ dtype = unpermuted_act_grad .dtype ,
613+ device = unpermuted_act_grad .device ,
567614 )
568615 probs_grad = torch .empty (
569616 (num_tokens , num_experts ),
570- dtype = merging_probs .dtype , device = unpermuted_act_grad .device ,
617+ dtype = merging_probs .dtype ,
618+ device = unpermuted_act_grad .device ,
571619 )
572620 return act_grad , probs_grad
573621
@@ -615,30 +663,51 @@ def moe_unpermute_mask_map_backward_no_probs(
615663 fp8_scale = None
616664
617665 act_grad , permuted_scale , _ = triton_permutation .permute_with_mask_map (
618- unpermuted_act_grad , row_id_map , None , fp8_scale , pad_offsets ,
619- num_tokens , num_experts , num_permuted_tokens , hidden_size , scale_hidden_dim ,
666+ unpermuted_act_grad ,
667+ row_id_map ,
668+ None ,
669+ fp8_scale ,
670+ pad_offsets ,
671+ num_tokens ,
672+ num_experts ,
673+ num_permuted_tokens ,
674+ hidden_size ,
675+ scale_hidden_dim ,
620676 )
621677
622678 if fp8 :
623679 if per_tensor_recipe :
624680 act_grad = Float8Tensor (
625- data = act_grad , fp8_dtype = fp8_dtype , fp8_scale_inv = fp8_scale_inv ,
626- shape = act_grad .shape , dtype = fake_dtype ,
681+ data = act_grad ,
682+ fp8_dtype = fp8_dtype ,
683+ fp8_scale_inv = fp8_scale_inv ,
684+ shape = act_grad .shape ,
685+ dtype = fake_dtype ,
627686 )
628687 elif blockwise_recipe :
629688 act_grad = Float8BlockwiseQTensor (
630- shape = act_grad .shape , dtype = fake_dtype , rowwise_data = act_grad ,
689+ shape = act_grad .shape ,
690+ dtype = fake_dtype ,
691+ rowwise_data = act_grad ,
631692 rowwise_scale_inv = permuted_scale .T .contiguous (),
632- columnwise_data = None , columnwise_scale_inv = None ,
633- fp8_dtype = fp8_dtype , quantizer = None , is_2D_scaled = False ,
693+ columnwise_data = None ,
694+ columnwise_scale_inv = None ,
695+ fp8_dtype = fp8_dtype ,
696+ quantizer = None ,
697+ is_2D_scaled = False ,
634698 requires_grad = act_grad .requires_grad ,
635699 )
636700 elif mxfp8_recipe :
637701 act_grad = MXFP8Tensor (
638- shape = act_grad .shape , dtype = fake_dtype , fp8_dtype = fp8_dtype ,
639- rowwise_data = act_grad , rowwise_scale_inv = permuted_scale .contiguous (),
640- columnwise_data = None , columnwise_scale_inv = None ,
641- quantizer = None , requires_grad = act_grad .requires_grad ,
702+ shape = act_grad .shape ,
703+ dtype = fake_dtype ,
704+ fp8_dtype = fp8_dtype ,
705+ rowwise_data = act_grad ,
706+ rowwise_scale_inv = permuted_scale .contiguous (),
707+ columnwise_data = None ,
708+ columnwise_scale_inv = None ,
709+ quantizer = None ,
710+ requires_grad = act_grad .requires_grad ,
642711 with_gemm_swizzled_scales = False ,
643712 )
644713
@@ -658,7 +727,8 @@ def _moe_unpermute_mask_map_bwd_no_probs_fake(
658727 """Fake for backward shape inference without probs."""
659728 return torch .empty (
660729 (num_permuted_tokens , hidden_size ),
661- dtype = unpermuted_act_grad .dtype , device = unpermuted_act_grad .device ,
730+ dtype = unpermuted_act_grad .dtype ,
731+ device = unpermuted_act_grad .device ,
662732 )
663733
664734
@@ -697,14 +767,26 @@ def _moe_unpermute_mask_map_backward_wrapper(ctx, unpermuted_act_grad):
697767 unpermuted_act_grad , QuantizedTensor
698768 ), "The backward of moe_unpermute with merging probs does not support FP8."
699769 act_grad , probs_grad = moe_unpermute_mask_map_backward_with_probs (
700- unpermuted_act_grad , row_id_map , fwd_input , merging_probs , pad_offsets ,
701- ctx .num_tokens , ctx .num_experts , ctx .num_permuted_tokens , ctx .hidden_size ,
770+ unpermuted_act_grad ,
771+ row_id_map ,
772+ fwd_input ,
773+ merging_probs ,
774+ pad_offsets ,
775+ ctx .num_tokens ,
776+ ctx .num_experts ,
777+ ctx .num_permuted_tokens ,
778+ ctx .hidden_size ,
702779 )
703780 else :
704781 row_id_map , pad_offsets = ctx .saved_tensors
705782 act_grad = moe_unpermute_mask_map_backward_no_probs (
706- unpermuted_act_grad , row_id_map , pad_offsets ,
707- ctx .num_tokens , ctx .num_experts , ctx .num_permuted_tokens , ctx .hidden_size ,
783+ unpermuted_act_grad ,
784+ row_id_map ,
785+ pad_offsets ,
786+ ctx .num_tokens ,
787+ ctx .num_experts ,
788+ ctx .num_permuted_tokens ,
789+ ctx .hidden_size ,
708790 )
709791
710792 if not ctx .needs_probs_grad :
@@ -945,8 +1027,13 @@ def moe_unpermute(
9451027 assert pad_offsets .is_cuda , "TransformerEngine needs CUDA."
9461028
9471029 return moe_unpermute_mask_map_forward (
948- inp , row_id_map , merging_probs ,
949- num_tokens , num_experts , hidden_size , pad_offsets ,
1030+ inp ,
1031+ row_id_map ,
1032+ merging_probs ,
1033+ num_tokens ,
1034+ num_experts ,
1035+ hidden_size ,
1036+ pad_offsets ,
9501037 )
9511038 raise ValueError ("map_type should be one of 'mask' or 'index'" )
9521039