NVIDIA
diff --git a/tests/jax/test_distributed_fused_attn.py b/tests/jax/test_distributed_fused_attn.py
Lines changed: 0 additions & 134 deletions
diff --git a/tests/jax/test_distributed_layernorm.py b/tests/jax/test_distributed_layernorm.py
Lines changed: 0 additions & 6 deletions
diff --git a/tests/jax/test_distributed_layernorm_mlp.py b/tests/jax/test_distributed_layernorm_mlp.py
Lines changed: 0 additions & 89 deletions
@@ -68,9 +68,7 @@ def impl_test_self_attn(
         attn_mask_type,
         dtype,
         softmax_type,
-        use_shardy,
     ):
-        jax.config.update("jax_use_shardy_partitioner", use_shardy)
         dropout_prob = 0.0
         is_training = True
         batch, seqlen, num_head, hidden = data_shape
@@ -178,48 +176,6 @@ def test_self_attn(
             attn_mask_type,
             dtype,
             softmax_type,
-            use_shardy=False,
-        )
-
-    @pytest.mark.parametrize("device_count,mesh_shape,mesh_axes,mesh_resource", generate_configs())
-    @pytest.mark.parametrize(
-        "attn_bias_type, bias_shape",
-        [
-            pytest.param(AttnBiasType.NO_BIAS, None, id="NO_BIAS"),
-            pytest.param(AttnBiasType.PRE_SCALE_BIAS, BiasShape._1HSS, id="PRE_SCALE_BIAS-1HSS"),
-        ],
-    )
-    @pytest.mark.parametrize(
-        "softmax_type",
-        [
-            pytest.param(AttnSoftmaxType.VANILLA_SOFTMAX, id="VANILLA_SOFTMAX"),
-            pytest.param(AttnSoftmaxType.OFF_BY_ONE_SOFTMAX, id="OFF_BY_ONE_SOFTMAX"),
-            pytest.param(AttnSoftmaxType.LEARNABLE_SOFTMAX, id="LEARNABLE_SOFTMAX"),
-        ],
-    )
-    def test_self_attn_shardy(
-        self,
-        device_count,
-        mesh_shape,
-        mesh_axes,
-        mesh_resource,
-        attn_bias_type,
-        bias_shape,
-        softmax_type,
-    ):
-        data_shape = (32, 512, 12, 64)
-        self.impl_test_self_attn(
-            device_count,
-            mesh_shape,
-            mesh_axes,
-            mesh_resource,
-            data_shape,
-            attn_bias_type,
-            bias_shape,
-            AttnMaskType.PADDING_MASK,
-            jnp.bfloat16,
-            softmax_type,
-            use_shardy=True,
         )
 
 
@@ -348,7 +304,6 @@ def impl_test_context_parallel_attn(
         qkv_layout,
         load_balanced,
         cp_strategy,
-        use_shardy,
         use_scan_ring=False,
         window_size=None,
         stripe_size=None,
@@ -366,8 +321,6 @@ def impl_test_context_parallel_attn(
             os.environ["NVTE_FUSED_RING_ATTENTION_USE_SCAN"] = "1"
         else:
             os.environ["NVTE_FUSED_RING_ATTENTION_USE_SCAN"] = "0"
-
-        jax.config.update("jax_use_shardy_partitioner", use_shardy)
         attn_bias_type = AttnBiasType.NO_BIAS
         bias_shape = None
         dropout_prob = 0.0
@@ -452,49 +405,6 @@ def check_has_backend_for_mask(mask_type):
         runner.test_backward()
         del os.environ["NVTE_FUSED_RING_ATTENTION_USE_SCAN"]
 
-    @pytest_parametrize_wrapper(
-        "device_count,mesh_shape,mesh_axes,mesh_resource",
-        generate_context_parallel_configs_for_attn(),
-    )
-    @pytest.mark.parametrize("data_shape", DISTRIBUTED_CONTEXT_SELF_ATTN_DATA_SHAPES)
-    @pytest.mark.parametrize("dtype", [pytest.param(jnp.bfloat16, id="BF16")])
-    @pytest.mark.parametrize(
-        "qkv_layout, attn_mask_type",
-        DISTRIBUTED_CONTEXT_SELF_ATTN_LAYOUTS_MASKS,
-    )
-    def test_context_parallel_allgather_attn_shardy(
-        self,
-        device_count,
-        mesh_shape,
-        mesh_axes,
-        mesh_resource,
-        data_shape,
-        attn_mask_type,
-        dtype,
-        qkv_layout,
-    ):
-        if qkv_layout.is_thd():
-            pytest.skip("Only BSHD layout is supported for CP + AG + Dual chunk attention")
-        kv_groups = 8
-        self.impl_test_context_parallel_attn(
-            device_count,
-            mesh_shape,
-            mesh_axes,
-            mesh_resource,
-            data_shape,
-            kv_groups,
-            attn_mask_type,
-            dtype,
-            qkv_layout,
-            load_balanced=True,
-            cp_strategy=CPStrategy.ALL_GATHER,
-            use_shardy=True,
-        )
-
-    @pytest_parametrize_wrapper(
-        "device_count,mesh_shape,mesh_axes,mesh_resource",
-        generate_context_parallel_configs_for_attn(),
-    )
     @pytest.mark.parametrize("data_shape", DISTRIBUTED_CONTEXT_SELF_ATTN_DATA_SHAPES[:1])
     @pytest.mark.parametrize("kv_groups", [1, 8])
     @pytest.mark.parametrize("dtype", [pytest.param(jnp.bfloat16, id="BF16")])
@@ -551,7 +461,6 @@ def test_context_parallel_allgather_striped_attn(
             qkv_layout,
             load_balanced,
             CPStrategy.ALL_GATHER,
-            use_shardy=False,
             window_size=window_size,
             stripe_size=stripe_size,
             num_segments_per_seq=num_segments_per_seq,
@@ -599,7 +508,6 @@ def test_context_parallel_allgather_attn(
             qkv_layout,
             load_balanced,
             CPStrategy.ALL_GATHER,
-            use_shardy=False,
         )
 
     @pytest_parametrize_wrapper(
@@ -664,53 +572,11 @@ def test_context_parallel_ring_attn(
             qkv_layout,
             load_balanced,
             CPStrategy.RING,
-            use_shardy=False,
             use_scan_ring=use_scan,
             window_size=window_size,
             stripe_size=stripe_size,
         )
 
-    @pytest_parametrize_wrapper(
-        "device_count,mesh_shape,mesh_axes,mesh_resource",
-        generate_context_parallel_configs_for_attn(),
-    )
-    @pytest.mark.parametrize("data_shape", DISTRIBUTED_CONTEXT_SELF_ATTN_DATA_SHAPES[:1])
-    @pytest.mark.parametrize("dtype", [pytest.param(jnp.bfloat16, id="BF16")])
-    @pytest.mark.parametrize(
-        "qkv_layout, attn_mask_type",
-        DISTRIBUTED_CONTEXT_SELF_ATTN_LAYOUTS_MASKS,
-    )
-    def test_context_parallel_ring_attn_shardy(
-        self,
-        device_count,
-        mesh_shape,
-        mesh_axes,
-        mesh_resource,
-        data_shape,
-        attn_mask_type,
-        dtype,
-        qkv_layout,
-    ):
-        kv_groups = 8
-        # Set the stripe size to 1 (ring attention only support stripe_size=1)
-        stripe_size = 1 if qkv_layout.is_thd() else None
-        self.impl_test_context_parallel_attn(
-            device_count,
-            mesh_shape,
-            mesh_axes,
-            mesh_resource,
-            data_shape,
-            kv_groups,
-            attn_mask_type,
-            dtype,
-            qkv_layout,
-            load_balanced=True,
-            cp_strategy=CPStrategy.RING,
-            use_shardy=False,
-            use_scan_ring=True,
-            stripe_size=stripe_size,
-        )
-
 
 REORDER_CAUSAL_LOAD_BALANCING_DATA_SHAPES = {
     "L0": [[]],
 
@@ -87,7 +87,6 @@ def generate_collectives_count_ref(
     @pytest_parametrize_wrapper("zero_centered_gamma", [False, True])
     @pytest_parametrize_wrapper("shard_weights", [False, True])
     @pytest_parametrize_wrapper("fp8_recipe", SUPPORTED_RECIPES)
-    @pytest_parametrize_wrapper("use_shardy", [False, True])
     def test_layernorm(
         self,
         device_count,
@@ -99,9 +98,7 @@ def test_layernorm(
         zero_centered_gamma,
         shard_weights,
         fp8_recipe,
-        use_shardy,
     ):
-        jax.config.update("jax_use_shardy_partitioner", use_shardy)
         epsilon = 1e-6
         ln_type = "layernorm"
         q_dtype = jnp.float8_e4m3fn
@@ -178,7 +175,6 @@ def ref_func(x, gamma, beta):
     @pytest_parametrize_wrapper("dtype", DTYPES)
     @pytest_parametrize_wrapper("shard_weights", [False, True])
     @pytest_parametrize_wrapper("fp8_recipe", SUPPORTED_RECIPES)
-    @pytest_parametrize_wrapper("use_shardy", [False, True])
     def test_rmsnorm(
         self,
         device_count,
@@ -189,9 +185,7 @@ def test_rmsnorm(
         dtype,
         shard_weights,
         fp8_recipe,
-        use_shardy,
     ):
-        jax.config.update("jax_use_shardy_partitioner", use_shardy)
         epsilon = 1e-6
         ln_type = "rmsnorm"
         q_dtype = jnp.float8_e4m3fn
 
@@ -192,10 +192,8 @@ def _test_layernorm_mlp_grad(
         input_shape,
         dtype,
         quantization_recipe,
-        use_shardy,
         with_jax_gemm,
     ):
-        jax.config.update("jax_use_shardy_partitioner", use_shardy)
         device_count, mesh_shape, mesh_axes, mesh_resource = mesh_config
         layernorm_type = "rmsnorm"
 
@@ -313,36 +311,6 @@ def test_layernorm_mlp_grad(
         dtype,
         quantization_recipe,
         with_jax_gemm,
-    ):
-        if dtype == jnp.float16 and quantization_recipe is not None and quantization_recipe.nvfp4():
-            pytest.skip("NVFP4 GEMM + Float16 output is unsupported!")
-        self._test_layernorm_mlp_grad(
-            mesh_config,
-            activation_type,
-            use_bias,
-            input_shape,
-            dtype,
-            quantization_recipe,
-            use_shardy=False,
-            with_jax_gemm=with_jax_gemm,
-        )
-
-    @pytest_parametrize_wrapper("mesh_config", generate_fsdp_and_tpsp_configs())
-    @pytest_parametrize_wrapper("input_shape", INPUT_SHAPE)
-    @pytest_parametrize_wrapper("activation_type", [("gelu",), ("gelu", "linear")])
-    @pytest_parametrize_wrapper("dtype", DTYPES)
-    @pytest_parametrize_wrapper("use_bias", [True, False])
-    @pytest_parametrize_wrapper("quantization_recipe", [None] + SUPPORTED_RECIPES)
-    @pytest_parametrize_wrapper("with_jax_gemm", [False, True])
-    def test_layernorm_mlp_grad_shardy(
-        self,
-        mesh_config,
-        activation_type,
-        use_bias,
-        input_shape,
-        dtype,
-        quantization_recipe,
-        with_jax_gemm,
     ):
         if dtype == jnp.float16 and quantization_recipe is not None and quantization_recipe.nvfp4():
             pytest.skip("NVFP4 GEMM + Float16 output is unsupported!")
@@ -353,7 +321,6 @@ def test_layernorm_mlp_grad_shardy(
             input_shape,
             dtype,
             quantization_recipe=quantization_recipe,
-            use_shardy=True,
             with_jax_gemm=with_jax_gemm,
         )
 
@@ -366,10 +333,8 @@ def _test_layernorm_mlp(
         dtype,
         use_fp8,
         quantization_recipe,
-        use_shardy,
         with_jax_gemm,
     ):
-        jax.config.update("jax_use_shardy_partitioner", use_shardy)
         batch, seqlen, hidden_in = input_shape
         layernorm_type = "rmsnorm"
 
@@ -481,7 +446,6 @@ def test_layernorm_mlp_layer(
             dtype,
             use_fp8=False,
             quantization_recipe=None,
-            use_shardy=False,
             with_jax_gemm=with_jax_gemm,
         )
 
@@ -512,58 +476,5 @@ def test_layernorm_mlp_layer_fp8(
             dtype,
             use_fp8=True,
             quantization_recipe=quantization_recipe,
-            use_shardy=False,
-            with_jax_gemm=with_jax_gemm,
-        )
-
-    @pytest_parametrize_wrapper("input_shape", INPUT_SHAPE)
-    @pytest_parametrize_wrapper("mesh_config", generate_fsdp_and_tpsp_configs())
-    @pytest_parametrize_wrapper("activation_type", [("gelu",), ("silu", "linear")])
-    @pytest_parametrize_wrapper("dtype", DTYPES)
-    @pytest_parametrize_wrapper("use_bias", [True, False])
-    @pytest_parametrize_wrapper("with_jax_gemm", [False, True])
-    def test_layernorm_mlp_layer_shardy(
-        self, mesh_config, activation_type, use_bias, input_shape, dtype, with_jax_gemm
-    ):
-        self._test_layernorm_mlp(
-            mesh_config,
-            activation_type,
-            use_bias,
-            input_shape,
-            dtype,
-            use_fp8=False,
-            quantization_recipe=None,
-            use_shardy=True,
-            with_jax_gemm=with_jax_gemm,
-        )
-
-    @pytest_parametrize_wrapper("mesh_config", generate_fsdp_and_tpsp_configs())
-    @pytest_parametrize_wrapper("activation_type", [("gelu",), ("gelu", "linear")])
-    @pytest_parametrize_wrapper("use_bias", [True, False])
-    @pytest_parametrize_wrapper("input_shape", INPUT_SHAPE)
-    @pytest_parametrize_wrapper("dtype", DTYPES)
-    @pytest_parametrize_wrapper("quantization_recipe", SUPPORTED_RECIPES)
-    @pytest_parametrize_wrapper("with_jax_gemm", [False, True])
-    def test_layernorm_mlp_layer_fp8_shardy(
-        self,
-        mesh_config,
-        activation_type,
-        use_bias,
-        input_shape,
-        dtype,
-        quantization_recipe,
-        with_jax_gemm,
-    ):
-        if dtype == jnp.float16 and quantization_recipe is not None and quantization_recipe.nvfp4():
-            pytest.skip("NVFP4 GEMM + Float16 output is unsupported!")
-        self._test_layernorm_mlp(
-            mesh_config,
-            activation_type,
-            use_bias,
-            input_shape,
-            dtype,
-            use_fp8=True,
-            quantization_recipe=quantization_recipe,
-            use_shardy=True,
             with_jax_gemm=with_jax_gemm,
         )