From 4ddfae0a91ecf03da11010cfce96ad0bb8d3543d Mon Sep 17 00:00:00 2001
From: Wanli Jiang <35160485+Wanli-Jiang@users.noreply.github.com>
Date: Thu, 9 Apr 2026 03:27:03 -0700
Subject: [PATCH] [None][fix] Update moe hidden_size in communicator for nemotron-h

Signed-off-by: Wanli Jiang <35160485+Wanli-Jiang@users.noreply.github.com>
---
 .../fused_moe/communication/communication_factory.py | 12 ++++++++++--
 .../_torch/modules/fused_moe/configurable_moe.py     |  1 +
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/tensorrt_llm/_torch/modules/fused_moe/communication/communication_factory.py b/tensorrt_llm/_torch/modules/fused_moe/communication/communication_factory.py
index c1a2dfae4e9..cbcf0502ae9 100644
--- a/tensorrt_llm/_torch/modules/fused_moe/communication/communication_factory.py
+++ b/tensorrt_llm/_torch/modules/fused_moe/communication/communication_factory.py
@@ -57,6 +57,7 @@ def create_strategy(
         payload_in_workspace: bool = False,
         alltoall_result_do_sum: bool = True,
         use_flashinfer: bool = False,
+        hidden_size: Optional[int] = None,
     ) -> Optional[Communication]:
         """
         Create the best communication method for the given configuration
@@ -78,6 +79,9 @@
             expert_size_per_partition: Number of experts per partition (required for DeepEP)
             payload_in_workspace: If True, final_hidden_states is already in workspace (for NVLinkOneSided)
             alltoall_result_do_sum: If True, sum the alltoall results (for NVLinkTwoSided)
+            hidden_size: Actual MoE activation dimension (the A2A payload width).
+                For latent-MoE models this is moe_latent_size, not pretrained_config.hidden_size.
+                Falls back to pretrained_config.hidden_size when not provided.
             # TODO: Need a way to indicate whether EPLB is enabled.

         Returns:
@@ -89,7 +93,8 @@
         """
         # Extract parameters from model_config
         mapping = model_config.mapping
-        hidden_size = model_config.pretrained_config.hidden_size
+        if hidden_size is None:
+            hidden_size = model_config.pretrained_config.hidden_size
         act_dtype = model_config.torch_dtype
         quant_config = model_config.quant_config
         max_num_tokens = model_config.max_num_tokens
@@ -120,6 +125,7 @@
             payload_in_workspace,
             alltoall_result_do_sum,
             use_flashinfer,
+            hidden_size=hidden_size,
         )

     # Auto-selection: Try strategies in priority order using try-catch
@@ -218,6 +224,7 @@ def _create_forced_method(
         payload_in_workspace: bool,
         alltoall_result_do_sum: bool,
         use_flashinfer: bool,
+        hidden_size: Optional[int] = None,
     ) -> Communication:
         """
         Create a specific method (for debugging/testing)
@@ -228,7 +235,8 @@
         """
         # Extract parameters from model_config
         mapping = model_config.mapping
-        hidden_size = model_config.pretrained_config.hidden_size
+        if hidden_size is None:
+            hidden_size = model_config.pretrained_config.hidden_size
         act_dtype = model_config.torch_dtype
         quant_config = model_config.quant_config
         max_num_tokens = model_config.max_num_tokens
diff --git a/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py b/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py
index 97a7499d4fb..264ca71c8af 100644
--- a/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py
+++ b/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py
@@ -447,6 +447,7 @@ def _create_comm_strategy_auto(self) -> Communication:
             # Keep updated with more supported backends.
             alltoall_result_do_sum=True,
             use_flashinfer=self.use_flashinfer,
+            hidden_size=self.hidden_size,
         )

     def forward_impl(
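
Why this matters: per the new docstring, hidden_size is the A2A payload width, so
for latent-MoE models (e.g. nemotron-h) the communicator must be sized by
moe_latent_size rather than pretrained_config.hidden_size. Below is a minimal
self-contained sketch of the fallback pattern the patch introduces; it is an
illustration only, not part of the patch, and ToyPretrainedConfig / ToyConfig /
resolve_payload_width are hypothetical stand-ins for ModelConfig and the factory
logic:

    from typing import Optional

    class ToyPretrainedConfig:
        def __init__(self, hidden_size: int):
            self.hidden_size = hidden_size

    class ToyConfig:
        def __init__(self, hidden_size: int):
            self.pretrained_config = ToyPretrainedConfig(hidden_size)

    def resolve_payload_width(model_config: ToyConfig,
                              hidden_size: Optional[int] = None) -> int:
        # Mirror the patched factory: prefer an explicitly passed MoE
        # activation width, fall back to the pretrained config otherwise.
        if hidden_size is None:
            hidden_size = model_config.pretrained_config.hidden_size
        return hidden_size

    cfg = ToyConfig(hidden_size=4096)
    assert resolve_payload_width(cfg) == 4096                    # fallback path
    assert resolve_payload_width(cfg, hidden_size=1024) == 1024  # explicit latent width

Because the new parameter defaults to None in both create_strategy and
_create_forced_method, callers that omit hidden_size see no behavior change;
ConfigurableMoE now forwards its own self.hidden_size explicitly.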