18 changes: 18 additions & 0 deletions .github/tava_architecture_diagram.md
@@ -91,6 +91,16 @@ graph TB
BatchManager --> KVCache
end

subgraph "Usage_Telemetry"
ReportUsage[report_usage]
BgReporter[Background Reporter]
GxtPayload[GXT Payload Builder]
GxtEndpoint[NvTelemetry Endpoint]
ReportUsage --> BgReporter
BgReporter --> GxtPayload
GxtPayload --> GxtEndpoint
end

subgraph "Output_Results"
Tokens[Generated Tokens]
Stats[Performance Stats]
@@ -99,13 +109,17 @@ graph TB
GenVideos[Generated Videos]
end

LLMAPI --> ReportUsage

PyTorch_Flow ~~~ TensorRT_Flow

TensorRT_Flow --> Output_Results
PyTorch_Flow --> Output_Results
AutoDeploy_Flow --> Output_Results
Visual_Gen_Flow --> Output_Results

AutoDeploy_Flow ~~~ Usage_Telemetry

%% Force Output_Results to be between PyTorch_flow and TensorRT_flow
PyTorch_Flow ~~~ Output_Results

@@ -141,6 +155,10 @@ graph TB
classDef api fill:#bfb,stroke:#333,stroke-width:2px;
class PythonAPI,CppAPI,LLMAPI api;

%% Telemetry format
classDef telemetry fill:#cef,stroke:#333,stroke-width:2px;
class ReportUsage,BgReporter,GxtPayload,GxtEndpoint telemetry;

%% Results format
classDef result fill:#fbb,stroke:#333,stroke-width:2px;
class Tokens,Stats,Metrics,GenImages,GenVideos result;
35 changes: 35 additions & 0 deletions README.md
@@ -284,6 +284,41 @@ Deprecation is used to inform developers that some APIs and tools are no longer
4. Removal After Migration Period
- After the 3-month migration period ends, deprecated APIs, tools, or parameters are removed in a manner consistent with semantic versioning (major version changes may include breaking removals).

## Telemetry Data Collection

TensorRT-LLM collects anonymous telemetry data by default. This data is used
in aggregate to understand usage patterns and prioritize engineering efforts.
**This data cannot be traced back to any individual user.** No prompts,
user-identifying information, or persistent identifiers are collected. Any
deployment identifiers are ephemeral, randomly generated per deployment, and
not linked to users. The data we collect includes:

- Ingress point (e.g., LLM API, CLI, serve command)
- Deployment duration (via periodic heartbeats)
- GPU SKUs, count, memory, and CUDA version
- Model architecture class name (e.g., `LlamaForCausalLM`)
- Parallelism configuration (TP/PP/CP/MoE-EP/MoE-TP sizes), quantization algorithm, dtype, KV cache dtype
- System information (OS platform, Python version, CPU architecture, CPU count)
- TRT-LLM version and backend
- Feature flags (LoRA, speculative decoding, prefix caching, CUDA graphs, chunked context, data parallelism)
- Disaggregated serving metadata (role and deployment ID)

Telemetry is automatically disabled in CI and test environments.

### Opting Out of Telemetry Data Collection

To disable telemetry data collection, use any of the following methods:

- **Environment variable**: Set `TRTLLM_NO_USAGE_STATS=1`, `DO_NOT_TRACK=1`, or `TELEMETRY_DISABLED=true`
- **File-based**: Create the file `~/.config/trtllm/do_not_track`
- **Python API**: Pass `TelemetryConfig(disabled=True)` to `LLM()`
- **CLI flag**: Use `--no-telemetry` on `trtllm-serve`, `trtllm-bench`, or `trtllm-eval`

The telemetry collection code is fully open source and auditable at
[`tensorrt_llm/usage/`](./tensorrt_llm/usage/). For a detailed field-by-field
reference of exactly what is collected, see the
[schema documentation](./tensorrt_llm/usage/schemas/README.md).
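
As a rough sketch of how the opt-out checks listed above could be combined (illustrative only; the function and constant names here are hypothetical, not the actual implementation in [`tensorrt_llm/usage/`](./tensorrt_llm/usage/)):

```python
import os
from pathlib import Path

# Hypothetical sketch of the opt-out detection logic; names are illustrative.
_OPT_OUT_ENV_VARS = {
    "TRTLLM_NO_USAGE_STATS": ("1",),
    "DO_NOT_TRACK": ("1",),
    "TELEMETRY_DISABLED": ("true",),
}
_OPT_OUT_FILE = Path.home() / ".config" / "trtllm" / "do_not_track"


def telemetry_disabled(config_disabled: bool = False) -> bool:
    """Return True if any opt-out method is active."""
    if config_disabled:  # e.g. TelemetryConfig(disabled=True) or --no-telemetry
        return True
    for var, values in _OPT_OUT_ENV_VARS.items():
        if os.environ.get(var, "").lower() in values:
            return True
    return _OPT_OUT_FILE.exists()
```

Any one of the methods short-circuits the rest, so setting a single environment variable is sufficient regardless of how the library is invoked.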

## Useful Links
- [Quantized models on Hugging Face](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4): A growing collection of quantized (e.g., FP8, FP4) and optimized LLMs, including [DeepSeek FP4](https://huggingface.co/nvidia/DeepSeek-R1-FP4), ready for fast inference with TensorRT LLM.
- [NVIDIA Dynamo](https://github.com/ai-dynamo/dynamo): A datacenter scale distributed inference serving framework that works seamlessly with TensorRT LLM.
20 changes: 16 additions & 4 deletions examples/auto_deploy/model_registry/configs/qwen3.5_moe_400b.yaml
@@ -2,26 +2,35 @@ runtime: trtllm
compile_backend: torch-cudagraph
attn_backend: trtllm
max_seq_len: 262144
max_num_tokens: 8192
max_batch_size: 32
max_num_tokens: 16000
max_batch_size: 256
cuda_graph_config:
batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
batch_sizes: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 64, 128, 256]
world_size: 8
enable_chunked_prefill: true
model_factory: Qwen3_5MoeForConditionalGeneration
# For text-only mode, use AutoModelForCausalLM until issue #12699 is resolved
# Once issue #12699 is resolved, consider unifying the factory to Qwen3_5MoeForConditionalGeneration for both VLM and text modes
# model_factory: Qwen3_5MoeForConditionalGeneration
model_factory: AutoModelForCausalLM
kv_cache_config:
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
tokens_per_block: 32
model_kwargs:
torch_dtype: bfloat16
transforms:
# disable for the text-only use case
initialize_mrope_delta_cache:
enabled: true
export_to_gm:
num_moe_experts_for_export: 2
fuse_gemms_mixed_children:
enabled: true
fuse_nvfp4_moe:
backend: trtllm_gen
detect_sharding:
# for long input, tp8ep1 gives better performance
# dist_mapping: {moe_tp: 8, moe_ep: 1}
allreduce_strategy: SYMM_MEM
shard_all_unprocessed: true
simple_shard_filter: "lm_head"
@@ -37,6 +46,9 @@ transforms:
"k_proj": "colwise"
"v_proj": "colwise"
"o_proj": "rowwise"
# lm_head: "gather" = column split + all_gather (not "colwise" which
# requires a LayerSubgraph and crashes for standalone unprocessed nodes)
"lm_head": "gather"
# replicating shared experts (keep them commented out)
# "shared_expert_gate_proj": "colwise"
# "shared_expert_up_proj": "colwise"
4 changes: 3 additions & 1 deletion examples/llm-api/quickstart_advanced.py
@@ -181,7 +181,9 @@ def add_llm_args(parser):
parser.add_argument('--spec_decode_max_draft_len', type=int, default=1)
parser.add_argument('--draft_model_dir', type=str, default=None)
parser.add_argument('--max_matching_ngram_size', type=int, default=5)
parser.add_argument('--use_one_model', default=False, action='store_true')
parser.add_argument('--use_one_model',
default=True,
action=argparse.BooleanOptionalAction)
parser.add_argument('--eagle_choices', type=str, default=None)
parser.add_argument('--use_dynamic_tree',
default=False,
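The switch from `action='store_true'` to `argparse.BooleanOptionalAction` changes the default to on and auto-generates a negation flag; a minimal standalone illustration of the new behavior (assuming Python 3.9+, where `BooleanOptionalAction` was introduced):

```python
import argparse

parser = argparse.ArgumentParser()
# BooleanOptionalAction auto-generates a matching --no-* negation flag.
parser.add_argument('--use_one_model',
                    default=True,
                    action=argparse.BooleanOptionalAction)

print(parser.parse_args([]).use_one_model)                      # True (new default)
print(parser.parse_args(['--no-use_one_model']).use_one_model)  # False
```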
3 changes: 2 additions & 1 deletion setup.py
@@ -170,7 +170,8 @@ def has_ext_modules(self):
"_torch/auto_deploy/config/*.yaml",
# Include CUDA source for fused MoE align extension so runtime JIT can find it in wheels
'_torch/auto_deploy/custom_ops/fused_moe/moe_align_kernel.cu',
'_torch/auto_deploy/custom_ops/fused_moe/triton_fused_moe_configs/*'
'_torch/auto_deploy/custom_ops/fused_moe/triton_fused_moe_configs/*',
'usage/schemas/*.json',
]


@@ -727,10 +727,15 @@ class Qwen3_5MoeCausalLMOutput(ModelOutput):
"""Output of the Qwen3.5 MoE causal language model."""

logits: Optional[torch.FloatTensor] = None
last_hidden_state: Optional[torch.FloatTensor] = None


class Qwen3_5MoeTextModel(Qwen3_5MoePreTrainedModel):
"""Qwen3.5 MoE text model (embed + decoder layers + final norm)."""
"""Qwen3.5 MoE text model (embed + decoder layers + final norm + lm_head).

lm_head is included so that the exported GraphModule contains it directly,
allowing sharding and gather_logits_before_lm_head transforms to see it.
"""

def __init__(self, config: Qwen3_5MoeTextConfig):
super().__init__(config)
@@ -746,10 +751,15 @@ def __init__(self, config: Qwen3_5MoeTextConfig):
)
self.norm = Qwen3_5MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.rotary_emb = Qwen3_5MoeTextRotaryEmbedding(config=config)
self.lm_head = None # set by parent model via set_lm_head()

# Initialize weights and apply final processing
self.post_init()

def set_lm_head(self, lm_head: nn.Module):
"""Set the lm_head from the parent model."""
self.lm_head = lm_head

def get_input_embeddings(self):
return self.embed_tokens

@@ -801,7 +811,11 @@ def forward(
hidden_states = decoder_layer(hidden_states, position_embeddings=position_embeddings)

hidden_states = self.norm(hidden_states)
return Qwen3_5MoeOutput(last_hidden_state=hidden_states)
assert self.lm_head is not None, (
"lm_head not set — call set_lm_head() from the parent model before forward()"
)
logits = self.lm_head(hidden_states.to(self.lm_head.weight.dtype)).float()
return Qwen3_5MoeCausalLMOutput(logits=logits, last_hidden_state=hidden_states)


class Qwen3_5MoeForCausalLM(Qwen3_5MoePreTrainedModel, GenerationMixin):
@@ -814,6 +828,7 @@ def __init__(self, config: Qwen3_5MoeTextConfig, **kwargs):
self.model = Qwen3_5MoeTextModel(config)
self.vocab_size = config.vocab_size
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.model.set_lm_head(self.lm_head)

# Initialize weights and apply final processing
self.post_init()
@@ -829,6 +844,7 @@ def get_output_embeddings(self):

def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
self.model.set_lm_head(new_embeddings)

def forward(
self,
@@ -848,8 +864,7 @@ def forward(
rope_cos=rope_cos,
rope_sin=rope_sin,
)
hidden_states = outputs[0]
logits = self.lm_head(hidden_states.to(self.lm_head.weight.dtype)).float()
logits = outputs.logits
return Qwen3_5MoeCausalLMOutput(logits=logits)


@@ -2565,10 +2580,19 @@ def __init__(self, config: Qwen3_5MoeConfig, **kwargs):
self.lm_head = nn.Linear(
config.text_config.hidden_size, config.text_config.vocab_size, bias=False
)
# Share lm_head with the text model so it's inside the exported graph
self.model.language_model.set_lm_head(self.lm_head)

# Initialize weights and apply final processing
self.post_init()

def get_input_embeddings(self):
return self.model.language_model.get_input_embeddings()

def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
self.model.language_model.set_lm_head(new_embeddings)

def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
@@ -2590,8 +2614,7 @@ def forward(
video_grid_thw=video_grid_thw,
**kwargs,
)
hidden_states = outputs.last_hidden_state
logits = self.lm_head(hidden_states.to(self.lm_head.weight.dtype)).float()
logits = outputs.logits
return Qwen3_5MoeConditionalOutput(logits=logits)


@@ -2607,6 +2630,9 @@ class Qwen3_5MoeTextExportInfo(TextModelExportInfo):
(batch, sequence) are dynamic.
"""

def __init__(self, submodule_name: str):
super().__init__(submodule_name)

def _init_dynamic_shape_lookup(self):
base = super()._init_dynamic_shape_lookup()
batch_size_dyn = Dim.DYNAMIC
@@ -2858,4 +2884,7 @@ def init_input_processor(self, base):
AutoConfig.register("qwen3_5_moe_text", Qwen3_5MoeTextConfig)

AutoModelForCausalLMFactory.register_custom_model_cls("Qwen3_5MoeTextConfig", Qwen3_5MoeForCausalLM)
AutoModelForCausalLMFactory.register_custom_model_cls(
"Qwen3_5MoeConfig", Qwen3_5MoeForConditionalGeneration
)
Qwen3_5MoeFactory.register_custom_model_cls("Qwen3_5MoeConfig", Qwen3_5MoeForConditionalGeneration)
@@ -405,6 +405,10 @@ def build_custom_args_for_linear(self, scales: Dict[str, Node]) -> Tuple:
return ([scales["input_scale"]], [scales["weight_scale"], scales["alpha"]], [], [])

def load_hook(self, state_dict, prefix, *args, weight_name):
# Prepend prefix so the hook works when the GraphModule is a submodule
# of the model on which load_state_dict is called (e.g., VLM models
# where the text model lives at model.language_model.*).
weight_name = prefix + weight_name
if weight_name in state_dict:
input_scale_name = weight_name.rsplit(".", 1)[0] + ".input_scale"
alpha_name = weight_name.rsplit(".", 1)[0] + ".alpha"
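The effect of the prefix fix in this hunk can be reproduced without torch. When the GraphModule sits as a submodule (e.g. at `model.language_model.`), the full-model state_dict keys carry that prefix, so an unqualified lookup misses. A minimal, hypothetical stand-in (the names mirror the diff, but the surrounding hook machinery is simplified away):

```python
# Simplified stand-in for the real load_state_dict pre-hook above.
def load_hook(state_dict, prefix, *args, weight_name):
    # The fix: qualify the local weight name with the submodule's prefix
    # so lookups work when load_state_dict runs on the parent model.
    weight_name = prefix + weight_name
    if weight_name in state_dict:
        # Derive sibling scale names next to the weight, as in the real hook.
        input_scale_name = weight_name.rsplit(".", 1)[0] + ".input_scale"
        alpha_name = weight_name.rsplit(".", 1)[0] + ".alpha"
        return input_scale_name, alpha_name
    return None


state_dict = {"model.language_model.layers.0.proj.weight": "w"}
print(load_hook(state_dict, "model.language_model.",
                weight_name="layers.0.proj.weight"))
```

Without the `prefix + weight_name` line, the same call would return `None`, because `"layers.0.proj.weight"` alone never appears in the parent model's state_dict.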