29 changes: 28 additions & 1 deletion examples/models/core/exaone/README.md
@@ -14,9 +14,11 @@ This document shows how to build and run [EXAONE](https://huggingface.co/LGAI-EX
- [EXAONE-3.0](#exaone-30)
- [EXAONE-Deep](#exaone-deep)
- [EXAONE-4.0](#exaone-40)
- [EXAONE-4.5](#exaone-45)
- [K-EXAONE](#k-exaone)
- [PyTorch flow](#pytorch-flow)
- [Running EXAONE-4.0](#running-exaone-40)
- [Running EXAONE-4.5](#running-exaone-45)
- [Running K-EXAONE](#running-k-exaone)
- [MoE Backend Options](#moe-backend-options)
- [PyTorch flow Quantization](#pytorch-flow-quantization)
@@ -45,6 +47,7 @@ This document shows how to build and run [EXAONE](https://huggingface.co/LGAI-EX
* FP16
* BF16
* Tensor Parallel (TP)
* Multimodal (EXAONE-4.5 only)
* Expert Parallel (EP) (K-EXAONE only)
* Attention Data Parallel (ADP) (K-EXAONE only)
* Disaggregated Serving
@@ -59,7 +62,7 @@ This document shows how to build and run [EXAONE](https://huggingface.co/LGAI-EX

**Note:**
- **EXAONE-3.0** & **EXAONE-Deep** are supported using the [TRT Flow](#trt-flow).
-- **EXAONE-4.0** & **K-EXAONE** are supported using the [PyTorch flow](#pytorch-flow).
+- **EXAONE-4.0**, **EXAONE-4.5**, & **K-EXAONE** are supported using the [PyTorch flow](#pytorch-flow).

Please refer to the corresponding sections below for usage instructions and examples for each model.

@@ -90,6 +93,17 @@ export HF_MODEL_DIR=hf_models/exaone4
git clone https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B $HF_MODEL_DIR
```

### EXAONE-4.5

EXAONE-4.5 is a multimodal model. It is supported only via the [PyTorch flow](#pytorch-flow).

Download the HuggingFace checkpoint for your EXAONE-4.5 variant from the [LGAI-EXAONE](https://huggingface.co/LGAI-EXAONE) organization. The example below uses a placeholder repo name; replace it with the model card you use.

```bash
export HF_MODEL_DIR=hf_models/exaone4_5
git clone https://huggingface.co/LGAI-EXAONE/<TODO: FILL> $HF_MODEL_DIR
```

### K-EXAONE

K-EXAONE is a Mixture of Experts (MoE) model based on the EXAONE architecture. It features a hybrid architecture with both dense and MoE layers, sliding window attention, and supports FP8 and NVFP4 quantization for efficient inference.
@@ -117,6 +131,19 @@ The output will be like:
[2] Prompt: 'The future of AI is', Generated text: ' not just about technology but also about how we choose to use it. We must ensure that AI is developed and deployed in a way that benefits all of humanity, not just a select few. This means prioritizing ethical considerations, transparency, and accountability in AI development. It also means involving diverse stakeholders in the conversation about AI'
```

### Running EXAONE-4.5

To quickly run EXAONE-4.5 models, you can use [examples/llm-api/quickstart_multimodal.py](../../../llm-api/quickstart_multimodal.py):

```bash
python ../../../llm-api/quickstart_multimodal.py --model_dir $HF_MODEL_DIR
```

The output will be like:
```bash
TODO: FILL
```

### Running K-EXAONE

K-EXAONE is a Mixture of Experts model that benefits from multiple parallelism strategies. You can run it with tensor parallelism (TP), expert parallelism (EP), and attention data parallelism (ADP):
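For illustration, one way such a launch could look via the advanced quickstart script — the script path and flag names (`--tp_size`, `--moe_ep_size`, `--enable_attention_dp`) are assumptions modeled on other TensorRT-LLM examples, not taken from this diff:

```shell
# Hypothetical invocation: TP across 8 GPUs, experts sharded with EP,
# and attention data parallelism enabled. Verify flag names against
# the script's --help before use.
python ../../../llm-api/quickstart_advanced.py \
    --model_dir $HF_MODEL_DIR \
    --tp_size 8 \
    --moe_ep_size 8 \
    --enable_attention_dp
```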
2 changes: 1 addition & 1 deletion requirements.txt
@@ -30,7 +30,7 @@ nvidia-modelopt[torch]~=0.37.0
# torch 2.10.0+cu130 depends on nvidia-nccl-cu13==2.28.9
nvidia-nccl-cu13>=2.28.9,<=2.29.2
nvidia-cuda-nvrtc
-transformers==4.57.3
+transformers==5.3.0
prometheus_client
prometheus_fastapi_instrumentator
pydantic>=2.9.1
11 changes: 7 additions & 4 deletions tensorrt_llm/_torch/attention_backend/interface.py
@@ -485,10 +485,13 @@ def from_config(config) -> "RopeParams":

        hf_rope_parameters = getattr(config, 'rope_parameters', None)
        if hf_rope_parameters is not None:
-            assert not set(hf_rope_parameters.keys()).issubset(
-                ALLOWED_ATTENTION_LAYER_TYPES), (
-                    "Per-layer-type RoPE configuration is not supported yet.")
-            config.update(hf_rope_parameters)
+            if not set(hf_rope_parameters.keys()).issubset(
+                    ALLOWED_ATTENTION_LAYER_TYPES):
+                # Flat rope_parameters dict: merge into config directly.
+                config.update(hf_rope_parameters)
+            # Per-layer-type rope_parameters (e.g. Gemma3 in transformers>=5.x)
+            # are handled by model-specific logic (e.g. rope_local_base_freq),
+            # so skip merging here.

        # get rotary parameters.
        hidden_size = config.hidden_size
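The new branch keys off whether the `rope_parameters` keys are attention-layer types or plain RoPE fields. A standalone sketch of that dispatch (the set literal below is an assumption for illustration; the real `ALLOWED_ATTENTION_LAYER_TYPES` constant is defined elsewhere in this module):

```python
# Assumed value, mirroring transformers' attention-layer-type keys.
ALLOWED_ATTENTION_LAYER_TYPES = {"full_attention", "sliding_attention"}


def should_merge(rope_parameters: dict) -> bool:
    """Merge flat rope dicts into the config; skip per-layer-type dicts."""
    return not set(rope_parameters.keys()).issubset(ALLOWED_ATTENTION_LAYER_TYPES)


# Flat dict (EXAONE-style): keys are RoPE fields, so it gets merged.
flat = {"rope_type": "default", "rope_theta": 1000000.0}

# Per-layer-type dict (Gemma3-style in transformers>=5.x): keys are
# layer types, so merging is skipped.
per_layer = {
    "full_attention": {"rope_theta": 1000000.0},
    "sliding_attention": {"rope_theta": 10000.0},
}

print(should_merge(flat), should_merge(per_layer))  # -> True False
```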
@@ -32,7 +32,6 @@
from PIL import Image
from torch import nn
from torch.export import Dim
-from transformers import AutoConfig
from transformers.activations import ACT2FN
from transformers.configuration_utils import PretrainedConfig
from transformers.generation import GenerationMixin
@@ -2854,8 +2853,5 @@ def init_input_processor(self, base):
# Registration
# =============================================================================

-AutoConfig.register("qwen3_5_moe", Qwen3_5MoeConfig)
-AutoConfig.register("qwen3_5_moe_text", Qwen3_5MoeTextConfig)
-
AutoModelForCausalLMFactory.register_custom_model_cls("Qwen3_5MoeTextConfig", Qwen3_5MoeForCausalLM)
Qwen3_5MoeFactory.register_custom_model_cls("Qwen3_5MoeConfig", Qwen3_5MoeForConditionalGeneration)
2 changes: 2 additions & 0 deletions tensorrt_llm/_torch/models/__init__.py
@@ -6,6 +6,7 @@
from .modeling_cohere2 import Cohere2ForCausalLM
from .modeling_deepseekv3 import DeepseekV3ForCausalLM
from .modeling_exaone4 import Exaone4ForCausalLM
from .modeling_exaone4_5 import Exaone4_5_ForConditionalGeneration
from .modeling_exaone_moe import ExaoneMoeForCausalLM
from .modeling_gemma3 import Gemma3ForCausalLM
from .modeling_gemma3vl import Gemma3VLM
@@ -48,6 +49,7 @@
    "CLIPVisionModel",
    "DeepseekV3ForCausalLM",
    "Exaone4ForCausalLM",
    "Exaone4_5_ForConditionalGeneration",
    "ExaoneMoeForCausalLM",
    "Gemma3ForCausalLM",
    "Gemma3VLM",
@@ -0,0 +1,34 @@
from typing import Union

from torch import nn

from tensorrt_llm._torch.model_config import ModelConfig
from tensorrt_llm._torch.models.checkpoints.base_weight_loader import ConsumableWeightsDict
from tensorrt_llm._torch.models.checkpoints.hf.weight_mapper import HfWeightMapper
from tensorrt_llm._torch.models.modeling_utils import DecoderModelForCausalLM, register_mapper


@register_mapper("HF", "Exaone4_5_ForConditionalGeneration")
class Exaone4_5HfWeightMapper(HfWeightMapper):
    def init_model_and_config(
        self, model: Union[nn.Module, DecoderModelForCausalLM], config: ModelConfig
    ):
        super().init_model_and_config(model, config)
        self.model.config.tie_word_embeddings = False

    def preprocess_weights(self, weights: dict):
        """Rename HF checkpoint prefixes; supports plain dict and ConsumableWeightsDict."""
        is_consumable = isinstance(weights, ConsumableWeightsDict)
        renamed = {}
        for key, value in weights.items():
            if key.startswith("model.visual."):
                new_key = key.replace("model.visual.", "visual.")
                renamed[new_key] = value
            elif key.startswith("model.language_model."):
                new_key = key.replace("model.language_model.", "model.")
                renamed[new_key] = value
            else:
                renamed[key] = value
        if is_consumable:
            return ConsumableWeightsDict(renamed)
        return renamed
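The rename rules in `preprocess_weights` can be exercised in isolation with a plain dict (no TensorRT-LLM imports; tensors replaced by placeholder values):

```python
def rename_prefixes(weights: dict) -> dict:
    """Standalone mirror of the mapper's rules: model.visual.* -> visual.*,
    model.language_model.* -> model.*, everything else unchanged."""
    renamed = {}
    for key, value in weights.items():
        if key.startswith("model.visual."):
            renamed["visual." + key[len("model.visual."):]] = value
        elif key.startswith("model.language_model."):
            renamed["model." + key[len("model.language_model."):]] = value
        else:
            renamed[key] = value
    return renamed


ckpt = {
    "model.visual.patch_embed.weight": 0,
    "model.language_model.layers.0.mlp.weight": 1,
    "lm_head.weight": 2,
}
print(sorted(rename_prefixes(ckpt)))
# -> ['lm_head.weight', 'model.layers.0.mlp.weight', 'visual.patch_embed.weight']
```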
10 changes: 4 additions & 6 deletions tensorrt_llm/_torch/models/modeling_clip.py
@@ -4,8 +4,6 @@
import torch.nn as nn
from transformers.activations import ACT2FN
from transformers.modeling_outputs import BaseModelOutput
-from transformers.modeling_utils import (get_parameter_device,
-                                         get_parameter_dtype)
from transformers.models.clip.configuration_clip import CLIPVisionConfig
from transformers.models.clip.modeling_clip import CLIPVisionEmbeddings

@@ -219,12 +217,12 @@ def prepare_attn_metadata(self, batch_size):
        return self.attn_metadata

    @property
-    def dtype(self):
-        return get_parameter_dtype(self)
+    def dtype(self) -> torch.dtype:
+        return self.vision_model.embeddings.patch_embedding.weight.dtype

    @property
-    def device(self):
-        return get_parameter_device(self)
+    def device(self) -> torch.device:
+        return self.vision_model.embeddings.patch_embedding.weight.device

@torch.inference_mode()
def forward(self,
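The rewritten properties read dtype and device off one concrete parameter instead of transformers' `get_parameter_dtype`/`get_parameter_device` helpers. A toy module illustrating the same pattern (a stand-in, not the real CLIP vision tower):

```python
import torch
import torch.nn as nn


class TinyVisionTower(nn.Module):
    """Toy stand-in: dtype/device are read from one well-known parameter."""

    def __init__(self):
        super().__init__()
        self.patch_embedding = nn.Conv2d(3, 8, kernel_size=14, stride=14)

    @property
    def dtype(self) -> torch.dtype:
        return self.patch_embedding.weight.dtype

    @property
    def device(self) -> torch.device:
        return self.patch_embedding.weight.device


tower = TinyVisionTower().to(torch.bfloat16)
print(tower.dtype, tower.device)
```

Pinning the lookup to a single known parameter avoids walking the whole parameter tree on every access and removes the dependency on helpers dropped from newer transformers releases.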