From cef44b41c2f2adfbd5a8fb18bd1ccea1b93d7f1f Mon Sep 17 00:00:00 2001
From: Dipankar Sarkar <dipankar@qti.qualcomm.com>
Date: Wed, 18 Mar 2026 01:25:45 +0000
Subject: [PATCH] multi image config for qwen3vl

Signed-off-by: Dipankar Sarkar <dipankar@qti.qualcomm.com>
---
 .../transformers/models/modeling_auto.py      | 195 ++++++++----------
 .../models/qwen3_vl/modeling_qwen3_vl.py      |  57 +++--
 .../models/qwen3_vl_multi.py                  | 153 ++++++++++++++
 .../models/qwen3vl/qwen3_vl.py                |  14 --
 4 files changed, 267 insertions(+), 152 deletions(-)
 create mode 100644 examples/image_text_to_text/models/qwen3_vl_multi.py

diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 12bbca747..eee1ca8c0 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -40,6 +40,7 @@
     write_io_files,
 )
 from QEfficient.generation.vlm_generation import VisionLanguageGeneration
+from QEfficient.proxy.pytorch_transform import QeffProxyModuleTransform
 from QEfficient.transformers.modeling_utils import (
     DYNAMIC_SEQ_LEN_SUPPORTED_MODEL_ARCH,
     SPECIALIZED_DISAGG_SERVING_MODEL_ARCH,
@@ -248,6 +249,10 @@ def __init__(self, model: nn.Module, pooling=None, **kwargs):
         **kwargs :
             Additional keyword arguments passed to the base class constructor.
         """
+        if kwargs.pop("enable_proxy", False):  # rebind a copy below: append() would mutate the class-wide list
+            self._pytorch_transforms = [*self._pytorch_transforms, QeffProxyModuleTransform]
+            logger.info("Proxy Model Enabled for QEfficient Model")
+
         super().__init__(model, **kwargs)
 
         # Make Embedding specific transforms like appending pooling
@@ -1027,36 +1032,7 @@ def __init__(self, model, qaic_config: Optional[dict] = None, **kwargs):
         if self.model.qaic_config is not None and self.model.qaic_config.get("num_kv_blocks", None) is not None:
             BlockedKVAttentionTransform.apply(self.model, num_kv_blocks=self.model.qaic_config.get("num_kv_blocks"))
 
-    def __update_prefill_transform(
-        self,
-        enable: Optional[bool] = True,
-        enable_chunking: Optional[bool] = False,
-        retain_full_kv: Optional[bool] = False,
-    ):
-        if enable:
-            if enable_chunking:
-                self.model, tf = PrefillOnlyChunkedTransform.apply(self.model)
-            else:
-                self.model, tf = PrefillOnlyTransform.apply(self.model)
-
-        else:
-            if retain_full_kv:
-                self.model, tf = RevertPrefillKeepAttentionTransform.apply(self.model)
-            else:
-                self.model, tf = RevertPrefillOnlyTransform.apply(self.model)
-
-    def export(
-        self,
-        inputs,
-        output_names,
-        dynamic_axes,
-        export_dir=None,
-        offload_pt_weights=True,
-        prefill_seq_len: Optional[int] = None,
-        prefill_only: bool = False,
-        enable_chunking: bool = False,
-        **kwargs,
-    ):
+    def export(self, inputs, output_names, dynamic_axes, export_dir=None, offload_pt_weights=True, **kwargs):
         """
         Exports the language decoder component to ONNX format.
 
@@ -1080,18 +1056,6 @@ def export(
         str
             Path to the generated ONNX graph file for the language decoder.
         """
-        if prefill_only:
-            assert prefill_seq_len > 1
-            if not enable_chunking and self.continuous_batching:
-                raise NotImplementedError(
-                    "Looks like you are trying to run prefix-caching without chunking, this feature is not available yet!"
-                )
-            self.hash_params["prefill_only"] = True
-            self.__update_prefill_transform(enable=True, enable_chunking=enable_chunking)
-        else:
-            self.hash_params["prefill_only"] = False
-            self.__update_prefill_transform(False, retain_full_kv=kwargs.get("retain_full_kv", False))
-
         return self._export(
             inputs,
             output_names=output_names,
@@ -1277,15 +1241,28 @@ def onnx_path(self):
         """
         return [self.vision_model.onnx_path, self.lang_model.onnx_path]
 
+    @property
+    def qpc_path(self):
+        """
+        Compiled QPC location(s) of the vision and language model components.
+
+        Returns
+        -------
+        Union[List[str], str, None]
+            ``[vision_qpc, lang_qpc]`` when both components report a QPC path;
+            otherwise the vision path if it is set, else the language path
+            (returned as-is, so it may be ``None``).
+        """
+        vision_qpc = self.vision_model.qpc_path
+        lang_qpc = self.lang_model.qpc_path
+        if vision_qpc and lang_qpc:
+            return [vision_qpc, lang_qpc]
+        return vision_qpc or lang_qpc
+
     def export(
         self,
         export_dir: Optional[str] = None,
         use_onnx_subfunctions: bool = False,
-        skip_vision: Optional[bool] = False,
-        skip_lang: Optional[bool] = False,
-        prefill_seq_len: Optional[int] = None,
-        prefill_only: bool = False,
-        enable_chunking: bool = False,
         **kwargs,
     ) -> str:
         """
@@ -1339,33 +1316,26 @@ def export(
                 vocab_size=self.model.language_model.config.vocab_size,
                 qaic_config=self.lang_model.model.qaic_config,
             )
-        if not skip_vision:
-            self.vision_model.export(
-                inputs["vision"],
-                output_names["vision"],
-                dynamic_axes["vision"],
-                export_dir=export_dir,
-                offload_pt_weights=False,
-                use_onnx_subfunctions=use_onnx_subfunctions,
-            )
 
-        if prefill_only and prefill_seq_len > 1:
-            offload_pt_weights = False  # to keep weight for decode onnx
-        else:
-            offload_pt_weights = kwargs.get("offload_pt_weights", True)
+        self.vision_model.export(
+            inputs["vision"],
+            output_names["vision"],
+            dynamic_axes["vision"],
+            export_dir=export_dir,
+            offload_pt_weights=False,
+            use_onnx_subfunctions=use_onnx_subfunctions,
+        )
+
+        offload_pt_weights = kwargs.get("offload_pt_weights", True)
+        self.lang_model.export(
+            inputs["lang"],
+            output_names["lang"],
+            dynamic_axes["lang"],
+            export_dir=export_dir,
+            offload_pt_weights=offload_pt_weights,
+            use_onnx_subfunctions=use_onnx_subfunctions,
+        )
 
-        if not skip_lang:
-            self.lang_model.export(
-                inputs["lang"],
-                output_names["lang"],
-                dynamic_axes["lang"],
-                export_dir=export_dir,
-                offload_pt_weights=offload_pt_weights,
-                use_onnx_subfunctions=use_onnx_subfunctions,
-                prefill_only=prefill_only,
-                enable_chunking=enable_chunking,
-                prefill_seq_len=prefill_seq_len,
-            )
         return self.onnx_path
 
     def compile(
@@ -1389,8 +1359,6 @@ def compile(
         skip_vision: Optional[bool] = False,
         skip_lang: Optional[bool] = False,
         use_onnx_subfunctions: bool = False,
-        prefill_only=None,
-        enable_chunking=False,
         **compiler_options,
     ) -> str:
         """
@@ -1509,23 +1477,19 @@ def compile(
         if lang_onnx_path:
             self.lang_model.onnx_path = lang_onnx_path
 
-        if vision_onnx_path is None or lang_onnx_path is None:
+        if (self.vision_model.onnx_path is None and vision_onnx_path is None) or (
+            self.lang_model.onnx_path is None and lang_onnx_path is None
+        ):
             self.export(
                 use_onnx_subfunctions=use_onnx_subfunctions,
-                skip_vision=skip_vision,
-                skip_lang=skip_lang,
-                prefill_only=prefill_only,
-                enable_chunking=enable_chunking,
-                prefill_seq_len=prefill_seq_len,
             )
 
         # TODO this hould be removed once the continous batching is supported for all the models.
         compiler_options.pop("continuous_batching", None)
         compiler_options.pop("kv_cache_batch_size", None)
         compiler_options.pop("full_batch_size", None)
-        self.qpc_paths = {}
         if not skip_vision:
-            vision_qpc_path = self.vision_model._compile(
+            self.vision_model._compile(
                 compile_dir=compile_dir,
                 compile_only=True,
                 specializations=specializations["vision"],
@@ -1538,8 +1502,6 @@ def compile(
                 use_onnx_subfunctions=use_onnx_subfunctions,
                 **compiler_options,
             )
-            self.qpc_paths["vision_qpc_path"] = vision_qpc_path
-
         # Custom NPI file options
         if hasattr(self.model, "get_npi_file") and "node_precision_info" not in compiler_options:
             compiler_options["node_precision_info"] = self.model.get_npi_file(self.model.name_or_path)
@@ -1550,34 +1512,18 @@ def compile(
             for output_name in output_names["lang"]:
                 if output_name.endswith("_RetainedState"):
                     custom_io_lang[output_name[: -len("_RetainedState")]] = (
-                        "float16"
-                        if ("vision_embeds" in output_name or "deepstack_features" in output_name)
-                        else kv_cache_dtype
+                        "float16" if "vision_embeds" in output_name else kv_cache_dtype
                     )
 
             # outputs
             for output_name in output_names["lang"]:
                 if output_name.endswith("_RetainedState"):
-                    custom_io_lang[output_name] = (
-                        "float16"
-                        if ("vision_embeds" in output_name or "deepstack_features" in output_name)
-                        else kv_cache_dtype
-                    )
-            if prefill_only:
-                specializations = specializations["lang"][:1]
-                qpc_key = "lang_prefill_qpc_path"
-            elif prefill_seq_len == 1:
-                specializations = specializations["lang"][-1:]
-                qpc_key = "lang_decode_qpc_path"
-            else:
-                specializations = specializations["lang"]
-                qpc_key = "lang_qpc_path"
-
-            lang_qpc_path = self.lang_model._compile(
+                    custom_io_lang[output_name] = "float16" if "vision_embeds" in output_name else kv_cache_dtype
+            self.lang_model._compile(
                 compile_dir=compile_dir,
                 compile_only=True,
                 retained_state=True,
-                specializations=specializations,
+                specializations=specializations["lang"],
                 convert_to_fp16=True,
                 mxfp6_matmul=mxfp6_matmul,
                 mdp_ts_num_devices=num_devices,
@@ -1587,8 +1533,7 @@ def compile(
                 use_onnx_subfunctions=use_onnx_subfunctions,
                 **compiler_options,
             )
-            self.qpc_paths.update({qpc_key: lang_qpc_path})
-        return self.qpc_paths
+        return self.qpc_path
 
     def generate(
         self,
@@ -1720,6 +1665,7 @@ def kv_offload_generate(
         AssertionError
             If `generation_len` is not greater than zero.
         """
+
         if not self.lang_model.qpc_path:
             raise TypeError("Please run compile API for language model first!")
 
@@ -1751,6 +1697,7 @@ def kv_offload_generate(
             [x[lang_session.binding_index_map["input_ids"]][1][1] for x in lang_session.allowed_shapes]
             + [lang_session.bindings[lang_session.binding_index_map["input_ids"]].dims[1]]
         )
+
         input_len = inputs["attention_mask"].sum(1, keepdims=True)
         input_ids_length = inputs["input_ids"].shape[1]
         num_chunks = -(input_ids_length // -prefill_seq_len)  # ceil divide without float
@@ -1787,6 +1734,14 @@ def kv_offload_generate(
 
         vision_inputs_fp16 = {"pixel_values", "image_masks"}
         vision_inputs.update({k: vision_inputs[k].astype("float16") for k in vision_inputs_fp16 if k in vision_inputs})
+        pixel_values_shape = list(vision_inputs["pixel_values"].shape)
+        vision_shapes = vision_session.allowed_shapes  # NOTE(review): assumed one entry per compiled specialization
+        idx = next((i for i, inner in enumerate(vision_shapes) if (2, pixel_values_shape) in inner), None)
+        if idx is None:  # a bare next() would surface an unexplained StopIteration instead
+            raise ValueError(f"pixel_values shape {pixel_values_shape} matches no compiled vision specialization")
+        vision_embeds_buf = np.zeros(vision_shapes[idx][2][1], dtype=np.float16)
+        image_grid_thw_buf = np.zeros(vision_shapes[idx][0][1], dtype=np.int64)
+        vision_session.set_buffers({"vision_embeds": vision_embeds_buf, "image_grid_thw": image_grid_thw_buf})
 
         vision_start = perf_counter()
 
@@ -1796,6 +1751,7 @@ def kv_offload_generate(
         vision_end = perf_counter()
 
         lang_inputs = {k: v for k, v in inputs.items() if k not in vision_inputs}
+
         if "position_ids" in inputs:
             lang_inputs["position_ids"] = inputs["position_ids"]
             lang_inputs.pop("attention_mask")
@@ -1807,10 +1763,21 @@ def kv_offload_generate(
         not_mllama = hasattr(self.model.config, "model_type") and self.model.config.model_type != "mllama"
         if not_mllama:
             lang_inputs["image_idx"] = np.array([[0]])
+
         if self.vision_model.qpc_path:
             vision_session.deactivate()
         lang_session.activate()
 
+        vision_outputs["vision_embeds"] = np.pad(
+            vision_outputs["vision_embeds"],
+            pad_width=(
+                (0, 0),
+                (0, lang_session.allowed_shapes[0][1][1][1] - vision_session.allowed_shapes[idx][2][1][1]),
+                (0, 0),
+            ),  # pad axis=1 only
+            mode="constant",
+            constant_values=0,
+        )
         lang_session.set_buffers(vision_outputs)
 
         if self.comp_ctx_lengths_prefill is not None:
@@ -1821,6 +1788,7 @@ def kv_offload_generate(
             lang_inputs["comp_ctx_lengths"] = list_of_comp_ctx_lengths_prefill[prefill_ccl_id]
 
         lang_start = perf_counter()
+
         # Run prefill
         chunk_inputs = lang_inputs.copy()
         for i in range(num_chunks):
@@ -1852,6 +1820,7 @@ def kv_offload_generate(
         )
         if not_mllama:
             lang_session.skip_buffers(vision_outputs.keys())
+
         # Get first token
         lang_inputs["input_ids"] = outputs["logits"].argmax(2)
         lang_inputs["position_ids"] = np.max(lang_inputs["position_ids"], axis=-1, keepdims=True) + 1
@@ -2686,7 +2655,7 @@ class QEFFAutoModelForCausalLM(QEFFBaseModel):
 
     _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
 
-    def __update_prefill_transform(
+    def prefill(
         self,
         enable: Optional[bool] = True,
         enable_chunking: Optional[bool] = False,
@@ -2745,6 +2714,10 @@ def __init__(
             raise TypeError(f"Required pytorch module for CausalLM or LMHeadModel, got {model_class_name}")
         _configure_proxy_for_model(self, kwargs.pop("enable_proxy", False))
 
+        # "enable_proxy" was already popped from kwargs by the
+        # _configure_proxy_for_model() call above; popping it again here would
+        # always yield the default (False), so no second proxy setup belongs here.
+
         # TODO: remove from version 1.20
         if kwargs.pop("full_batch_size", None):
             continuous_batching = True
@@ -2981,7 +2954,7 @@ def export(
                     raise NotImplementedError(
                         "Looks like you are trying to run prefix-caching without chunking, this feature is not available yet!"
                     )
-                self.__update_prefill_transform(enable=True, enable_chunking=enable_chunking)
+                self.prefill(enable=True, enable_chunking=enable_chunking)
                 self.hash_params.pop("retain_full_kv", None)
                 seq_len = self.get_seq_len_and_handle_specialized_prefill_model(
                     prefill_seq_len=prefill_seq_len, enable_chunking=enable_chunking
@@ -2992,7 +2965,7 @@ def export(
                     else seq_len
                 )
             else:
-                self.__update_prefill_transform(False, retain_full_kv=kwargs.get("retain_full_kv", False))
+                self.prefill(False, retain_full_kv=kwargs.get("retain_full_kv", False))
                 self.hash_params.pop("prefill_only", None)
                 self.hash_params.pop("NUM_Q_BLOCKS", None)
                 self.hash_params.pop("NUM_FFN_BLOCKS", None)
@@ -3995,6 +3968,10 @@ class QEFFAutoModelForCTC(QEFFTransformersBase):
     _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
 
     def __init__(self, model: nn.Module, **kwargs):
+        if kwargs.pop("enable_proxy", False):  # rebind a copy below: append() would mutate the class-wide list
+            self._pytorch_transforms = [*self._pytorch_transforms, QeffProxyModuleTransform]
+            logger.info("Proxy Model Enabled for QEfficient Model")
+
         super().__init__(model, **kwargs)
         self.model.base_model.config.use_cache = True
 
diff --git a/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py b/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py
index 070856c6e..4d011a0fb 100644
--- a/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py
+++ b/QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py
@@ -833,7 +833,7 @@ def get_specializations(
         height: int = None,
         width: int = None,
         time: int = 1,
-        # dimensions: List = None,
+        dimensions: List = None,
         num_frames: int = 1,
         kv_offload: bool = False,
         continuous_batching: bool = False,
@@ -849,6 +849,10 @@ def get_specializations(
             logger.warning(
                 "Setting height and width to be 1365 and 2048 respectively, as it was neither passed nor found in vision_config"
             )
+        # Fall back to the single height/width pair only when the caller gave no explicit
+        # `dimensions`; this must run outside the default-size branch above.
+        if dimensions is None:
+            dimensions = [[height, width]]
         prefill_seq_len = prefill_seq_len if prefill_seq_len else 128
         ctx_len = ctx_len if ctx_len else constants.INTERN_CTX_LEN
         channel = 3
@@ -904,26 +905,28 @@ def smart_resize(
                 w_bar = ceil_by_factor(width * beta, factor)
             return h_bar, w_bar
 
-        resized_height, resized_width = smart_resize(height=height, width=width)
-        grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
-        grid_height = grid_h * grid_w
-        grid_width = patch_size * patch_size * temporal_patch_size * channel
-        vision_size = grid_height // 4
-        vision_size = vision_size * num_frames * time
-        grid_height = grid_height * time * batch_size
-
-        vision = [
-            {
-                "batch_size": batch_size,
-                "vision_size": vision_size,
-                "grid_height": grid_height,
-                "grid_width": grid_width,
-                "time": time,
-                "grid_h": grid_h,
-                "grid_w": grid_w,
-                "num_feature_layers": len(self.config.vision_config.deepstack_visual_indexes),
-            }
-        ]
+        vision = []
+        max_vision_size = 0
+        for dimension in dimensions:
+            resized_height, resized_width = smart_resize(height=dimension[0], width=dimension[1])
+            grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
+            grid_height = grid_h * grid_w
+            grid_width = patch_size * patch_size * temporal_patch_size * channel
+            vision_size = grid_height // 4
+            vision_size = vision_size * num_frames
+            grid_height = grid_height * batch_size
+
+            max_vision_size = max(max_vision_size, vision_size)
+            vision.append(
+                {
+                    "batch_size": batch_size,
+                    "vision_size": vision_size,
+                    "grid_height": grid_height,
+                    "grid_width": grid_width,
+                    "grid_h": grid_h,
+                    "grid_w": grid_w,
+                }
+            )
 
         if comp_ctx_lengths_prefill is not None:
             lang = []
@@ -933,10 +936,9 @@ def smart_resize(
                     "batch_size": 1 if continuous_batching else batch_size,
                     "seq_len": prefill_seq_len,
                     "ctx_len": ctx_len,
-                    "vision_size": vision_size,
+                    "vision_size": max_vision_size,
                     "comp_ctx_lengths": comp_ctx_lengths_prefill[i],
                     "vision_batch_size": batch_size,
-                    "num_feature_layers": len(self.config.vision_config.deepstack_visual_indexes),
                 }
 
                 if continuous_batching:
@@ -953,10 +955,9 @@ def smart_resize(
                     "batch_size": full_batch_size if continuous_batching else batch_size,
                     "seq_len": "1",
                     "ctx_len": ctx_len,
-                    "vision_size": vision_size,
+                    "vision_size": max_vision_size,
                     "comp_ctx_lengths": comp_ctx_lengths_decode[i],
                     "vision_batch_size": batch_size,
-                    "num_feature_layers": len(self.config.vision_config.deepstack_visual_indexes),
                 }
 
                 if continuous_batching:
@@ -970,9 +971,8 @@ def smart_resize(
                 "batch_size": 1 if continuous_batching else batch_size,
                 "seq_len": prefill_seq_len,
                 "ctx_len": ctx_len,
-                "vision_size": vision_size,
+                "vision_size": max_vision_size,
                 "vision_batch_size": batch_size,
-                "num_feature_layers": len(self.config.vision_config.deepstack_visual_indexes),
             }
 
             if continuous_batching:
@@ -986,9 +986,8 @@ def smart_resize(
                 "batch_size": full_batch_size if continuous_batching else batch_size,
                 "seq_len": 1,
                 "ctx_len": ctx_len,
-                "vision_size": vision_size,
+                "vision_size": max_vision_size,
                 "vision_batch_size": batch_size,
-                "num_feature_layers": len(self.config.vision_config.deepstack_visual_indexes),
             }
 
             if continuous_batching:
diff --git a/examples/image_text_to_text/models/qwen3_vl_multi.py b/examples/image_text_to_text/models/qwen3_vl_multi.py
new file mode 100644
index 000000000..54231ff6c
--- /dev/null
+++ b/examples/image_text_to_text/models/qwen3_vl_multi.py
@@ -0,0 +1,156 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+"""
+Qwen3-VL image+text example that compiles the model for several image sizes.
+
+Every [height, width] pair in ``dimensions`` is handed to ``compile()``; the input
+image is then resized to one of those sizes before generation. Set ``skip_vision``
+to True to run the text-only path instead.
+"""
+
+import requests
+import transformers
+from PIL import Image
+from qwen_vl_utils import process_vision_info
+from transformers import AutoConfig, AutoProcessor, TextStreamer
+
+from QEfficient import QEFFAutoModelForImageTextToText
+
+## For AWQ model update pytorch version to 2.8.*
+model_id = "Qwen/Qwen3-VL-32B-Instruct"
+config = AutoConfig.from_pretrained(model_id)
+# NOTE(review): these overrides truncate the model (9 vision blocks, 1 text layer),
+# presumably to keep the example quick; remove them to run the full checkpoint.
+config.vision_config.depth = 9
+config.text_config.num_hidden_layers = 1
+config.vision_config.deepstack_visual_indexes = [8]
+
+qeff_model = QEFFAutoModelForImageTextToText.from_pretrained(
+    model_id, attn_implementation="eager", kv_offload=True, config=config
+)
+tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
+processor = AutoProcessor.from_pretrained(model_id)
+
+### Set skip_vision=True to run text only; otherwise leave it False ###
+skip_vision = False
+
+if skip_vision:
+    ## Only Text ##
+
+    ## Set Batch_Size ##
+    batch_size = 1
+    qeff_model.compile(
+        batch_size=batch_size,
+        prefill_seq_len=128,
+        ctx_len=4096,
+        num_cores=16,
+        num_devices=8,
+        height=354,
+        width=536,
+        mxfp6_matmul=False,
+        aic_enable_depth_first=True,
+        skip_vision=True,
+        mos=1,
+    )
+
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Tell me about yourself."},
+            ],
+        },
+    ]
+
+    messages = [messages] * batch_size
+
+    inputs = processor.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        tokenize=True,
+        return_dict=True,
+        return_tensors="pt",
+    )
+
+    inputs = qeff_model.model.prepare_inputs_for_generation(inputs=inputs, prefill_seq_len=128, batch_size=batch_size)
+
+    streamer = TextStreamer(tokenizer)
+    output = qeff_model.generate(inputs=inputs, generation_len=100)
+    print(output.generated_ids)
+    print(tokenizer.batch_decode(output.generated_ids))
+    print(output)
+
+else:
+    batch_size = 1
+    ctx_len = 5120
+
+    ## The dimensions list stores every [height, width] pair required for compilation ##
+    dimensions = [[354, 536], [240, 360], [1024, 1024]]
+
+    ## Vision + Text ##
+    qeff_model.compile(
+        batch_size=batch_size,
+        prefill_seq_len=128,
+        ctx_len=ctx_len,
+        num_cores=16,
+        num_devices=4,
+        dimensions=dimensions,
+        mxfp6_matmul=True,
+        mxint8_kv_cache=True,
+        aic_enable_depth_first=True,
+        mos=1,
+    )
+
+    ### IMAGE + TEXT ###
+    image_url = "https://picsum.photos/id/237/536/354"
+
+    image = Image.open(requests.get(image_url, stream=True).raw)
+
+    ## Resize to one of the compiled dimensions; PIL takes (width, height) ##
+    image = image.resize((1024, 1024))
+
+    messages_1 = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": "Describe this image."},
+            ],
+        },
+    ]
+
+    # Alternative prompt; swap it into `messages` below to try it.
+    messages_2 = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": "Describe about the color of the dog."},
+            ],
+        },
+    ]
+
+    messages = [messages_1] * batch_size
+
+    texts = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in messages]
+
+    image_inputs, video_inputs = process_vision_info(messages)
+    inputs = processor(
+        text=texts,
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    )
+
+    inputs = qeff_model.model.prepare_inputs_for_generation(inputs=inputs, prefill_seq_len=128, batch_size=batch_size)
+    streamer = TextStreamer(tokenizer)
+    output = qeff_model.generate(inputs=inputs, generation_len=100)
+    print(output.generated_ids)
+    print(tokenizer.batch_decode(output.generated_ids))
+    print(output)
diff --git a/examples/image_text_to_text/models/qwen3vl/qwen3_vl.py b/examples/image_text_to_text/models/qwen3vl/qwen3_vl.py
index 20badbfba..d6e29589a 100644
--- a/examples/image_text_to_text/models/qwen3vl/qwen3_vl.py
+++ b/examples/image_text_to_text/models/qwen3vl/qwen3_vl.py
@@ -95,10 +95,6 @@
 
     ### IMAGE + TEXT ###
     image_url = "https://picsum.photos/id/237/536/354"
-    # image_url = (
-    #     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png"
-    # )
-
     image = Image.open(requests.get(image_url, stream=True).raw)
 
     messages_1 = [
@@ -111,16 +107,6 @@
         },
     ]
 
-    # messages_2 = [
-    #     {
-    #         "role": "user",
-    #         "content": [
-    #             {"type": "image", "image": image},
-    #             {"type": "text", "text": "Describe about the color of the dog."},
-    #         ],
-    #     },
-    # ]
-
     messages = [messages_1] * batch_size
 
     texts = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in messages]