From 5dba890f3291939191c99fc435e4505c65a2bd41 Mon Sep 17 00:00:00 2001
From: quic-sanising <sanising@qti.qualcomm.com>
Date: Mon, 16 Mar 2026 16:44:36 -0700
Subject: [PATCH 1/6] Copy changes from PR #755

Signed-off-by: quic-sanising <sanising@qti.qualcomm.com>
---
 .../transformers/models/modeling_auto.py      |  29 ++++
 .../models/qwen2_5_vl/modeling_qwen2_5_vl.py  |  57 ++++---
 .../qwen_vl/multi_specialization_inference.py | 148 ++++++++++++++++++
 3 files changed, 210 insertions(+), 24 deletions(-)
 create mode 100644 examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py

diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index c242a97e3..62121d6b3 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -1432,6 +1432,23 @@ def kv_offload_generate(
         vision_inputs_fp16 = {"pixel_values", "image_masks"}
         vision_inputs.update({k: vision_inputs[k].astype("float16") for k in vision_inputs_fp16 if k in vision_inputs})
 
+        # Pick the vision specialization whose pixel_values shape matches this input.
+        # `idx` stays None when pixel_values is absent (e.g. a text-only prompt).
+        idx = None
+        if "pixel_values" in vision_inputs:
+            pixel_values_shape = list(vision_inputs["pixel_values"].shape)
+            idx = next(
+                (i for i, inner in enumerate(vision_session.allowed_shapes) if (2, pixel_values_shape) in inner),
+                None,
+            )
+            if idx is None:
+                raise ValueError(f"pixel_values shape {pixel_values_shape} matches no compiled vision specialization")
+            buffer_set = {
+                "vision_embeds": np.zeros(vision_session.allowed_shapes[idx][2][1], dtype=np.float16),
+                "image_grid_thw": np.zeros(vision_session.allowed_shapes[idx][0][1], dtype=np.int64),
+            }
+            vision_session.set_buffers(buffer_set)
+
         vision_start = perf_counter()
 
         vision_outputs = {}
@@ -1457,6 +1474,18 @@ def kv_offload_generate(
             vision_session.deactivate()
         lang_session.activate()
 
+        if idx is not None and "vision_embeds" in vision_outputs:
+            # Zero-pad axis 1 so vision_embeds matches the size read from lang_session.allowed_shapes.
+            pad_len = lang_session.allowed_shapes[0][1][1][1] - vision_session.allowed_shapes[idx][2][1][1]
+            if pad_len < 0:
+                raise ValueError(f"vision_embeds exceed the language specialization by {-pad_len} along axis 1")
+            vision_outputs["vision_embeds"] = np.pad(
+                vision_outputs["vision_embeds"],
+                pad_width=((0, 0), (0, pad_len), (0, 0)),  # pad axis=1 only
+                mode="constant",
+                constant_values=0,
+            )
+
         lang_session.set_buffers(vision_outputs)
 
         if self.comp_ctx_lengths_prefill is not None:
diff --git a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
index fa1bdd9b9..c9e622e08 100644
--- a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
+++ b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
@@ -1008,8 +1008,8 @@ def get_specializations(
         prefill_seq_len: int,
         ctx_len: int,
         img_size: None,
-        height: int = None,
-        width: int = None,
+        height: int | List[int] = None,
+        width: int | List[int] = None,
         num_frames: int = 1,
         kv_offload: bool = False,
         continuous_batching: bool = False,
@@ -1081,24 +1081,33 @@ def smart_resize(
                 w_bar = ceil_by_factor(width * beta, factor)
             return h_bar, w_bar
 
-        resized_height, resized_width = smart_resize(height=height, width=width)
-        grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
-        grid_height = grid_h * grid_w
-        grid_width = patch_size * patch_size * temporal_patch_size * channel
-        vision_size = grid_height // 4
-        vision_size = vision_size * num_frames
-        grid_height = grid_height * batch_size
-
-        vision = [
-            {
-                "batch_size": batch_size,
-                "vision_size": vision_size,
-                "grid_height": grid_height,
-                "grid_width": grid_width,
-                "grid_h": grid_h,
-                "grid_w": grid_w,
-            }
-        ]
+        vision = []
+        max_vision_size = 0
+
+        height = [height] if isinstance(height, int) else height
+        width = [width] if isinstance(width, int) else width
+
+        for h, w in zip(height, width):
+            resized_height, resized_width = smart_resize(height=h, width=w)
+            grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
+            grid_height = grid_h * grid_w
+            grid_width = patch_size * patch_size * temporal_patch_size * channel
+            vision_size = grid_height // 4
+            vision_size = vision_size * num_frames
+            grid_height = grid_height * batch_size
+
+            max_vision_size = max(max_vision_size, vision_size)
+
+            vision.append(
+                {
+                    "batch_size": batch_size,
+                    "vision_size": vision_size,
+                    "grid_height": grid_height,
+                    "grid_width": grid_width,
+                    "grid_h": grid_h,
+                    "grid_w": grid_w,
+                }
+            )
 
         if comp_ctx_lengths_prefill is not None:
             lang = []
@@ -1108,7 +1117,7 @@ def smart_resize(
                     "batch_size": 1 if continuous_batching else batch_size,
                     "seq_len": prefill_seq_len,
                     "ctx_len": ctx_len,
-                    "vision_size": vision_size,
+                    "vision_size": max_vision_size,
                     "comp_ctx_lengths": comp_ctx_lengths_prefill[i],
                     "vision_batch_size": batch_size,
                 }
@@ -1127,7 +1136,7 @@ def smart_resize(
                     "batch_size": full_batch_size if continuous_batching else batch_size,
                     "seq_len": "1",
                     "ctx_len": ctx_len,
-                    "vision_size": vision_size,
+                    "vision_size": max_vision_size,
                     "comp_ctx_lengths": comp_ctx_lengths_decode[i],
                     "vision_batch_size": batch_size,
                 }
@@ -1143,7 +1152,7 @@ def smart_resize(
                 "batch_size": 1 if continuous_batching else batch_size,
                 "seq_len": prefill_seq_len,
                 "ctx_len": ctx_len,
-                "vision_size": vision_size,
+                "vision_size": max_vision_size,
                 "vision_batch_size": batch_size,
             }
 
@@ -1158,7 +1167,7 @@ def smart_resize(
                 "batch_size": full_batch_size if continuous_batching else batch_size,
                 "seq_len": 1,
                 "ctx_len": ctx_len,
-                "vision_size": vision_size,
+                "vision_size": max_vision_size,
                 "vision_batch_size": batch_size,
             }
 
diff --git a/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py b/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py
new file mode 100644
index 000000000..cdcefaa2b
--- /dev/null
+++ b/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py
@@ -0,0 +1,148 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+import requests
+import transformers
+from PIL import Image
+from qwen_vl_utils import process_vision_info
+from transformers import AutoConfig, AutoProcessor, TextStreamer
+
+from QEfficient import QEFFAutoModelForImageTextToText
+
+## For AWQ model update pytorch version to 2.8.*
+model_id = "Qwen/Qwen2.5-VL-3B-Instruct"
+config = AutoConfig.from_pretrained(model_id)
+config.text_config.num_hidden_layers = 2
+
+qeff_model = QEFFAutoModelForImageTextToText.from_pretrained(
+    model_id, attn_implementation="eager", kv_offload=True, config=config
+)
+tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
+processor = AutoProcessor.from_pretrained(model_id)
+
+### use skip_vision=Ture, if want to run only text, ow false ###
+skip_vision = False
+
+if skip_vision:
+    ## Only Text ##
+
+    ## Set Batch_Size ##
+    batch_size = 1
+    qeff_model.compile(
+        batch_size=batch_size,
+        prefill_seq_len=128,
+        ctx_len=4096,
+        num_cores=16,
+        num_devices=8,
+        height=354,
+        width=536,
+        mxfp6_matmul=False,
+        aic_enable_depth_first=True,
+        skip_vision=True,
+        mos=1,
+    )
+
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Tell me about yourself."},
+            ],
+        },
+    ]
+
+    messages = [messages] * batch_size
+
+    inputs = processor.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        tokenize=True,
+        return_dict=True,
+        return_tensors="pt",
+    )
+
+    # No debugger breakpoint here: a leftover `ipdb.set_trace()` would block
+    # unattended runs and make the optional `ipdb` package a hard requirement
+    # of this example.
+
+    inputs = qeff_model.model.prepare_inputs_for_generation(inputs=inputs, prefill_seq_len=128, batch_size=batch_size)
+
+    streamer = TextStreamer(tokenizer)
+    output = qeff_model.generate(inputs=inputs, generation_len=100)
+    print(output.generated_ids)
+    print(tokenizer.batch_decode(output.generated_ids))
+    print(output)
+
+else:
+    batch_size = 1
+    ctx_len = 5120
+
+    ## The dimensions list stores all the height × width pairs required for compilation ##
+    dimensions = [[354, 536], [180, 320], [240, 360], [120, 360]]
+
+    ## Vision + Text ##
+    qeff_model.compile(
+        batch_size=batch_size,
+        prefill_seq_len=128,
+        ctx_len=5120,
+        num_cores=16,
+        num_devices=8,
+        dimensions=dimensions,
+        mxfp6_matmul=True,
+        mxint8_kv_cache=True,
+        aic_enable_depth_first=True,
+        mos=1,
+    )
+
+    ### IMAGE + TEXT ###
+    image_url = "https://picsum.photos/id/237/536/354"
+
+    image = Image.open(requests.get(image_url, stream=True).raw)
+
+    ## Resize to any deimnsion present in specializations ##
+    image = image.resize((360, 120))
+
+    messages_1 = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": "Describe this image."},
+            ],
+        },
+    ]
+
+    messages_2 = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": "Describe about the color of the dog."},
+            ],
+        },
+    ]
+
+    messages = [messages_1] * batch_size
+
+    texts = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in messages]
+
+    image_inputs, video_inputs = process_vision_info(messages)
+    inputs = processor(
+        text=texts,
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    )
+
+    inputs = qeff_model.model.prepare_inputs_for_generation(inputs=inputs, prefill_seq_len=128, batch_size=batch_size)
+
+    streamer = TextStreamer(tokenizer)
+    output = qeff_model.generate(inputs=inputs, generation_len=100)
+    print(output.generated_ids)
+    print(tokenizer.batch_decode(output.generated_ids))
+    print(output)

From 3e6208f76bc7fae99f415de532584f7c696dec47 Mon Sep 17 00:00:00 2001
From: quic-sanising <sanising@qti.qualcomm.com>
Date: Tue, 17 Mar 2026 17:07:14 -0700
Subject: [PATCH 2/6] Fix logic to calculate vision tokens

Signed-off-by: quic-sanising <sanising@qti.qualcomm.com>
---
 .../models/qwen2_5_vl/modeling_qwen2_5_vl.py  | 30 +++++++++----------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
index c9e622e08..472a84e30 100644
--- a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
+++ b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
@@ -1032,10 +1032,11 @@ def get_specializations(
         patch_size = self.config.vision_config.patch_size
         temporal_patch_size = self.config.vision_config.temporal_patch_size
 
+        # Modified from qwen_vl_utils/vision_process.py
         IMAGE_FACTOR = 28
-        MIN_PIXELS = 4 * 28 * 28
-        MAX_PIXELS = 16384 * 28 * 28
         MAX_RATIO = 200
+        IMAGE_MIN_TOKEN_NUM = 4
+        IMAGE_MAX_TOKEN_NUM = 16384
 
         def round_by_factor(number: int, factor: int) -> int:
             """Returns the closest integer to 'number' that is divisible by 'factor'."""
@@ -1053,18 +1054,19 @@ def smart_resize(
             height: int,
             width: int,
             factor: int = IMAGE_FACTOR,
-            min_pixels: int = MIN_PIXELS,
-            max_pixels: int = MAX_PIXELS,
+            min_pixels: Optional[int] = None,
+            max_pixels: Optional[int] = None,
         ) -> tuple[int, int]:
             """
             Rescales the image so that the following conditions are met:
 
             1. Both dimensions (height and width) are divisible by 'factor'.
-
             2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
-
             3. The aspect ratio of the image is maintained as closely as possible.
             """
+            max_pixels = max_pixels if max_pixels is not None else (IMAGE_MAX_TOKEN_NUM * factor ** 2)
+            min_pixels = min_pixels if min_pixels is not None else (IMAGE_MIN_TOKEN_NUM * factor ** 2)
+            assert max_pixels >= min_pixels, "The max_pixels of image must be greater than or equal to min_pixels."
             if max(height, width) / min(height, width) > MAX_RATIO:
                 raise ValueError(
                     f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
@@ -1082,21 +1084,17 @@ def smart_resize(
             return h_bar, w_bar
 
         vision = []
-        max_vision_size = 0
-
+        min_vision_size = ctx_len
         height = [height] if isinstance(height, int) else height
         width = [width] if isinstance(width, int) else width
-
         for h, w in zip(height, width):
             resized_height, resized_width = smart_resize(height=h, width=w)
             grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
             grid_height = grid_h * grid_w
             grid_width = patch_size * patch_size * temporal_patch_size * channel
             vision_size = grid_height // 4
-            vision_size = vision_size * num_frames
             grid_height = grid_height * batch_size
-
-            max_vision_size = max(max_vision_size, vision_size)
+            min_vision_size = min(min_vision_size, vision_size * num_frames)
 
             vision.append(
                 {
@@ -1117,7 +1115,7 @@ def smart_resize(
                     "batch_size": 1 if continuous_batching else batch_size,
                     "seq_len": prefill_seq_len,
                     "ctx_len": ctx_len,
-                    "vision_size": max_vision_size,
+                    "vision_size": min_vision_size,
                     "comp_ctx_lengths": comp_ctx_lengths_prefill[i],
                     "vision_batch_size": batch_size,
                 }
@@ -1136,7 +1134,7 @@ def smart_resize(
                     "batch_size": full_batch_size if continuous_batching else batch_size,
                     "seq_len": "1",
                     "ctx_len": ctx_len,
-                    "vision_size": max_vision_size,
+                    "vision_size": min_vision_size,
                     "comp_ctx_lengths": comp_ctx_lengths_decode[i],
                     "vision_batch_size": batch_size,
                 }
@@ -1152,7 +1150,7 @@ def smart_resize(
                 "batch_size": 1 if continuous_batching else batch_size,
                 "seq_len": prefill_seq_len,
                 "ctx_len": ctx_len,
-                "vision_size": max_vision_size,
+                "vision_size": min_vision_size,
                 "vision_batch_size": batch_size,
             }
 
@@ -1167,7 +1165,7 @@ def smart_resize(
                 "batch_size": full_batch_size if continuous_batching else batch_size,
                 "seq_len": 1,
                 "ctx_len": ctx_len,
-                "vision_size": max_vision_size,
+                "vision_size": min_vision_size,
                 "vision_batch_size": batch_size,
             }
 

From 58183e9fc0ce44987944c6349041e45f71f034b8 Mon Sep 17 00:00:00 2001
From: quic-sanising <sanising@qti.qualcomm.com>
Date: Tue, 17 Mar 2026 17:20:43 -0700
Subject: [PATCH 3/6] Update example to remove dimensions

Signed-off-by: quic-sanising <sanising@qti.qualcomm.com>
---
 .../qwen_vl/multi_specialization_inference.py      | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py b/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py
index cdcefaa2b..4fd9b4c98 100644
--- a/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py
+++ b/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py
@@ -79,19 +79,19 @@
 
 else:
     batch_size = 1
-    ctx_len = 5120
-
-    ## The dimensions list stores all the height × width pairs required for compilation ##
-    dimensions = [[354, 536], [180, 320], [240, 360], [120, 360]]
+    ctx_len = 14336
+    heights = [360, 320, 360, 454, 536, 640, 720, 910, 720, 1280, 1920]
+    widths = [120, 180, 240, 256, 354, 360, 480, 512, 576, 720, 1080]
 
     ## Vision + Text ##
     qeff_model.compile(
         batch_size=batch_size,
         prefill_seq_len=128,
-        ctx_len=5120,
+        ctx_len=ctx_len,
         num_cores=16,
-        num_devices=8,
-        dimensions=dimensions,
+        num_devices=2,
+        height=heights,
+        width=widths,
         mxfp6_matmul=True,
         mxint8_kv_cache=True,
         aic_enable_depth_first=True,

From 6dede8c651687db084f8adaa4f6d7a6bcfc605a7 Mon Sep 17 00:00:00 2001
From: quic-sanising <sanising@qti.qualcomm.com>
Date: Wed, 18 Mar 2026 14:14:26 -0700
Subject: [PATCH 4/6] Import smart_resize from qwen_vl_utils and allow user
 input for min and max pixels

Signed-off-by: quic-sanising <sanising@qti.qualcomm.com>
---
 .../models/qwen2_5_vl/modeling_qwen2_5_vl.py  | 63 ++++---------------
 .../qwen_vl/multi_specialization_inference.py | 19 +++---
 2 files changed, 22 insertions(+), 60 deletions(-)

diff --git a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
index 472a84e30..077e26582 100644
--- a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
+++ b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
@@ -12,6 +12,7 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from qwen_vl_utils import smart_resize
 from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLModel
 from transformers.cache_utils import Cache
 from transformers.modeling_outputs import (
@@ -1026,69 +1027,31 @@ def get_specializations(
             logger.warning(
                 f"Setting height and width to be {height} and {width} respectively, as it was neither passed nor found in vision_config"
             )
+        height = [height] if isinstance(height, int) else height
+        width = [width] if isinstance(width, int) else width
+
         prefill_seq_len = prefill_seq_len if prefill_seq_len else 128
         ctx_len = ctx_len if ctx_len else constants.INTERN_CTX_LEN
         channel = 3
         patch_size = self.config.vision_config.patch_size
         temporal_patch_size = self.config.vision_config.temporal_patch_size
 
-        # Modified from qwen_vl_utils/vision_process.py
         IMAGE_FACTOR = 28
-        MAX_RATIO = 200
         IMAGE_MIN_TOKEN_NUM = 4
         IMAGE_MAX_TOKEN_NUM = 16384
-
-        def round_by_factor(number: int, factor: int) -> int:
-            """Returns the closest integer to 'number' that is divisible by 'factor'."""
-            return round(number / factor) * factor
-
-        def ceil_by_factor(number: int, factor: int) -> int:
-            """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
-            return math.ceil(number / factor) * factor
-
-        def floor_by_factor(number: int, factor: int) -> int:
-            """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
-            return math.floor(number / factor) * factor
-
-        def smart_resize(
-            height: int,
-            width: int,
-            factor: int = IMAGE_FACTOR,
-            min_pixels: Optional[int] = None,
-            max_pixels: Optional[int] = None,
-        ) -> tuple[int, int]:
-            """
-            Rescales the image so that the following conditions are met:
-
-            1. Both dimensions (height and width) are divisible by 'factor'.
-            2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
-            3. The aspect ratio of the image is maintained as closely as possible.
-            """
-            max_pixels = max_pixels if max_pixels is not None else (IMAGE_MAX_TOKEN_NUM * factor ** 2)
-            min_pixels = min_pixels if min_pixels is not None else (IMAGE_MIN_TOKEN_NUM * factor ** 2)
-            assert max_pixels >= min_pixels, "The max_pixels of image must be greater than or equal to min_pixels."
-            if max(height, width) / min(height, width) > MAX_RATIO:
-                raise ValueError(
-                    f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
-                )
-            h_bar = max(factor, round_by_factor(height, factor))
-            w_bar = max(factor, round_by_factor(width, factor))
-            if h_bar * w_bar > max_pixels:
-                beta = math.sqrt((height * width) / max_pixels)
-                h_bar = floor_by_factor(height / beta, factor)
-                w_bar = floor_by_factor(width / beta, factor)
-            elif h_bar * w_bar < min_pixels:
-                beta = math.sqrt(min_pixels / (height * width))
-                h_bar = ceil_by_factor(height * beta, factor)
-                w_bar = ceil_by_factor(width * beta, factor)
-            return h_bar, w_bar
+        min_pixels = IMAGE_MIN_TOKEN_NUM * IMAGE_FACTOR**2
+        max_pixels = IMAGE_MAX_TOKEN_NUM * IMAGE_FACTOR**2
+        mm_processor_kwargs = compiler_options.pop("mm_processor_kwargs", None)
+        if mm_processor_kwargs:
+            min_pixels = mm_processor_kwargs.get("min_pixels", min_pixels)
+            max_pixels = mm_processor_kwargs.get("max_pixels", max_pixels)
 
         vision = []
         min_vision_size = ctx_len
-        height = [height] if isinstance(height, int) else height
-        width = [width] if isinstance(width, int) else width
         for h, w in zip(height, width):
-            resized_height, resized_width = smart_resize(height=h, width=w)
+            resized_height, resized_width = smart_resize(
+                height=h, width=w, factor=IMAGE_FACTOR, min_pixels=min_pixels, max_pixels=max_pixels
+            )
             grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
             grid_height = grid_h * grid_w
             grid_width = patch_size * patch_size * temporal_patch_size * channel
diff --git a/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py b/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py
index 4fd9b4c98..00dedbeb6 100644
--- a/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py
+++ b/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py
@@ -80,8 +80,9 @@
 else:
     batch_size = 1
     ctx_len = 14336
-    heights = [360, 320, 360, 454, 536, 640, 720, 910, 720, 1280, 1920]
-    widths = [120, 180, 240, 256, 354, 360, 480, 512, 576, 720, 1080]
+    widths = [360, 320, 360, 454, 536, 640, 720, 910, 720, 1280, 1920]
+    heights = [120, 180, 240, 256, 354, 360, 480, 512, 576, 720, 1080]
+    num_frames = [177, 139, 78, 64, 37, 30, 20, 16, 16, 7, 7]
 
     ## Vision + Text ##
     qeff_model.compile(
@@ -92,6 +93,11 @@
         num_devices=2,
         height=heights,
         width=widths,
+        num_frames=max(num_frames),
+        mm_processor_kwargs={
+            "min_pixels": 4 * 28 * 28,
+            "max_pixels": 16384 * 28 * 28,
+        },
         mxfp6_matmul=True,
         mxint8_kv_cache=True,
         aic_enable_depth_first=True,
@@ -100,12 +106,8 @@
 
     ### IMAGE + TEXT ###
     image_url = "https://picsum.photos/id/237/536/354"
-
     image = Image.open(requests.get(image_url, stream=True).raw)
-
-    ## Resize to any deimnsion present in specializations ##
-    image = image.resize((360, 120))
-
+    image = image.resize((360, 120))  # Resize to any deimnsion present in specializations (width, height)
     messages_1 = [
         {
             "role": "user",
@@ -115,7 +117,6 @@
             ],
         },
     ]
-
     messages_2 = [
         {
             "role": "user",
@@ -125,9 +126,7 @@
             ],
         },
     ]
-
     messages = [messages_1] * batch_size
-
     texts = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in messages]
 
     image_inputs, video_inputs = process_vision_info(messages)

From c8cf3229a78addf69421eae10f94471472b22e5d Mon Sep 17 00:00:00 2001
From: quic-sanising <sanising@qti.qualcomm.com>
Date: Wed, 18 Mar 2026 14:19:11 -0700
Subject: [PATCH 5/6] Reformat code

Signed-off-by: quic-sanising <sanising@qti.qualcomm.com>
---
 .../qwen_vl/multi_specialization_inference.py     | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py b/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py
index 00dedbeb6..dd36e8311 100644
--- a/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py
+++ b/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py
@@ -13,7 +13,7 @@
 
 from QEfficient import QEFFAutoModelForImageTextToText
 
-## For AWQ model update pytorch version to 2.8.*
+# For AWQ models, update PyTorch to version 2.8.*
 model_id = "Qwen/Qwen2.5-VL-3B-Instruct"
 config = AutoConfig.from_pretrained(model_id)
 config.text_config.num_hidden_layers = 2
@@ -24,13 +24,10 @@
 tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
 processor = AutoProcessor.from_pretrained(model_id)
 
-### use skip_vision=Ture, if want to run only text, ow false ###
+# Set skip_vision=True to run text-only inference; leave it False for vision + text.
 skip_vision = False
 
-if skip_vision:
-    ## Only Text ##
-
-    ## Set Batch_Size ##
+if skip_vision:  # Only Text
     batch_size = 1
     qeff_model.compile(
         batch_size=batch_size,
@@ -77,14 +74,13 @@
     print(tokenizer.batch_decode(output.generated_ids))
     print(output)
 
-else:
+else:  # Vision + Text
     batch_size = 1
     ctx_len = 14336
     widths = [360, 320, 360, 454, 536, 640, 720, 910, 720, 1280, 1920]
     heights = [120, 180, 240, 256, 354, 360, 480, 512, 576, 720, 1080]
     num_frames = [177, 139, 78, 64, 37, 30, 20, 16, 16, 7, 7]
 
-    ## Vision + Text ##
     qeff_model.compile(
         batch_size=batch_size,
         prefill_seq_len=128,
@@ -104,10 +100,9 @@
         mos=1,
     )
 
-    ### IMAGE + TEXT ###
     image_url = "https://picsum.photos/id/237/536/354"
     image = Image.open(requests.get(image_url, stream=True).raw)
-    image = image.resize((360, 120))  # Resize to any deimnsion present in specializations (width, height)
+    image = image.resize((360, 120))  # Resize to any dimension (width, height) present in specializations
     messages_1 = [
         {
             "role": "user",

From bba6252a9213a40ff2e320bbd752f65790e5ff03 Mon Sep 17 00:00:00 2001
From: quic-sanising <sanising@qti.qualcomm.com>
Date: Wed, 18 Mar 2026 15:44:51 -0700
Subject: [PATCH 6/6] Allow user to specify vision_size for decoder
 specialization

Signed-off-by: quic-sanising <sanising@qti.qualcomm.com>
---
 .../models/qwen2_5_vl/modeling_qwen2_5_vl.py   | 18 ++++++++++++------
 .../qwen_vl/multi_specialization_inference.py  |  2 ++
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
index 077e26582..61dd21bbb 100644
--- a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
+++ b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
@@ -1047,7 +1047,12 @@ def get_specializations(
             max_pixels = mm_processor_kwargs.get("max_pixels", max_pixels)
 
         vision = []
-        min_vision_size = ctx_len
+        min_vision_size = None
+        user_vision_size = compiler_options.pop("vision_size", None)
+        if user_vision_size and user_vision_size >= ctx_len:  # raise, not assert: asserts are stripped under -O
+            raise ValueError(f"vision_size ({user_vision_size}) must be less than ctx_len ({ctx_len})")
+        if not user_vision_size:
+            min_vision_size = ctx_len
         for h, w in zip(height, width):
             resized_height, resized_width = smart_resize(
                 height=h, width=w, factor=IMAGE_FACTOR, min_pixels=min_pixels, max_pixels=max_pixels
@@ -1057,7 +1062,8 @@ def get_specializations(
             grid_width = patch_size * patch_size * temporal_patch_size * channel
             vision_size = grid_height // 4
             grid_height = grid_height * batch_size
-            min_vision_size = min(min_vision_size, vision_size * num_frames)
+            if not user_vision_size:
+                min_vision_size = min(min_vision_size, vision_size * num_frames)
 
             vision.append(
                 {
@@ -1078,7 +1084,7 @@ def get_specializations(
                     "batch_size": 1 if continuous_batching else batch_size,
                     "seq_len": prefill_seq_len,
                     "ctx_len": ctx_len,
-                    "vision_size": min_vision_size,
+                    "vision_size": min_vision_size if not user_vision_size else user_vision_size,
                     "comp_ctx_lengths": comp_ctx_lengths_prefill[i],
                     "vision_batch_size": batch_size,
                 }
@@ -1097,7 +1103,7 @@ def get_specializations(
                     "batch_size": full_batch_size if continuous_batching else batch_size,
                     "seq_len": "1",
                     "ctx_len": ctx_len,
-                    "vision_size": min_vision_size,
+                    "vision_size": min_vision_size if not user_vision_size else user_vision_size,
                     "comp_ctx_lengths": comp_ctx_lengths_decode[i],
                     "vision_batch_size": batch_size,
                 }
@@ -1113,7 +1119,7 @@ def get_specializations(
                 "batch_size": 1 if continuous_batching else batch_size,
                 "seq_len": prefill_seq_len,
                 "ctx_len": ctx_len,
-                "vision_size": min_vision_size,
+                "vision_size": min_vision_size if not user_vision_size else user_vision_size,
                 "vision_batch_size": batch_size,
             }
 
@@ -1128,7 +1134,7 @@ def get_specializations(
                 "batch_size": full_batch_size if continuous_batching else batch_size,
                 "seq_len": 1,
                 "ctx_len": ctx_len,
-                "vision_size": min_vision_size,
+                "vision_size": min_vision_size if not user_vision_size else user_vision_size,
                 "vision_batch_size": batch_size,
             }
 
diff --git a/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py b/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py
index dd36e8311..b205bf65a 100644
--- a/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py
+++ b/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py
@@ -80,6 +80,7 @@
     widths = [360, 320, 360, 454, 536, 640, 720, 910, 720, 1280, 1920]
     heights = [120, 180, 240, 256, 354, 360, 480, 512, 576, 720, 1080]
     num_frames = [177, 139, 78, 64, 37, 30, 20, 16, 16, 7, 7]
+    user_vision_size = 9216
 
     qeff_model.compile(
         batch_size=batch_size,
@@ -94,6 +95,7 @@
             "min_pixels": 4 * 28 * 28,
             "max_pixels": 16384 * 28 * 28,
         },
+        vision_size=user_vision_size,
         mxfp6_matmul=True,
         mxint8_kv_cache=True,
         aic_enable_depth_first=True,