From 5dba890f3291939191c99fc435e4505c65a2bd41 Mon Sep 17 00:00:00 2001 From: quic-sanising Date: Mon, 16 Mar 2026 16:44:36 -0700 Subject: [PATCH 1/6] Copy changes from PR #755 Signed-off-by: quic-sanising --- .../transformers/models/modeling_auto.py | 20 +++ .../models/qwen2_5_vl/modeling_qwen2_5_vl.py | 57 ++++--- .../qwen_vl/multi_specialization_inference.py | 148 ++++++++++++++++++ 3 files changed, 201 insertions(+), 24 deletions(-) create mode 100644 examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index c242a97e3..62121d6b3 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -1432,6 +1432,15 @@ def kv_offload_generate( vision_inputs_fp16 = {"pixel_values", "image_masks"} vision_inputs.update({k: vision_inputs[k].astype("float16") for k in vision_inputs_fp16 if k in vision_inputs}) + pixel_values_shape = list(vision_inputs["pixel_values"].shape) + idx = next(i for i, inner in enumerate(vision_session.allowed_shapes) if (2, pixel_values_shape) in inner) + + buffer_set = { + "vision_embeds": np.zeros(vision_session.allowed_shapes[idx][2][1], dtype=np.float16), + "image_grid_thw": np.zeros(vision_session.allowed_shapes[idx][0][1], dtype=np.int64), + } + vision_session.set_buffers(buffer_set) + vision_start = perf_counter() vision_outputs = {} @@ -1457,6 +1466,17 @@ def kv_offload_generate( vision_session.deactivate() lang_session.activate() + vision_outputs["vision_embeds"] = np.pad( + vision_outputs["vision_embeds"], + pad_width=( + (0, 0), + (0, lang_session.allowed_shapes[0][1][1][1] - vision_session.allowed_shapes[idx][2][1][1]), + (0, 0), + ), # pad axis=1 only + mode="constant", + constant_values=0, + ) + lang_session.set_buffers(vision_outputs) if self.comp_ctx_lengths_prefill is not None: diff --git a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py index fa1bdd9b9..c9e622e08 100644 --- a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +++ b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py @@ -1008,8 +1008,8 @@ def get_specializations( prefill_seq_len: int, ctx_len: int, img_size: None, - height: int = None, - width: int = None, + height: int | List[int] = None, + width: int | List[int] = None, num_frames: int = 1, kv_offload: bool = False, continuous_batching: bool = False, @@ -1081,24 +1081,33 @@ def smart_resize( w_bar = ceil_by_factor(width * beta, factor) return h_bar, w_bar - resized_height, resized_width = smart_resize(height=height, width=width) - grid_h, grid_w = resized_height // patch_size, resized_width // patch_size - grid_height = grid_h * grid_w - grid_width = patch_size * patch_size * temporal_patch_size * channel - vision_size = grid_height // 4 - vision_size = vision_size * num_frames - grid_height = grid_height * batch_size - - vision = [ - { - "batch_size": batch_size, - "vision_size": vision_size, - "grid_height": grid_height, - "grid_width": grid_width, - "grid_h": grid_h, - "grid_w": grid_w, - } - ] + vision = [] + max_vision_size = 0 + + height = [height] if isinstance(height, int) else height + width = [width] if isinstance(width, int) else width + + for h, w in zip(height, width): + resized_height, resized_width = smart_resize(height=h, width=w) + grid_h, grid_w = resized_height // patch_size, resized_width // patch_size + grid_height = grid_h * grid_w + grid_width = patch_size * patch_size * temporal_patch_size * channel + vision_size = grid_height // 4 + vision_size = vision_size * num_frames + grid_height = grid_height * batch_size + + max_vision_size = max(max_vision_size, vision_size) + + vision.append( + { + "batch_size": batch_size, + "vision_size": vision_size, + "grid_height": grid_height, + "grid_width": grid_width, + "grid_h": grid_h, + "grid_w": grid_w, + } + ) if comp_ctx_lengths_prefill is not None: lang = [] @@ -1108,7 +1117,7 @@ def smart_resize( "batch_size": 1 if continuous_batching else batch_size, "seq_len": prefill_seq_len, "ctx_len": ctx_len, - "vision_size": vision_size, + "vision_size": max_vision_size, "comp_ctx_lengths": comp_ctx_lengths_prefill[i], "vision_batch_size": batch_size, } @@ -1127,7 +1136,7 @@ def smart_resize( "batch_size": full_batch_size if continuous_batching else batch_size, "seq_len": "1", "ctx_len": ctx_len, - "vision_size": vision_size, + "vision_size": max_vision_size, "comp_ctx_lengths": comp_ctx_lengths_decode[i], "vision_batch_size": batch_size, } @@ -1143,7 +1152,7 @@ def smart_resize( "batch_size": 1 if continuous_batching else batch_size, "seq_len": prefill_seq_len, "ctx_len": ctx_len, - "vision_size": vision_size, + "vision_size": max_vision_size, "vision_batch_size": batch_size, } @@ -1158,7 +1167,7 @@ def smart_resize( "batch_size": full_batch_size if continuous_batching else batch_size, "seq_len": 1, "ctx_len": ctx_len, - "vision_size": vision_size, + "vision_size": max_vision_size, "vision_batch_size": batch_size, } diff --git a/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py b/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py new file mode 100644 index 000000000..cdcefaa2b --- /dev/null +++ b/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py @@ -0,0 +1,148 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import requests +import transformers +from PIL import Image +from qwen_vl_utils import process_vision_info +from transformers import AutoConfig, AutoProcessor, TextStreamer + +from QEfficient import QEFFAutoModelForImageTextToText + +## For AWQ model update pytorch version to 2.8.* +model_id = "Qwen/Qwen2.5-VL-3B-Instruct" +config = AutoConfig.from_pretrained(model_id) +config.text_config.num_hidden_layers = 2 + +qeff_model = QEFFAutoModelForImageTextToText.from_pretrained( + model_id, attn_implementation="eager", kv_offload=True, config=config +) +tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) +processor = AutoProcessor.from_pretrained(model_id) + +### use skip_vision=Ture, if want to run only text, ow false ### +skip_vision = False + +if skip_vision: + ## Only Text ## + + ## Set Batch_Size ## + batch_size = 1 + qeff_model.compile( + batch_size=batch_size, + prefill_seq_len=128, + ctx_len=4096, + num_cores=16, + num_devices=8, + height=354, + width=536, + mxfp6_matmul=False, + aic_enable_depth_first=True, + skip_vision=True, + mos=1, + ) + + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Tell me about yourself."}, + ], + }, + ] + + messages = [messages] * batch_size + + inputs = processor.apply_chat_template( + messages, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + return_tensors="pt", + ) + + import ipdb + + ipdb.set_trace() + + inputs = qeff_model.model.prepare_inputs_for_generation(inputs=inputs, prefill_seq_len=128, batch_size=batch_size) + + streamer = TextStreamer(tokenizer) + output = qeff_model.generate(inputs=inputs, generation_len=100) + print(output.generated_ids) + print(tokenizer.batch_decode(output.generated_ids)) + print(output) + +else: + batch_size = 1 + ctx_len = 5120 + + ## The dimensions list stores all the height × width pairs required for compilation ## + dimensions = [[354, 536], [180, 320], [240, 360], [120, 360]] + + ## Vision + Text ## + qeff_model.compile( + batch_size=batch_size, + prefill_seq_len=128, + ctx_len=5120, + num_cores=16, + num_devices=8, + dimensions=dimensions, + mxfp6_matmul=True, + mxint8_kv_cache=True, + aic_enable_depth_first=True, + mos=1, + ) + + ### IMAGE + TEXT ### + image_url = "https://picsum.photos/id/237/536/354" + + image = Image.open(requests.get(image_url, stream=True).raw) + + ## Resize to any deimnsion present in specializations ## + image = image.resize((360, 120)) + + messages_1 = [ + { + "role": "user", + "content": [ + {"type": "image", "image": image}, + {"type": "text", "text": "Describe this image."}, + ], + }, + ] + + messages_2 = [ + { + "role": "user", + "content": [ + {"type": "image", "image": image}, + {"type": "text", "text": "Describe about the color of the dog."}, + ], + }, + ] + + messages = [messages_1] * batch_size + + texts = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in messages] + + image_inputs, video_inputs = process_vision_info(messages) + inputs = processor( + text=texts, + images=image_inputs, + videos=video_inputs, + padding=True, + return_tensors="pt", + ) + + inputs = qeff_model.model.prepare_inputs_for_generation(inputs=inputs, prefill_seq_len=128, batch_size=batch_size) + + streamer = TextStreamer(tokenizer) + output = qeff_model.generate(inputs=inputs, generation_len=100) + print(output.generated_ids) + print(tokenizer.batch_decode(output.generated_ids)) + print(output) From 3e6208f76bc7fae99f415de532584f7c696dec47 Mon Sep 17 00:00:00 2001 From: quic-sanising Date: Tue, 17 Mar 2026 17:07:14 -0700 Subject: [PATCH 2/6] Fix logic to calculate vision tokens Signed-off-by: quic-sanising --- .../models/qwen2_5_vl/modeling_qwen2_5_vl.py | 30 +++++++++---------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py index c9e622e08..472a84e30 100644 --- a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +++ b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py @@ -1032,10 +1032,11 @@ def get_specializations( patch_size = self.config.vision_config.patch_size temporal_patch_size = self.config.vision_config.temporal_patch_size + # Modified from qwen_vl_utils/vision_process.py IMAGE_FACTOR = 28 - MIN_PIXELS = 4 * 28 * 28 - MAX_PIXELS = 16384 * 28 * 28 MAX_RATIO = 200 + IMAGE_MIN_TOKEN_NUM = 4 + IMAGE_MAX_TOKEN_NUM = 16384 def round_by_factor(number: int, factor: int) -> int: """Returns the closest integer to 'number' that is divisible by 'factor'.""" @@ -1053,18 +1054,19 @@ def smart_resize( height: int, width: int, factor: int = IMAGE_FACTOR, - min_pixels: int = MIN_PIXELS, - max_pixels: int = MAX_PIXELS, + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None, ) -> tuple[int, int]: """ Rescales the image so that the following conditions are met: 1. Both dimensions (height and width) are divisible by 'factor'. - 2. The total number of pixels is within the range ['min_pixels', 'max_pixels']. - 3. The aspect ratio of the image is maintained as closely as possible. """ + max_pixels = max_pixels if max_pixels is not None else (IMAGE_MAX_TOKEN_NUM * factor ** 2) + min_pixels = min_pixels if min_pixels is not None else (IMAGE_MIN_TOKEN_NUM * factor ** 2) + assert max_pixels >= min_pixels, "The max_pixels of image must be greater than or equal to min_pixels." if max(height, width) / min(height, width) > MAX_RATIO: raise ValueError( f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}" @@ -1082,21 +1084,17 @@ def smart_resize( return h_bar, w_bar vision = [] - max_vision_size = 0 - + min_vision_size = ctx_len height = [height] if isinstance(height, int) else height width = [width] if isinstance(width, int) else width - for h, w in zip(height, width): resized_height, resized_width = smart_resize(height=h, width=w) grid_h, grid_w = resized_height // patch_size, resized_width // patch_size grid_height = grid_h * grid_w grid_width = patch_size * patch_size * temporal_patch_size * channel vision_size = grid_height // 4 - vision_size = vision_size * num_frames grid_height = grid_height * batch_size - - max_vision_size = max(max_vision_size, vision_size) + min_vision_size = min(min_vision_size, vision_size * num_frames) vision.append( { @@ -1117,7 +1115,7 @@ def smart_resize( "batch_size": 1 if continuous_batching else batch_size, "seq_len": prefill_seq_len, "ctx_len": ctx_len, - "vision_size": max_vision_size, + "vision_size": min_vision_size, "comp_ctx_lengths": comp_ctx_lengths_prefill[i], "vision_batch_size": batch_size, } @@ -1136,7 +1134,7 @@ def smart_resize( "batch_size": full_batch_size if continuous_batching else batch_size, "seq_len": "1", "ctx_len": ctx_len, - "vision_size": max_vision_size, + "vision_size": min_vision_size, "comp_ctx_lengths": comp_ctx_lengths_decode[i], "vision_batch_size": batch_size, } @@ -1152,7 +1150,7 @@ def smart_resize( "batch_size": 1 if continuous_batching else batch_size, "seq_len": prefill_seq_len, "ctx_len": ctx_len, - "vision_size": max_vision_size, + "vision_size": min_vision_size, "vision_batch_size": batch_size, } @@ -1167,7 +1165,7 @@ def smart_resize( "batch_size": full_batch_size if continuous_batching else batch_size, "seq_len": 1, "ctx_len": ctx_len, - "vision_size": max_vision_size, + "vision_size": min_vision_size, "vision_batch_size": batch_size, } From 58183e9fc0ce44987944c6349041e45f71f034b8 Mon Sep 17 00:00:00 2001 From: quic-sanising Date: Tue, 17 Mar 2026 17:20:43 -0700 Subject: [PATCH 3/6] Update example to remove dimensions Signed-off-by: quic-sanising --- .../qwen_vl/multi_specialization_inference.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py b/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py index cdcefaa2b..4fd9b4c98 100644 --- a/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py +++ b/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py @@ -79,19 +79,19 @@ else: batch_size = 1 - ctx_len = 5120 - - ## The dimensions list stores all the height × width pairs required for compilation ## - dimensions = [[354, 536], [180, 320], [240, 360], [120, 360]] + ctx_len = 14336 + heights = [360, 320, 360, 454, 536, 640, 720, 910, 720, 1280, 1920] + widths = [120, 180, 240, 256, 354, 360, 480, 512, 576, 720, 1080] ## Vision + Text ## qeff_model.compile( batch_size=batch_size, prefill_seq_len=128, - ctx_len=5120, + ctx_len=ctx_len, num_cores=16, - num_devices=8, - dimensions=dimensions, + num_devices=2, + height=heights, + width=widths, mxfp6_matmul=True, mxint8_kv_cache=True, aic_enable_depth_first=True, From 6dede8c651687db084f8adaa4f6d7a6bcfc605a7 Mon Sep 17 00:00:00 2001 From: quic-sanising Date: Wed, 18 Mar 2026 14:14:26 -0700 Subject: [PATCH 4/6] Import smart_resize from qwen_vl_utils and allow user input for min and max pixels Signed-off-by: quic-sanising --- .../models/qwen2_5_vl/modeling_qwen2_5_vl.py | 63 ++++--------------- .../qwen_vl/multi_specialization_inference.py | 19 +++--- 2 files changed, 22 insertions(+), 60 deletions(-) diff --git a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py index 472a84e30..077e26582 100644 --- a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +++ b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py @@ -12,6 +12,7 @@ import torch import torch.nn as nn import torch.nn.functional as F +from qwen_vl_utils import smart_resize from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLModel from transformers.cache_utils import Cache from transformers.modeling_outputs import ( @@ -1026,69 +1027,31 @@ def get_specializations( logger.warning( f"Setting height and width to be {height} and {width} respectively, as it was neither passed nor found in vision_config" ) + height = [height] if isinstance(height, int) else height + width = [width] if isinstance(width, int) else width + prefill_seq_len = prefill_seq_len if prefill_seq_len else 128 ctx_len = ctx_len if ctx_len else constants.INTERN_CTX_LEN channel = 3 patch_size = self.config.vision_config.patch_size temporal_patch_size = self.config.vision_config.temporal_patch_size - # Modified from qwen_vl_utils/vision_process.py IMAGE_FACTOR = 28 - MAX_RATIO = 200 IMAGE_MIN_TOKEN_NUM = 4 IMAGE_MAX_TOKEN_NUM = 16384 - - def round_by_factor(number: int, factor: int) -> int: - """Returns the closest integer to 'number' that is divisible by 'factor'.""" - return round(number / factor) * factor - - def ceil_by_factor(number: int, factor: int) -> int: - """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'.""" - return math.ceil(number / factor) * factor - - def floor_by_factor(number: int, factor: int) -> int: - """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'.""" - return math.floor(number / factor) * factor - - def smart_resize( - height: int, - width: int, - factor: int = IMAGE_FACTOR, - min_pixels: Optional[int] = None, - max_pixels: Optional[int] = None, - ) -> tuple[int, int]: - """ - Rescales the image so that the following conditions are met: - - 1. Both dimensions (height and width) are divisible by 'factor'. - 2. The total number of pixels is within the range ['min_pixels', 'max_pixels']. - 3. The aspect ratio of the image is maintained as closely as possible. - """ - max_pixels = max_pixels if max_pixels is not None else (IMAGE_MAX_TOKEN_NUM * factor ** 2) - min_pixels = min_pixels if min_pixels is not None else (IMAGE_MIN_TOKEN_NUM * factor ** 2) - assert max_pixels >= min_pixels, "The max_pixels of image must be greater than or equal to min_pixels." - if max(height, width) / min(height, width) > MAX_RATIO: - raise ValueError( - f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}" - ) - h_bar = max(factor, round_by_factor(height, factor)) - w_bar = max(factor, round_by_factor(width, factor)) - if h_bar * w_bar > max_pixels: - beta = math.sqrt((height * width) / max_pixels) - h_bar = floor_by_factor(height / beta, factor) - w_bar = floor_by_factor(width / beta, factor) - elif h_bar * w_bar < min_pixels: - beta = math.sqrt(min_pixels / (height * width)) - h_bar = ceil_by_factor(height * beta, factor) - w_bar = ceil_by_factor(width * beta, factor) - return h_bar, w_bar + min_pixels = IMAGE_MIN_TOKEN_NUM * IMAGE_FACTOR**2 + max_pixels = IMAGE_MAX_TOKEN_NUM * IMAGE_FACTOR**2 + mm_processor_kwargs = compiler_options.pop("mm_processor_kwargs", None) + if mm_processor_kwargs: + min_pixels = mm_processor_kwargs.get("min_pixels", min_pixels) + max_pixels = mm_processor_kwargs.get("max_pixels", max_pixels) vision = [] min_vision_size = ctx_len - height = [height] if isinstance(height, int) else height - width = [width] if isinstance(width, int) else width for h, w in zip(height, width): - resized_height, resized_width = smart_resize(height=h, width=w) + resized_height, resized_width = smart_resize( + height=h, width=w, factor=IMAGE_FACTOR, min_pixels=min_pixels, max_pixels=max_pixels + ) grid_h, grid_w = resized_height // patch_size, resized_width // patch_size grid_height = grid_h * grid_w grid_width = patch_size * patch_size * temporal_patch_size * channel diff --git a/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py b/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py index 4fd9b4c98..00dedbeb6 100644 --- a/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py +++ b/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py @@ -80,8 +80,9 @@ else: batch_size = 1 ctx_len = 14336 - heights = [360, 320, 360, 454, 536, 640, 720, 910, 720, 1280, 1920] - widths = [120, 180, 240, 256, 354, 360, 480, 512, 576, 720, 1080] + widths = [360, 320, 360, 454, 536, 640, 720, 910, 720, 1280, 1920] + heights = [120, 180, 240, 256, 354, 360, 480, 512, 576, 720, 1080] + num_frames = [177, 139, 78, 64, 37, 30, 20, 16, 16, 7, 7] ## Vision + Text ## qeff_model.compile( @@ -92,6 +93,11 @@ num_devices=2, height=heights, width=widths, + num_frames=max(num_frames), + mm_processor_kwargs={ + "min_pixels": 4 * 28 * 28, + "max_pixels": 16384 * 28 * 28, + }, mxfp6_matmul=True, mxint8_kv_cache=True, aic_enable_depth_first=True, @@ -100,12 +106,8 @@ ### IMAGE + TEXT ### image_url = "https://picsum.photos/id/237/536/354" - image = Image.open(requests.get(image_url, stream=True).raw) - - ## Resize to any deimnsion present in specializations ## - image = image.resize((360, 120)) - + image = image.resize((360, 120)) # Resize to any deimnsion present in specializations (width, height) messages_1 = [ { "role": "user", @@ -115,7 +117,6 @@ ], }, ] - messages_2 = [ { "role": "user", @@ -125,9 +126,7 @@ ], }, ] - messages = [messages_1] * batch_size - texts = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in messages] image_inputs, video_inputs = process_vision_info(messages) From c8cf3229a78addf69421eae10f94471472b22e5d Mon Sep 17 00:00:00 2001 From: quic-sanising Date: Wed, 18 Mar 2026 14:19:11 -0700 Subject: [PATCH 5/6] Reformat code Signed-off-by: quic-sanising --- .../qwen_vl/multi_specialization_inference.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py b/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py index 00dedbeb6..dd36e8311 100644 --- a/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py +++ b/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py @@ -13,7 +13,7 @@ from QEfficient import QEFFAutoModelForImageTextToText -## For AWQ model update pytorch version to 2.8.* +# For AWQ model update pytorch version to 2.8.* model_id = "Qwen/Qwen2.5-VL-3B-Instruct" config = AutoConfig.from_pretrained(model_id) config.text_config.num_hidden_layers = 2 @@ -24,13 +24,10 @@ tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) processor = AutoProcessor.from_pretrained(model_id) -### use skip_vision=Ture, if want to run only text, ow false ### +# use skip_vision=True, if want to run only text skip_vision = False -if skip_vision: - ## Only Text ## - - ## Set Batch_Size ## +if skip_vision: # Only Text batch_size = 1 qeff_model.compile( batch_size=batch_size, @@ -77,14 +74,13 @@ print(tokenizer.batch_decode(output.generated_ids)) print(output) -else: +else: # Vision + Text batch_size = 1 ctx_len = 14336 widths = [360, 320, 360, 454, 536, 640, 720, 910, 720, 1280, 1920] heights = [120, 180, 240, 256, 354, 360, 480, 512, 576, 720, 1080] num_frames = [177, 139, 78, 64, 37, 30, 20, 16, 16, 7, 7] - ## Vision + Text ## qeff_model.compile( batch_size=batch_size, prefill_seq_len=128, @@ -104,10 +100,9 @@ mos=1, ) - ### IMAGE + TEXT ### image_url = "https://picsum.photos/id/237/536/354" image = Image.open(requests.get(image_url, stream=True).raw) - image = image.resize((360, 120)) # Resize to any deimnsion present in specializations (width, height) + image = image.resize((360, 120)) # Resize to any dimension (width, height) present in specializations messages_1 = [ { "role": "user", From bba6252a9213a40ff2e320bbd752f65790e5ff03 Mon Sep 17 00:00:00 2001 From: quic-sanising Date: Wed, 18 Mar 2026 15:44:51 -0700 Subject: [PATCH 6/6] Allow user to specify vision_size for decoder specialization Signed-off-by: quic-sanising --- .../models/qwen2_5_vl/modeling_qwen2_5_vl.py | 18 ++++++++++++------ .../qwen_vl/multi_specialization_inference.py | 2 ++ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py index 077e26582..61dd21bbb 100644 --- a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +++ b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py @@ -1047,7 +1047,12 @@ def get_specializations( max_pixels = mm_processor_kwargs.get("max_pixels", max_pixels) vision = [] - min_vision_size = ctx_len + min_vision_size = None + user_vision_size = compiler_options.pop("vision_size", None) + if user_vision_size: + assert user_vision_size < ctx_len, "vision_size must be less than ctx_len" + else: + min_vision_size = ctx_len for h, w in zip(height, width): resized_height, resized_width = smart_resize( height=h, width=w, factor=IMAGE_FACTOR, min_pixels=min_pixels, max_pixels=max_pixels @@ -1057,7 +1062,8 @@ def get_specializations( grid_width = patch_size * patch_size * temporal_patch_size * channel vision_size = grid_height // 4 grid_height = grid_height * batch_size - min_vision_size = min(min_vision_size, vision_size * num_frames) + if not user_vision_size: + min_vision_size = min(min_vision_size, vision_size * num_frames) vision.append( { @@ -1078,7 +1084,7 @@ def get_specializations( "batch_size": 1 if continuous_batching else batch_size, "seq_len": prefill_seq_len, "ctx_len": ctx_len, - "vision_size": min_vision_size, + "vision_size": min_vision_size if not user_vision_size else user_vision_size, "comp_ctx_lengths": comp_ctx_lengths_prefill[i], "vision_batch_size": batch_size, } @@ -1097,7 +1103,7 @@ def get_specializations( "batch_size": full_batch_size if continuous_batching else batch_size, "seq_len": "1", "ctx_len": ctx_len, - "vision_size": min_vision_size, + "vision_size": min_vision_size if not user_vision_size else user_vision_size, "comp_ctx_lengths": comp_ctx_lengths_decode[i], "vision_batch_size": batch_size, } @@ -1113,7 +1119,7 @@ def get_specializations( "batch_size": 1 if continuous_batching else batch_size, "seq_len": prefill_seq_len, "ctx_len": ctx_len, - "vision_size": min_vision_size, + "vision_size": min_vision_size if not user_vision_size else user_vision_size, "vision_batch_size": batch_size, } @@ -1128,7 +1134,7 @@ def get_specializations( "batch_size": full_batch_size if continuous_batching else batch_size, "seq_len": 1, "ctx_len": ctx_len, - "vision_size": min_vision_size, + "vision_size": min_vision_size if not user_vision_size else user_vision_size, "vision_batch_size": batch_size, } diff --git a/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py b/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py index dd36e8311..b205bf65a 100644 --- a/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py +++ b/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py @@ -80,6 +80,7 @@ widths = [360, 320, 360, 454, 536, 640, 720, 910, 720, 1280, 1920] heights = [120, 180, 240, 256, 354, 360, 480, 512, 576, 720, 1080] num_frames = [177, 139, 78, 64, 37, 30, 20, 16, 16, 7, 7] + user_vision_size = 9216 qeff_model.compile( batch_size=batch_size, @@ -94,6 +95,7 @@ "min_pixels": 4 * 28 * 28, "max_pixels": 16384 * 28 * 28, }, + vision_size=user_vision_size, mxfp6_matmul=True, mxint8_kv_cache=True, aic_enable_depth_first=True,