diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index c242a97e3..62121d6b3 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -1432,6 +1432,15 @@ def kv_offload_generate( vision_inputs_fp16 = {"pixel_values", "image_masks"} vision_inputs.update({k: vision_inputs[k].astype("float16") for k in vision_inputs_fp16 if k in vision_inputs}) + pixel_values_shape = list(vision_inputs["pixel_values"].shape) + idx = next(i for i, inner in enumerate(vision_session.allowed_shapes) if (2, pixel_values_shape) in inner) + + buffer_set = { + "vision_embeds": np.zeros(vision_session.allowed_shapes[idx][2][1], dtype=np.float16), + "image_grid_thw": np.zeros(vision_session.allowed_shapes[idx][0][1], dtype=np.int64), + } + vision_session.set_buffers(buffer_set) + vision_start = perf_counter() vision_outputs = {} @@ -1457,6 +1466,17 @@ def kv_offload_generate( vision_session.deactivate() lang_session.activate() + vision_outputs["vision_embeds"] = np.pad( + vision_outputs["vision_embeds"], + pad_width=( + (0, 0), + (0, lang_session.allowed_shapes[0][1][1][1] - vision_session.allowed_shapes[idx][2][1][1]), + (0, 0), + ), # pad axis=1 only + mode="constant", + constant_values=0, + ) + lang_session.set_buffers(vision_outputs) if self.comp_ctx_lengths_prefill is not None: diff --git a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py index fa1bdd9b9..61dd21bbb 100644 --- a/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +++ b/QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py @@ -12,6 +12,7 @@ import torch import torch.nn as nn import torch.nn.functional as F +from qwen_vl_utils import smart_resize from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLModel from transformers.cache_utils import Cache from transformers.modeling_outputs 
import ( @@ -1008,8 +1009,8 @@ def get_specializations( prefill_seq_len: int, ctx_len: int, img_size: None, - height: int = None, - width: int = None, + height: int | List[int] = None, + width: int | List[int] = None, num_frames: int = 1, kv_offload: bool = False, continuous_batching: bool = False, @@ -1026,6 +1027,9 @@ def get_specializations( logger.warning( f"Setting height and width to be {height} and {width} respectively, as it was neither passed nor found in vision_config" ) + height = [height] if isinstance(height, int) else height + width = [width] if isinstance(width, int) else width + prefill_seq_len = prefill_seq_len if prefill_seq_len else 128 ctx_len = ctx_len if ctx_len else constants.INTERN_CTX_LEN channel = 3 @@ -1033,72 +1037,44 @@ def get_specializations( temporal_patch_size = self.config.vision_config.temporal_patch_size IMAGE_FACTOR = 28 - MIN_PIXELS = 4 * 28 * 28 - MAX_PIXELS = 16384 * 28 * 28 - MAX_RATIO = 200 - - def round_by_factor(number: int, factor: int) -> int: - """Returns the closest integer to 'number' that is divisible by 'factor'.""" - return round(number / factor) * factor - - def ceil_by_factor(number: int, factor: int) -> int: - """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'.""" - return math.ceil(number / factor) * factor - - def floor_by_factor(number: int, factor: int) -> int: - """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'.""" - return math.floor(number / factor) * factor - - def smart_resize( - height: int, - width: int, - factor: int = IMAGE_FACTOR, - min_pixels: int = MIN_PIXELS, - max_pixels: int = MAX_PIXELS, - ) -> tuple[int, int]: - """ - Rescales the image so that the following conditions are met: - - 1. Both dimensions (height and width) are divisible by 'factor'. - - 2. The total number of pixels is within the range ['min_pixels', 'max_pixels']. - - 3. The aspect ratio of the image is maintained as closely as possible. 
- """ - if max(height, width) / min(height, width) > MAX_RATIO: - raise ValueError( - f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}" - ) - h_bar = max(factor, round_by_factor(height, factor)) - w_bar = max(factor, round_by_factor(width, factor)) - if h_bar * w_bar > max_pixels: - beta = math.sqrt((height * width) / max_pixels) - h_bar = floor_by_factor(height / beta, factor) - w_bar = floor_by_factor(width / beta, factor) - elif h_bar * w_bar < min_pixels: - beta = math.sqrt(min_pixels / (height * width)) - h_bar = ceil_by_factor(height * beta, factor) - w_bar = ceil_by_factor(width * beta, factor) - return h_bar, w_bar - - resized_height, resized_width = smart_resize(height=height, width=width) - grid_h, grid_w = resized_height // patch_size, resized_width // patch_size - grid_height = grid_h * grid_w - grid_width = patch_size * patch_size * temporal_patch_size * channel - vision_size = grid_height // 4 - vision_size = vision_size * num_frames - grid_height = grid_height * batch_size - - vision = [ - { - "batch_size": batch_size, - "vision_size": vision_size, - "grid_height": grid_height, - "grid_width": grid_width, - "grid_h": grid_h, - "grid_w": grid_w, - } - ] + IMAGE_MIN_TOKEN_NUM = 4 + IMAGE_MAX_TOKEN_NUM = 16384 + min_pixels = IMAGE_MIN_TOKEN_NUM * IMAGE_FACTOR**2 + max_pixels = IMAGE_MAX_TOKEN_NUM * IMAGE_FACTOR**2 + mm_processor_kwargs = compiler_options.pop("mm_processor_kwargs", None) + if mm_processor_kwargs: + min_pixels = mm_processor_kwargs.get("min_pixels", min_pixels) + max_pixels = mm_processor_kwargs.get("max_pixels", max_pixels) + + vision = [] + min_vision_size = None + user_vision_size = compiler_options.pop("vision_size", None) + if user_vision_size: + assert user_vision_size < ctx_len, "vision_size must be less than ctx_len" + else: + min_vision_size = ctx_len + for h, w in zip(height, width): + resized_height, resized_width = smart_resize( + height=h, width=w, factor=IMAGE_FACTOR, 
min_pixels=min_pixels, max_pixels=max_pixels + ) + grid_h, grid_w = resized_height // patch_size, resized_width // patch_size + grid_height = grid_h * grid_w + grid_width = patch_size * patch_size * temporal_patch_size * channel + vision_size = grid_height // 4 + grid_height = grid_height * batch_size + if not user_vision_size: + min_vision_size = min(min_vision_size, vision_size * num_frames) + + vision.append( + { + "batch_size": batch_size, + "vision_size": vision_size, + "grid_height": grid_height, + "grid_width": grid_width, + "grid_h": grid_h, + "grid_w": grid_w, + } + ) if comp_ctx_lengths_prefill is not None: lang = [] @@ -1108,7 +1084,7 @@ def smart_resize( "batch_size": 1 if continuous_batching else batch_size, "seq_len": prefill_seq_len, "ctx_len": ctx_len, - "vision_size": vision_size, + "vision_size": min_vision_size if not user_vision_size else user_vision_size, "comp_ctx_lengths": comp_ctx_lengths_prefill[i], "vision_batch_size": batch_size, } @@ -1127,7 +1103,7 @@ def smart_resize( "batch_size": full_batch_size if continuous_batching else batch_size, "seq_len": "1", "ctx_len": ctx_len, - "vision_size": vision_size, + "vision_size": min_vision_size if not user_vision_size else user_vision_size, "comp_ctx_lengths": comp_ctx_lengths_decode[i], "vision_batch_size": batch_size, } @@ -1143,7 +1119,7 @@ def smart_resize( "batch_size": 1 if continuous_batching else batch_size, "seq_len": prefill_seq_len, "ctx_len": ctx_len, - "vision_size": vision_size, + "vision_size": min_vision_size if not user_vision_size else user_vision_size, "vision_batch_size": batch_size, } @@ -1158,7 +1134,7 @@ def smart_resize( "batch_size": full_batch_size if continuous_batching else batch_size, "seq_len": 1, "ctx_len": ctx_len, - "vision_size": vision_size, + "vision_size": min_vision_size if not user_vision_size else user_vision_size, "vision_batch_size": batch_size, } diff --git a/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py 
b/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py
new file mode 100644
index 000000000..b205bf65a
--- /dev/null
+++ b/examples/image_text_to_text/models/qwen_vl/multi_specialization_inference.py
@@ -0,0 +1,139 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+import requests
+import transformers
+from PIL import Image
+from qwen_vl_utils import process_vision_info
+from transformers import AutoConfig, AutoProcessor, TextStreamer
+
+from QEfficient import QEFFAutoModelForImageTextToText
+
+# For AWQ model update pytorch version to 2.8.*
+model_id = "Qwen/Qwen2.5-VL-3B-Instruct"
+config = AutoConfig.from_pretrained(model_id)
+
+qeff_model = QEFFAutoModelForImageTextToText.from_pretrained(
+    model_id, attn_implementation="eager", kv_offload=True, config=config
+)
+tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
+processor = AutoProcessor.from_pretrained(model_id)
+
+# Set skip_vision=True to run the text-only path.
+skip_vision = False
+
+if skip_vision:  # Only Text
+    batch_size = 1
+    qeff_model.compile(
+        batch_size=batch_size,
+        prefill_seq_len=128,
+        ctx_len=4096,
+        num_cores=16,
+        num_devices=8,
+        height=354,
+        width=536,
+        mxfp6_matmul=False,
+        aic_enable_depth_first=True,
+        skip_vision=True,
+        mos=1,
+    )
+
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Tell me about yourself."},
+            ],
+        },
+    ]
+
+    messages = [messages] * batch_size
+
+    inputs = processor.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        tokenize=True,
+        return_dict=True,
+        return_tensors="pt",
+    )
+
+    inputs = qeff_model.model.prepare_inputs_for_generation(inputs=inputs, prefill_seq_len=128, batch_size=batch_size)
+
+    streamer = TextStreamer(tokenizer)
+    output = qeff_model.generate(inputs=inputs, generation_len=100)
+    print(output.generated_ids)
+    print(tokenizer.batch_decode(output.generated_ids))
+    print(output)
+
+else:  # Vision + Text
+    batch_size = 1
+    ctx_len = 14336
+    widths = [360, 320, 360, 454, 536, 640, 720, 910, 720, 1280, 1920]
+    heights = [120, 180, 240, 256, 354, 360, 480, 512, 576, 720, 1080]
+    num_frames = [177, 139, 78, 64, 37, 30, 20, 16, 16, 7, 7]
+    user_vision_size = 9216
+
+    qeff_model.compile(
+        batch_size=batch_size,
+        prefill_seq_len=128,
+        ctx_len=ctx_len,
+        num_cores=16,
+        num_devices=2,
+        height=heights,
+        width=widths,
+        num_frames=max(num_frames),
+        mm_processor_kwargs={
+            "min_pixels": 4 * 28 * 28,
+            "max_pixels": 16384 * 28 * 28,
+        },
+        vision_size=user_vision_size,
+        mxfp6_matmul=True,
+        mxint8_kv_cache=True,
+        aic_enable_depth_first=True,
+        mos=1,
+    )
+
+    image_url = "https://picsum.photos/id/237/536/354"
+    image = Image.open(requests.get(image_url, stream=True).raw)
+    image = image.resize((360, 120))  # Resize to any dimension (width, height) present in specializations
+    messages_1 = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": "Describe this image."},
+            ],
+        },
+    ]
+    messages_2 = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": "Describe about the color of the dog."},
+            ],
+        },
+    ]
+    messages = [messages_1] * batch_size
+    texts = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in messages]
+
+    image_inputs, video_inputs = process_vision_info(messages)
+    inputs = processor(
+        text=texts,
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    )
+
+    inputs = qeff_model.model.prepare_inputs_for_generation(inputs=inputs, prefill_seq_len=128, batch_size=batch_size)
+
+    streamer = TextStreamer(tokenizer)
+    output = qeff_model.generate(inputs=inputs, generation_len=100)
+    print(output.generated_ids)
+    print(tokenizer.batch_decode(output.generated_ids))
+    print(output)