Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions QEfficient/transformers/models/modeling_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -1432,6 +1432,15 @@ def kv_offload_generate(
vision_inputs_fp16 = {"pixel_values", "image_masks"}
vision_inputs.update({k: vision_inputs[k].astype("float16") for k in vision_inputs_fp16 if k in vision_inputs})

pixel_values_shape = list(vision_inputs["pixel_values"].shape)
idx = next(i for i, inner in enumerate(vision_session.allowed_shapes) if (2, pixel_values_shape) in inner)

buffer_set = {
"vision_embeds": np.zeros(vision_session.allowed_shapes[idx][2][1], dtype=np.float16),
"image_grid_thw": np.zeros(vision_session.allowed_shapes[idx][0][1], dtype=np.int64),
}
vision_session.set_buffers(buffer_set)

vision_start = perf_counter()

vision_outputs = {}
Expand All @@ -1457,6 +1466,17 @@ def kv_offload_generate(
vision_session.deactivate()
lang_session.activate()

vision_outputs["vision_embeds"] = np.pad(
vision_outputs["vision_embeds"],
pad_width=(
(0, 0),
(0, lang_session.allowed_shapes[0][1][1][1] - vision_session.allowed_shapes[idx][2][1][1]),
(0, 0),
), # pad axis=1 only
mode="constant",
constant_values=0,
)

lang_session.set_buffers(vision_outputs)

if self.comp_ctx_lengths_prefill is not None:
Expand Down
120 changes: 48 additions & 72 deletions QEfficient/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from qwen_vl_utils import smart_resize
from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLModel
from transformers.cache_utils import Cache
from transformers.modeling_outputs import (
Expand Down Expand Up @@ -1008,8 +1009,8 @@ def get_specializations(
prefill_seq_len: int,
ctx_len: int,
img_size: None,
height: int = None,
width: int = None,
height: int | List[int] = None,
width: int | List[int] = None,
num_frames: int = 1,
kv_offload: bool = False,
continuous_batching: bool = False,
Expand All @@ -1026,79 +1027,54 @@ def get_specializations(
logger.warning(
f"Setting height and width to be {height} and {width} respectively, as it was neither passed nor found in vision_config"
)
height = [height] if isinstance(height, int) else height
width = [width] if isinstance(width, int) else width

prefill_seq_len = prefill_seq_len if prefill_seq_len else 128
ctx_len = ctx_len if ctx_len else constants.INTERN_CTX_LEN
channel = 3
patch_size = self.config.vision_config.patch_size
temporal_patch_size = self.config.vision_config.temporal_patch_size

IMAGE_FACTOR = 28
MIN_PIXELS = 4 * 28 * 28
MAX_PIXELS = 16384 * 28 * 28
MAX_RATIO = 200

def round_by_factor(number: int, factor: int) -> int:
"""Returns the closest integer to 'number' that is divisible by 'factor'."""
return round(number / factor) * factor

def ceil_by_factor(number: int, factor: int) -> int:
"""Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
return math.ceil(number / factor) * factor

def floor_by_factor(number: int, factor: int) -> int:
"""Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
return math.floor(number / factor) * factor

def smart_resize(
height: int,
width: int,
factor: int = IMAGE_FACTOR,
min_pixels: int = MIN_PIXELS,
max_pixels: int = MAX_PIXELS,
) -> tuple[int, int]:
"""
Rescales the image so that the following conditions are met:

1. Both dimensions (height and width) are divisible by 'factor'.

2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].

3. The aspect ratio of the image is maintained as closely as possible.
"""
if max(height, width) / min(height, width) > MAX_RATIO:
raise ValueError(
f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
)
h_bar = max(factor, round_by_factor(height, factor))
w_bar = max(factor, round_by_factor(width, factor))
if h_bar * w_bar > max_pixels:
beta = math.sqrt((height * width) / max_pixels)
h_bar = floor_by_factor(height / beta, factor)
w_bar = floor_by_factor(width / beta, factor)
elif h_bar * w_bar < min_pixels:
beta = math.sqrt(min_pixels / (height * width))
h_bar = ceil_by_factor(height * beta, factor)
w_bar = ceil_by_factor(width * beta, factor)
return h_bar, w_bar

resized_height, resized_width = smart_resize(height=height, width=width)
grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
grid_height = grid_h * grid_w
grid_width = patch_size * patch_size * temporal_patch_size * channel
vision_size = grid_height // 4
vision_size = vision_size * num_frames
grid_height = grid_height * batch_size

vision = [
{
"batch_size": batch_size,
"vision_size": vision_size,
"grid_height": grid_height,
"grid_width": grid_width,
"grid_h": grid_h,
"grid_w": grid_w,
}
]
IMAGE_MIN_TOKEN_NUM = 4
IMAGE_MAX_TOKEN_NUM = 16384
min_pixels = IMAGE_MIN_TOKEN_NUM * IMAGE_FACTOR**2
max_pixels = IMAGE_MAX_TOKEN_NUM * IMAGE_FACTOR**2
mm_processor_kwargs = compiler_options.pop("mm_processor_kwargs", None)
if mm_processor_kwargs:
min_pixels = mm_processor_kwargs.get("min_pixels", min_pixels)
max_pixels = mm_processor_kwargs.get("max_pixels", max_pixels)

vision = []
min_vision_size = None
user_vision_size = compiler_options.pop("vision_size", None)
if user_vision_size:
assert user_vision_size < ctx_len, "vision_size must be less than ctx_len"
else:
min_vision_size = ctx_len
for h, w in zip(height, width):
resized_height, resized_width = smart_resize(
height=h, width=w, factor=IMAGE_FACTOR, min_pixels=min_pixels, max_pixels=max_pixels
)
grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
grid_height = grid_h * grid_w
grid_width = patch_size * patch_size * temporal_patch_size * channel
vision_size = grid_height // 4
grid_height = grid_height * batch_size
if not user_vision_size:
min_vision_size = min(min_vision_size, vision_size * num_frames)

vision.append(
{
"batch_size": batch_size,
"vision_size": vision_size,
"grid_height": grid_height,
"grid_width": grid_width,
"grid_h": grid_h,
"grid_w": grid_w,
}
)

if comp_ctx_lengths_prefill is not None:
lang = []
Expand All @@ -1108,7 +1084,7 @@ def smart_resize(
"batch_size": 1 if continuous_batching else batch_size,
"seq_len": prefill_seq_len,
"ctx_len": ctx_len,
"vision_size": vision_size,
"vision_size": min_vision_size if not user_vision_size else user_vision_size,
"comp_ctx_lengths": comp_ctx_lengths_prefill[i],
"vision_batch_size": batch_size,
}
Expand All @@ -1127,7 +1103,7 @@ def smart_resize(
"batch_size": full_batch_size if continuous_batching else batch_size,
"seq_len": "1",
"ctx_len": ctx_len,
"vision_size": vision_size,
"vision_size": min_vision_size if not user_vision_size else user_vision_size,
"comp_ctx_lengths": comp_ctx_lengths_decode[i],
"vision_batch_size": batch_size,
}
Expand All @@ -1143,7 +1119,7 @@ def smart_resize(
"batch_size": 1 if continuous_batching else batch_size,
"seq_len": prefill_seq_len,
"ctx_len": ctx_len,
"vision_size": vision_size,
"vision_size": min_vision_size if not user_vision_size else user_vision_size,
"vision_batch_size": batch_size,
}

Expand All @@ -1158,7 +1134,7 @@ def smart_resize(
"batch_size": full_batch_size if continuous_batching else batch_size,
"seq_len": 1,
"ctx_len": ctx_len,
"vision_size": vision_size,
"vision_size": min_vision_size if not user_vision_size else user_vision_size,
"vision_batch_size": batch_size,
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------

import requests
import transformers
from PIL import Image
from qwen_vl_utils import process_vision_info
from transformers import AutoConfig, AutoProcessor, TextStreamer

from QEfficient import QEFFAutoModelForImageTextToText

# NOTE: for AWQ-quantized checkpoints, update the PyTorch version to 2.8.*
model_id = "Qwen/Qwen2.5-VL-3B-Instruct"
config = AutoConfig.from_pretrained(model_id)
# Shrink the text tower to 2 layers so the example compiles/runs quickly;
# remove this line to run the full model.
config.text_config.num_hidden_layers = 2

# kv_offload=True builds the two-QPC (vision + language) flow.
qeff_model = QEFFAutoModelForImageTextToText.from_pretrained(
    model_id, attn_implementation="eager", kv_offload=True, config=config
)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)

# Set skip_vision=True to exercise the text-only path below.
skip_vision = False

if skip_vision:  # Text-only path: compile and run without the vision encoder.
    batch_size = 1
    # height/width are still required by the specialization machinery even
    # though no image is processed on this path.
    qeff_model.compile(
        batch_size=batch_size,
        prefill_seq_len=128,
        ctx_len=4096,
        num_cores=16,
        num_devices=8,
        height=354,
        width=536,
        mxfp6_matmul=False,
        aic_enable_depth_first=True,
        skip_vision=True,
        mos=1,
    )

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Tell me about yourself."},
            ],
        },
    ]

    # Replicate the single conversation across the batch.
    messages = [messages] * batch_size

    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    )

    # Reshape/pad processor output into the layout the compiled QPC expects.
    inputs = qeff_model.model.prepare_inputs_for_generation(inputs=inputs, prefill_seq_len=128, batch_size=batch_size)

    # NOTE(review): streamer is created but never passed to generate() — wire it
    # in (if the API supports it) or drop it.
    streamer = TextStreamer(tokenizer)
    output = qeff_model.generate(inputs=inputs, generation_len=100)
    print(output.generated_ids)
    print(tokenizer.batch_decode(output.generated_ids))
    print(output)

else: # Vision + Text
batch_size = 1
ctx_len = 14336
widths = [360, 320, 360, 454, 536, 640, 720, 910, 720, 1280, 1920]
heights = [120, 180, 240, 256, 354, 360, 480, 512, 576, 720, 1080]
num_frames = [177, 139, 78, 64, 37, 30, 20, 16, 16, 7, 7]
user_vision_size = 9216

qeff_model.compile(
batch_size=batch_size,
prefill_seq_len=128,
ctx_len=ctx_len,
num_cores=16,
num_devices=2,
height=heights,
width=widths,
num_frames=max(num_frames),
mm_processor_kwargs={
"min_pixels": 4 * 28 * 28,
"max_pixels": 16384 * 28 * 28,
},
vision_size=user_vision_size,
mxfp6_matmul=True,
mxint8_kv_cache=True,
aic_enable_depth_first=True,
mos=1,
)

image_url = "https://picsum.photos/id/237/536/354"
image = Image.open(requests.get(image_url, stream=True).raw)
image = image.resize((360, 120)) # Resize to any dimension (width, height) present in specializations
messages_1 = [
{
"role": "user",
"content": [
{"type": "image", "image": image},
{"type": "text", "text": "Describe this image."},
],
},
]
messages_2 = [
{
"role": "user",
"content": [
{"type": "image", "image": image},
{"type": "text", "text": "Describe about the color of the dog."},
],
},
]
messages = [messages_1] * batch_size
texts = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in messages]

image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
text=texts,
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
)

inputs = qeff_model.model.prepare_inputs_for_generation(inputs=inputs, prefill_seq_len=128, batch_size=batch_size)

streamer = TextStreamer(tokenizer)
output = qeff_model.generate(inputs=inputs, generation_len=100)
print(output.generated_ids)
print(tokenizer.batch_decode(output.generated_ids))
print(output)