Skip to content

Commit 1b4caad

Browse files
committed
fix(qwen3_vl): use image_patch_size=16 for video frame resizing
qwen_vl_utils.fetch_video() defaults to image_patch_size=14 (Qwen2 VL), causing video frames to be resized with factor=28 instead of the correct factor=32 for Qwen3 VL (patch_size=16). Fixes the dataset-side issue reported in #132.
1 parent d59ffb8 commit 1b4caad

2 files changed

Lines changed: 12 additions & 4 deletions

File tree

src/lmms_engine/datasets/iterable/qwen3_vl_iterable_dataset.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -91,13 +91,17 @@ def load_video_qwen_vl_utils(
9191
if self.config.video_sampling_strategy == "frame_num":
9292
n_frames = self.config.frame_num
9393
video_dict["nframes"] = n_frames
94-
video_inputs, sample_fps = fetch_video(video_dict, return_video_sample_fps=True, return_video_metadata=True)
94+
video_inputs, sample_fps = fetch_video(
95+
video_dict, image_patch_size=16, return_video_sample_fps=True, return_video_metadata=True
96+
)
9597
frames, video_metadata = video_inputs
9698
frames = frames.numpy()
9799
return frames, video_metadata, sample_fps
98100
elif self.config.video_sampling_strategy == "fps":
99101
video_dict["fps"] = fps
100-
video_inputs, sample_fps = fetch_video(video_dict, return_video_sample_fps=True, return_video_metadata=True)
102+
video_inputs, sample_fps = fetch_video(
103+
video_dict, image_patch_size=16, return_video_sample_fps=True, return_video_metadata=True
104+
)
101105
frames, video_metadata = video_inputs
102106
frames = frames.numpy()
103107
return frames, video_metadata, sample_fps

src/lmms_engine/datasets/naive/qwen3_vl_dataset.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,13 +101,17 @@ def load_video_qwen_vl_utils(
101101
if self.config.video_sampling_strategy == "frame_num":
102102
n_frames = self.config.frame_num
103103
video_dict["nframes"] = n_frames
104-
video_inputs, sample_fps = fetch_video(video_dict, return_video_sample_fps=True, return_video_metadata=True)
104+
video_inputs, sample_fps = fetch_video(
105+
video_dict, image_patch_size=16, return_video_sample_fps=True, return_video_metadata=True
106+
)
105107
frames, video_metadata = video_inputs
106108
frames = frames.numpy()
107109
return frames, video_metadata, sample_fps
108110
elif self.config.video_sampling_strategy == "fps":
109111
video_dict["fps"] = fps
110-
video_inputs, sample_fps = fetch_video(video_dict, return_video_sample_fps=True, return_video_metadata=True)
112+
video_inputs, sample_fps = fetch_video(
113+
video_dict, image_patch_size=16, return_video_sample_fps=True, return_video_metadata=True
114+
)
111115
frames, video_metadata = video_inputs
112116
frames = frames.numpy()
113117
return frames, video_metadata, sample_fps

0 commit comments

Comments (0)