Skip to content

Commit 63c6331

Browse files
authored
Qwen 2.5 Omni: apply video defaults (#37660)
* Apply video defaults for min_pixels and max_pixels
* fps kwarg should not be a list
* Update test to account for new resizing
1 parent 1e90873 commit 63c6331

File tree

2 files changed: 8 additions (+8) and 7 deletions (-7).

2 files changed: 8 additions (+8) and 7 deletions (-7).

src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -61,6 +61,8 @@ class Qwen2_5OmniProcessorKwargs(ProcessingKwargs, total=False):
6161
"seconds_per_chunk": 2.0,
6262
"position_id_per_seconds": 25,
6363
"use_audio_in_video": False,
64+
"min_pixels": 128 * 28 * 28,
65+
"max_pixels": 768 * 28 * 28,
6466
},
6567
"audio_kwargs": {
6668
"sampling_rate": 16000,
@@ -147,7 +149,7 @@ def __call__(
147149
seconds_per_chunk = output_kwargs["videos_kwargs"].pop("seconds_per_chunk")
148150
position_id_per_seconds = output_kwargs["videos_kwargs"].pop("position_id_per_seconds")
149151
use_audio_in_video = output_kwargs["videos_kwargs"].pop("use_audio_in_video")
150-
fps = output_kwargs["videos_kwargs"].pop("fps", None)
152+
fps = output_kwargs["videos_kwargs"].pop("fps", 2.0)
151153

152154
if audio is not None:
153155
output_kwargs["audio_kwargs"]["padding"] = "max_length" # Support "max_length" padding only here
@@ -174,8 +176,7 @@ def __call__(
174176
if videos is not None:
175177
videos = make_batched_videos(videos)
176178
videos_inputs = self.image_processor(images=None, videos=videos, **output_kwargs["videos_kwargs"])
177-
if fps is None:
178-
fps = [2.0] * len(videos)
179+
fps = [fps] * len(videos)
179180
videos_inputs["video_second_per_grid"] = [
180181
self.image_processor.temporal_patch_size / fps[i] for i in range(len(fps))
181182
]

tests/models/qwen2_5_omni/test_processor_qwen2_5_omni.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -433,7 +433,7 @@ def test_apply_chat_template_video_frame_sampling(self):
433433
num_frames=num_frames,
434434
)
435435
self.assertTrue(self.videos_input_name in out_dict_with_video)
436-
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 9568)
436+
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 5760)
437437

438438
# Load with `video_fps` arg
439439
video_fps = 1
@@ -445,7 +445,7 @@ def test_apply_chat_template_video_frame_sampling(self):
445445
video_fps=video_fps,
446446
)
447447
self.assertTrue(self.videos_input_name in out_dict_with_video)
448-
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 23920)
448+
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 14400)
449449

450450
# Load with `video_fps` and `num_frames` args, should raise an error
451451
with self.assertRaises(ValueError):
@@ -466,7 +466,7 @@ def test_apply_chat_template_video_frame_sampling(self):
466466
return_dict=True,
467467
)
468468
self.assertTrue(self.videos_input_name in out_dict_with_video)
469-
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 717600)
469+
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 432000)
470470

471471
# Load video as a list of frames (i.e. images). NOTE: each frame should have same size
472472
# because we assume they come from one video
@@ -484,7 +484,7 @@ def test_apply_chat_template_video_frame_sampling(self):
484484
return_dict=True,
485485
)
486486
self.assertTrue(self.videos_input_name in out_dict_with_video)
487-
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 5704)
487+
self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 2904)
488488

489489
@require_av
490490
def test_apply_chat_template_video_special_processing(self):

0 commit comments

Comments
 (0)