Qwen 2.5 Omni: apply video defaults (#37660)

pcuenca · web-flow · commit 63c6331387d7 · 2025-04-23T17:08:11.000+02:00
* Apply video defaults for min_pixels and max_pixels

* fps kwarg should not be a list

* Update test to account for new resizing
diff --git a/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py
@@ -61,6 +61,8 @@ class Qwen2_5OmniProcessorKwargs(ProcessingKwargs, total=False):
             "seconds_per_chunk": 2.0,
             "position_id_per_seconds": 25,
             "use_audio_in_video": False,
+            "min_pixels": 128 * 28 * 28,
+            "max_pixels": 768 * 28 * 28,
         },
         "audio_kwargs": {
             "sampling_rate": 16000,
@@ -147,7 +149,7 @@ def __call__(
         seconds_per_chunk = output_kwargs["videos_kwargs"].pop("seconds_per_chunk")
         position_id_per_seconds = output_kwargs["videos_kwargs"].pop("position_id_per_seconds")
         use_audio_in_video = output_kwargs["videos_kwargs"].pop("use_audio_in_video")
-        fps = output_kwargs["videos_kwargs"].pop("fps", None)
+        fps = output_kwargs["videos_kwargs"].pop("fps", 2.0)
 
         if audio is not None:
             output_kwargs["audio_kwargs"]["padding"] = "max_length"  # Support "max_length" padding only here
@@ -174,8 +176,7 @@ def __call__(
         if videos is not None:
             videos = make_batched_videos(videos)
             videos_inputs = self.image_processor(images=None, videos=videos, **output_kwargs["videos_kwargs"])
-            if fps is None:
-                fps = [2.0] * len(videos)
+            fps = [fps] * len(videos)
             videos_inputs["video_second_per_grid"] = [
                 self.image_processor.temporal_patch_size / fps[i] for i in range(len(fps))
             ]
diff --git a/tests/models/qwen2_5_omni/test_processor_qwen2_5_omni.py b/tests/models/qwen2_5_omni/test_processor_qwen2_5_omni.py
@@ -433,7 +433,7 @@ def test_apply_chat_template_video_frame_sampling(self):
             num_frames=num_frames,
         )
         self.assertTrue(self.videos_input_name in out_dict_with_video)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 9568)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 5760)
 
         # Load with `video_fps` arg
         video_fps = 1
@@ -445,7 +445,7 @@ def test_apply_chat_template_video_frame_sampling(self):
             video_fps=video_fps,
         )
         self.assertTrue(self.videos_input_name in out_dict_with_video)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 23920)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 14400)
 
         # Load with `video_fps` and `num_frames` args, should raise an error
         with self.assertRaises(ValueError):
@@ -466,7 +466,7 @@ def test_apply_chat_template_video_frame_sampling(self):
             return_dict=True,
         )
         self.assertTrue(self.videos_input_name in out_dict_with_video)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 717600)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 432000)
 
         # Load video as a list of frames (i.e. images). NOTE: each frame should have same size
         # because we assume they come from one video
@@ -484,7 +484,7 @@ def test_apply_chat_template_video_frame_sampling(self):
             return_dict=True,
         )
         self.assertTrue(self.videos_input_name in out_dict_with_video)
-        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 5704)
+        self.assertEqual(len(out_dict_with_video[self.videos_input_name]), 2904)
 
     @require_av
     def test_apply_chat_template_video_special_processing(self):