
Commit 5cd6b64

Process inputs directly in apply_chat_template in image-text-to-text pipeline (#35616)
* tokenize inputs directly in apply_chat_template
* refactor processing
* revert changes processing llava
* Update docs
* fix issue with str being iterable
* add test chat text only
* change function name
1 parent 80ea2c0 commit 5cd6b64

File tree

- docs/source/en/tasks/image_text_to_text.md
- src/transformers/pipelines/image_text_to_text.py
- tests/pipelines/test_pipelines_image_text_to_text.py

3 files changed: +186, -54 lines changed
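
At the core of the change (per the first bullet of the commit message), the pipeline's `preprocess` step now hands the chat straight to the processor's `apply_chat_template` with `tokenize=True` and `return_dict=True`, instead of first rendering a prompt string and processing it in a second pass. Below is a minimal sketch of that call pattern outside the pipeline, using the llava-hf/llava-interleave-qwen-0.5b-hf checkpoint that the new tests also use; the messages are illustrative and not part of the commit.

```python
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("llava-hf/llava-interleave-qwen-0.5b-hf")

messages = [
    {"role": "user", "content": [{"type": "text", "text": "What is the capital of France?"}]}
]

# tokenize=True + return_dict=True return model-ready tensors (input_ids,
# attention_mask, and pixel values when images are present), which is what
# the refactored preprocess() now forwards to the model directly.
model_inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
print(sorted(model_inputs.keys()))
```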

docs/source/en/tasks/image_text_to_text.md

Lines changed: 42 additions & 1 deletion
````diff
@@ -160,7 +160,48 @@ outputs[0]["generated_text"]
 # with a yellow center in the foreground. The flower is surrounded by red and white flowers with green stems
 ```
 
-## Streaming
+If you prefer, you can also load the images separately and pass them to the pipeline like so:
+
+```python
+pipe = pipeline("image-text-to-text", model="HuggingFaceTB/SmolVLM-256M-Instruct")
+
+img_urls = [
+    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png",
+    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg",
+]
+images = [
+    Image.open(requests.get(img_urls[0], stream=True).raw),
+    Image.open(requests.get(img_urls[1], stream=True).raw),
+]
+
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "image"},
+            {"type": "image"},
+            {"type": "text", "text": "What do you see in these images?"},
+        ],
+    }
+]
+outputs = pipe(text=messages, images=images, max_new_tokens=50, return_full_text=False)
+outputs[0]["generated_text"]
+" In the first image, there are two cats sitting on a plant. In the second image, there are flowers with a pinkish hue."
+```
+
+The images will still be included in the `"input_text"` field of the output:
+
+```python
+outputs[0]['input_text']
+"""
+[{'role': 'user',
+  'content': [{'type': 'image',
+    'image': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=622x412>},
+   {'type': 'image',
+    'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=5184x3456>},
+   {'type': 'text', 'text': 'What do you see in these images?'}]}]
+"""
+```
 
 We can use [text streaming](./generation_strategies#streaming) for a better generation experience. Transformers supports streaming with the [`TextStreamer`] or [`TextIteratorStreamer`] classes. We will use the [`TextIteratorStreamer`] with IDEFICS-8B.
 
````
src/transformers/pipelines/image_text_to_text.py

Lines changed: 43 additions & 49 deletions
```diff
@@ -58,60 +58,56 @@ def __init__(self, messages: Dict, images: Union[str, List[str], "Image.Image",
         for message in messages:
             if not ("role" in message and "content" in message):
                 raise ValueError("When passing chat dicts as input, each dict must have a 'role' and 'content' key.")
-        images = retrieve_images_in_messages(messages, images)
+        messages = add_images_to_messages(messages, images)
 
         self.messages = messages
-        self.images = images
 
 
-def retrieve_images_in_messages(
+def add_images_to_messages(
     messages: dict, images: Optional[Union[str, List[str], "Image.Image", List["Image.Image"]]]
 ):
     """
     Retrieve and combine images from the chat and the images passed as input.
     """
     if images is None:
         images = []
-    elif not isinstance(images, Iterable):
+    elif not isinstance(images, Iterable) or isinstance(images, str):
         images = [images]
     idx_images = 0
-    retrieved_images = []
     for message in messages:
         for content in message["content"]:
-            if isinstance(content, dict):
-                if content.get("type") == "image":
-                    for key in ["image", "url", "path", "base64"]:
-                        if key in content:
-                            retrieved_images.append(content[key])
-                            break
-                    else:
-                        if idx_images < len(images):
-                            retrieved_images.append(images[idx_images])
-                            idx_images += 1
-                        else:
-                            raise ValueError(
-                                "The number of images in the chat messages should be the same as the number of images passed to the pipeline."
-                            )
-                # Add support for OpenAI/TGI chat format
-                elif content.get("type") == "image_url":
-                    if isinstance(content.get("image_url"), dict) and "url" in content["image_url"]:
-                        retrieved_images.append(content["image_url"]["url"])
-                        # Rewrite content to be in the Transformers chat format
-                        content["type"] = "image"
-                        content["image"] = content["image_url"]["url"]
-                        del content["image_url"]
+            if not isinstance(content, dict):
+                continue
+            content_type = content.get("type")
+            if content_type == "image":
+                if not any(key in content for key in ["image", "url", "path", "base64"]):
+                    if idx_images < len(images):
+                        # Insert the image passed as argument in the chat message
+                        content["image"] = images[idx_images]
+                        idx_images += 1
                     else:
                         raise ValueError(
-                            "Wrong format for 'image_url' content type. The content should have an 'image_url' dict with a 'url' key."
+                            "The number of images in the chat messages should be the same as the number of images passed to the pipeline."
                         )
+            # Add support for OpenAI/TGI chat format
+            elif content_type == "image_url":
+                if isinstance(content.get("image_url"), dict) and "url" in content["image_url"]:
+                    # Rewrite content to be in the Transformers chat format
+                    content["type"] = "image"
+                    content["image"] = content["image_url"]["url"]
+                    del content["image_url"]
+                else:
+                    raise ValueError(
+                        "Wrong format for 'image_url' content type. The content should have an 'image_url' dict with a 'url' key."
+                    )
 
     # The number of images passed should be consistent with the number of images in the chat without an image key
     if idx_images != len(images):
         raise ValueError(
             "The number of images in the chat messages should be the same as the number of images passed to the pipeline."
         )
 
-    return retrieved_images
+    return messages
 
 
 @add_end_docstrings(build_pipeline_init_args(has_processor=True))
@@ -331,32 +327,30 @@ def __call__(
         return super().__call__({"images": images, "text": text}, **kwargs)
 
     def preprocess(self, inputs=None, timeout=None, continue_final_message=None, **processing_kwargs):
+        if isinstance(inputs, Chat):
+            # If the user passes a chat that ends in an assistant message, we treat it as a prefill by default
+            # because very few models support multiple separate, consecutive assistant messages
+            if continue_final_message is None:
+                continue_final_message = inputs.messages[-1]["role"] == "assistant"
+            model_inputs = self.processor.apply_chat_template(
+                inputs.messages,
+                add_generation_prompt=not continue_final_message,
+                continue_final_message=continue_final_message,
+                return_tensors=self.framework,
+                tokenize=True,
+                return_dict=True,
+            )
+            model_inputs["text"] = inputs
+            return model_inputs
         # In case we only have text inputs
         if isinstance(inputs, (list, tuple, str)):
            images = None
            text = inputs
            inputs_text = inputs
        else:
-            if isinstance(inputs, Chat):
-                # If the user passes a chat that ends in an assistant message, we treat it as a prefill by default
-                # because very few models support multiple separate, consecutive assistant messages
-                if continue_final_message is None:
-                    continue_final_message = inputs.messages[-1]["role"] == "assistant"
-                text = self.processor.apply_chat_template(
-                    inputs.messages,
-                    add_generation_prompt=not continue_final_message,
-                    continue_final_message=continue_final_message,
-                    return_tensors=self.framework,
-                    **processing_kwargs,
-                )
-                inputs_text = inputs
-                images = inputs.images
-            else:
-                text = inputs["text"]
-                inputs_text = inputs["text"]
-                images = inputs["images"]
-
-        images = load_images(images, timeout=timeout)
+            images = load_images(inputs["images"], timeout=timeout)
+            text = inputs["text"]
+            inputs_text = inputs["text"]
 
         # if batched text inputs, we set padding to True unless specified otherwise
         if isinstance(text, (list, tuple)) and len(text) > 1:
```
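
A small but easy-to-miss piece of the hunk above is the new `or isinstance(images, str)` guard (the "fix issue with str being iterable" bullet in the commit message): a plain string is itself an `Iterable`, so a single URL passed as `images` was never wrapped into a one-element list. A standalone illustration of the pitfall, in plain Python and independent of the pipeline:

```python
from collections.abc import Iterable

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"

# A str is an Iterable, so the old check `not isinstance(images, Iterable)`
# never wrapped a single URL, and later code iterated over its characters.
print(isinstance(url, Iterable))                              # True
print(not isinstance(url, Iterable))                          # False: old guard skipped
print(not isinstance(url, Iterable) or isinstance(url, str))  # True: new guard wraps it in [url]
```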

tests/pipelines/test_pipelines_image_text_to_text.py

Lines changed: 101 additions & 4 deletions
```diff
@@ -66,6 +66,78 @@ def run_pipeline_test(self, pipe, examples):
             ],
         )
 
+    @require_torch
+    def test_small_model_pt_token_text_only(self):
+        pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
+        text = "What is the capital of France? Assistant:"
+
+        outputs = pipe(text=text)
+        self.assertEqual(
+            outputs,
+            [
+                {
+                    "input_text": "What is the capital of France? Assistant:",
+                    "generated_text": "What is the capital of France? Assistant: The capital of France is Paris.",
+                }
+            ],
+        )
+
+        messages = [
+            [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "Write a poem on Hugging Face, the company"},
+                    ],
+                },
+            ],
+            [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "What is the capital of France?"},
+                    ],
+                },
+            ],
+        ]
+        outputs = pipe(text=messages)
+        self.assertEqual(
+            outputs,
+            [
+                [
+                    {
+                        "input_text": [
+                            {
+                                "role": "user",
+                                "content": [{"type": "text", "text": "Write a poem on Hugging Face, the company"}],
+                            }
+                        ],
+                        "generated_text": [
+                            {
+                                "role": "user",
+                                "content": [{"type": "text", "text": "Write a poem on Hugging Face, the company"}],
+                            },
+                            {
+                                "role": "assistant",
+                                "content": "Hugging Face, a company of minds\nWith tools and services that make our lives easier\nFrom",
+                            },
+                        ],
+                    }
+                ],
+                [
+                    {
+                        "input_text": [
+                            {"role": "user", "content": [{"type": "text", "text": "What is the capital of France?"}]}
+                        ],
+                        "generated_text": [
+                            {"role": "user", "content": [{"type": "text", "text": "What is the capital of France?"}]},
+                            {"role": "assistant", "content": "Paris"},
+                        ],
+                    }
+                ],
+            ],
+        )
+
     @require_torch
     def test_small_model_pt_token(self):
         pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
@@ -124,7 +196,7 @@ def test_model_pt_chat_template(self):
                 ],
             }
         ]
-        outputs = pipe([image_ny, image_chicago], text=messages, return_full_text=False, max_new_tokens=10)
+        outputs = pipe([image_ny, image_chicago], text=messages, return_full_text=True, max_new_tokens=10)
         self.assertEqual(
             outputs,
             [
@@ -134,12 +206,37 @@ def test_model_pt_chat_template(self):
                             "role": "user",
                             "content": [
                                 {"type": "text", "text": "What’s the difference between these two images?"},
-                                {"type": "image"},
-                                {"type": "image"},
+                                {
+                                    "type": "image",
+                                    "image": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
+                                },
+                                {
+                                    "type": "image",
+                                    "image": "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg",
+                                },
                             ],
                         }
                     ],
-                    "generated_text": "The first image shows a statue of Liberty in the",
+                    "generated_text": [
+                        {
+                            "role": "user",
+                            "content": [
+                                {"type": "text", "text": "What’s the difference between these two images?"},
+                                {
+                                    "type": "image",
+                                    "image": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
+                                },
+                                {
+                                    "type": "image",
+                                    "image": "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg",
+                                },
+                            ],
+                        },
+                        {
+                            "role": "assistant",
+                            "content": "The first image shows a statue of Liberty in the",
+                        },
+                    ],
                 }
             ],
         )
```
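
The last two hunks also show the effect of `return_full_text` on chat inputs: with `return_full_text=True` the `generated_text` field now contains the whole conversation (the user turn plus the appended assistant turn), while `return_full_text=False` returns only the newly generated text, as in the docs example. A short usage sketch, reusing the llava-hf/llava-interleave-qwen-0.5b-hf checkpoint from the tests; any chat-capable image-text-to-text checkpoint should behave the same.

```python
from transformers import pipeline

pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")

messages = [
    {"role": "user", "content": [{"type": "text", "text": "What is the capital of France?"}]}
]

# Full chat back: a list with the user turn plus an appended assistant turn,
# matching the structure asserted in the updated test above.
full = pipe(text=messages, max_new_tokens=10, return_full_text=True)

# Only the newly generated assistant text, as in the docs example.
new_only = pipe(text=messages, max_new_tokens=10, return_full_text=False)

print(full[0]["generated_text"])
print(new_only[0]["generated_text"])
```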
