Commit 6e00007

feat: add BLIP support in TransformersImageToText (#4912)
* add blip support
* fix typo

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>
1 parent 4c98430 commit 6e00007

File tree

2 files changed: +8 -16 lines changed


haystack/nodes/image_to_text/transformers.py

8 additions & 7 deletions

````diff
@@ -17,7 +17,11 @@
 
 # supported models classes should be extended when HF image-to-text pipeline willl support more classes
 # see https://github.com/huggingface/transformers/issues/21110
-SUPPORTED_MODELS_CLASSES = ["VisionEncoderDecoderModel"]
+SUPPORTED_MODELS_CLASSES = [
+    "VisionEncoderDecoderModel",
+    "BlipForConditionalGeneration",
+    "Blip2ForConditionalGeneration",
+]
 
 UNSUPPORTED_MODEL_MESSAGE = (
     f"The supported classes are: {SUPPORTED_MODELS_CLASSES}. \n"
@@ -33,8 +37,6 @@ class TransformersImageToText(BaseImageToText):
     """
     A transformer-based model to generate captions for images using the Hugging Face's transformers framework.
 
-    Currently, this node supports `VisionEncoderDecoderModel` models.
-
     **Example**
 
     ```python
@@ -64,7 +66,7 @@
 
     def __init__(
         self,
-        model_name_or_path: str = "nlpconnect/vit-gpt2-image-captioning",
+        model_name_or_path: str = "Salesforce/blip-image-captioning-base",
         model_version: Optional[str] = None,
         generation_kwargs: Optional[dict] = None,
         use_gpu: bool = True,
@@ -74,15 +76,14 @@ def __init__(
         devices: Optional[List[Union[str, torch.device]]] = None,
     ):
         """
-        Load a `VisionEncoderDecoderModel` model from transformers.
+        Load an Image-to-Text model from transformers.
 
         :param model_name_or_path: Directory of a saved model or the name of a public model.
-            Currently, only `VisionEncoderDecoderModel` models are supported.
             To find these models:
             1. Visit [Hugging Face image to text models](https://huggingface.co/models?pipeline_tag=image-to-text).
             2. Open the model you want to check.
             3. On the model page, go to the "Files and Versions" tab.
-            4. Open the `config.json` file and make sure the `architectures` field contains `VisionEncoderDecoderModel`.
+            4. Open the `config.json` file and make sure the `architectures` field contains `VisionEncoderDecoderModel`, `BlipForConditionalGeneration`, or `Blip2ForConditionalGeneration`.
         :param model_version: The version of the model to use from the Hugging Face model hub. This can be the tag name, branch name, or commit hash.
         :param generation_kwargs: Dictionary containing arguments for the `generate()` method of the Hugging Face model.
             See [generate()](https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationMixin.generate) in Hugging Face documentation.
````
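
Since the commit switches the default checkpoint to BLIP, here is a minimal usage sketch for the updated node. It assumes the `generate_captions()` method defined on `BaseImageToText`; the image paths are placeholders, not files from this repository.

```python
from haystack.nodes import TransformersImageToText

# Instantiate the node with the new default BLIP checkpoint; any checkpoint
# whose architecture appears in SUPPORTED_MODELS_CLASSES is accepted.
image_to_text = TransformersImageToText(
    model_name_or_path="Salesforce/blip-image-captioning-base",
    use_gpu=False,
)

# Placeholder image paths -- replace with real files.
image_file_paths = ["images/apple.jpg", "images/cat.jpg"]

# generate_captions() returns one Document per image, with the caption as its content.
documents = image_to_text.generate_captions(image_file_paths=image_file_paths)
for doc in documents:
    print(doc.content)
```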

test/nodes/test_image_to_text.py

0 additions & 9 deletions

```diff
@@ -91,12 +91,3 @@ def test_image_to_text_unsupported_model_after_loading():
         match="The model 'deepset/minilm-uncased-squad2' \(class 'BertForQuestionAnswering'\) is not supported for ImageToText",
     ):
         _ = TransformersImageToText(model_name_or_path="deepset/minilm-uncased-squad2")
-
-
-@pytest.mark.integration
-def test_image_to_text_unsupported_model_before_loading():
-    with pytest.raises(
-        ValueError,
-        match=r"The model '.*' \(class '.*'\) is not supported for ImageToText. The supported classes are: \['VisionEncoderDecoderModel'\]",
-    ):
-        _ = TransformersImageToText(model_name_or_path="Salesforce/blip-image-captioning-base")
```
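
The deleted test asserted that a BLIP checkpoint was rejected before loading; with BLIP now in `SUPPORTED_MODELS_CLASSES`, that assertion no longer holds. The sketch below is not the node's actual validation code, just an illustration (using `transformers.AutoConfig`) of the kind of architecture check the old test exercised.

```python
from transformers import AutoConfig

# Mirrors SUPPORTED_MODELS_CLASSES after this commit.
SUPPORTED_MODELS_CLASSES = [
    "VisionEncoderDecoderModel",
    "BlipForConditionalGeneration",
    "Blip2ForConditionalGeneration",
]


def is_supported_for_image_to_text(model_name_or_path: str) -> bool:
    """Check the checkpoint's `architectures` field without downloading the weights."""
    config = AutoConfig.from_pretrained(model_name_or_path)
    return any(arch in SUPPORTED_MODELS_CLASSES for arch in (config.architectures or []))


# "Salesforce/blip-image-captioning-base" now passes this check, which is why
# test_image_to_text_unsupported_model_before_loading was removed.
print(is_supported_for_image_to_text("Salesforce/blip-image-captioning-base"))  # expected: True
```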
