|
| 1 | +# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai> |
| 2 | +# |
| 3 | +# SPDX-License-Identifier: Apache-2.0 |
| 4 | + |
| 5 | +from typing import Any, Dict, List, Optional, Tuple, Union |
| 6 | + |
| 7 | +from google import genai |
| 8 | +from google.genai import types |
| 9 | +from haystack import Document, component, default_from_dict, default_to_dict, logging |
| 10 | +from haystack.utils import Secret, deserialize_secrets_inplace |
| 11 | +from more_itertools import batched |
| 12 | +from tqdm import tqdm |
| 13 | + |
| 14 | +logger = logging.getLogger(__name__) |
| 15 | + |
| 16 | + |
@component
class GoogleGenAIDocumentEmbedder:
    """
    Computes document embeddings using Google AI models.

    ### Usage example

    ```python
    from haystack import Document
    from haystack_integrations.components.embedders import GoogleGenAIDocumentEmbedder

    doc = Document(content="I love pizza!")

    document_embedder = GoogleGenAIDocumentEmbedder()

    result = document_embedder.run([doc])
    print(result['documents'][0].embedding)

    # [0.017020374536514282, -0.023255806416273117, ...]
    ```
    """

    def __init__(
        self,
        *,
        api_key: Secret = Secret.from_env_var("GOOGLE_API_KEY"),
        model: str = "text-embedding-004",
        prefix: str = "",
        suffix: str = "",
        batch_size: int = 32,
        progress_bar: bool = True,
        meta_fields_to_embed: Optional[List[str]] = None,
        embedding_separator: str = "\n",
        config: Optional[Dict[str, Any]] = None,
    ):
        """
        Creates a GoogleGenAIDocumentEmbedder component.

        :param api_key:
            The Google API key.
            You can set it with the environment variable `GOOGLE_API_KEY`, or pass it via this parameter
            during initialization.
        :param model:
            The name of the model to use for calculating embeddings.
            The default model is `text-embedding-004`.
        :param prefix:
            A string to add at the beginning of each text.
        :param suffix:
            A string to add at the end of each text.
        :param batch_size:
            Number of documents to embed at once.
        :param progress_bar:
            If `True`, shows a progress bar when running.
        :param meta_fields_to_embed:
            List of metadata fields to embed along with the document text.
        :param embedding_separator:
            Separator used to concatenate the metadata fields to the document text.
        :param config:
            A dictionary of keyword arguments to configure embedding content configuration `types.EmbedContentConfig`.
            If not specified, it defaults to {"task_type": "SEMANTIC_SIMILARITY"}.
            For more information, see the [Google AI Task types](https://ai.google.dev/gemini-api/docs/embeddings#task-types).
        """
        self._api_key = api_key
        self._model = model
        self._prefix = prefix
        self._suffix = suffix
        self._batch_size = batch_size
        self._progress_bar = progress_bar
        self._meta_fields_to_embed = meta_fields_to_embed or []
        self._embedding_separator = embedding_separator
        self._client = genai.Client(api_key=api_key.resolve_value())
        self._config = config if config is not None else {"task_type": "SEMANTIC_SIMILARITY"}

    def to_dict(self) -> Dict[str, Any]:
        """
        Serializes the component to a dictionary.

        :returns:
            Dictionary with serialized data.
        """
        return default_to_dict(
            self,
            model=self._model,
            prefix=self._prefix,
            suffix=self._suffix,
            batch_size=self._batch_size,
            progress_bar=self._progress_bar,
            meta_fields_to_embed=self._meta_fields_to_embed,
            embedding_separator=self._embedding_separator,
            api_key=self._api_key.to_dict(),
            config=self._config,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "GoogleGenAIDocumentEmbedder":
        """
        Deserializes the component from a dictionary.

        :param data:
            Dictionary to deserialize from.
        :returns:
            Deserialized component.
        """
        deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
        return default_from_dict(cls, data)

    def _prepare_texts_to_embed(self, documents: List[Document]) -> List[str]:
        """
        Prepare the texts to embed by concatenating the Document text with the metadata fields to embed.
        """
        texts_to_embed: List[str] = []
        for doc in documents:
            meta_values_to_embed = [
                str(doc.meta[key])
                for key in self._meta_fields_to_embed
                if key in doc.meta and doc.meta[key] is not None
            ]

            text_to_embed = (
                self._prefix + self._embedding_separator.join([*meta_values_to_embed, doc.content or ""]) + self._suffix
            )
            texts_to_embed.append(text_to_embed)

        return texts_to_embed

    def _embed_batch(self, texts_to_embed: List[str], batch_size: int) -> Tuple[List[List[float]], Dict[str, Any]]:
        """
        Embed a list of texts in batches.

        :param texts_to_embed:
            The fully prepared texts (prefix/meta/suffix already applied).
        :param batch_size:
            Number of texts sent per API call.
        :returns:
            A tuple of (embeddings in input order, metadata dict containing the model name).
        """
        all_embeddings: List[List[float]] = []
        meta: Dict[str, Any] = {}
        for batch in tqdm(
            batched(texts_to_embed, batch_size), disable=not self._progress_bar, desc="Calculating embeddings"
        ):
            # `batched` yields a tuple of texts. Pass the whole batch as contents;
            # the previous `[b[1] for b in batch]` wrongly sent only the second
            # character of each text to the API.
            args: Dict[str, Any] = {"model": self._model, "contents": list(batch)}
            if self._config:
                args["config"] = types.EmbedContentConfig(**self._config)

            response = self._client.models.embed_content(**args)

            all_embeddings.extend(el.values for el in response.embeddings)

        # Record which model produced the embeddings (set once).
        if "model" not in meta:
            meta["model"] = self._model

        return all_embeddings, meta

    @component.output_types(documents=List[Document], meta=Dict[str, Any])
    def run(self, documents: List[Document]) -> Dict[str, Union[List[Document], Dict[str, Any]]]:
        """
        Embeds a list of documents.

        :param documents:
            A list of documents to embed.

        :returns:
            A dictionary with the following keys:
            - `documents`: A list of documents with embeddings.
            - `meta`: Information about the usage of the model.
        """
        if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
            error_message_documents = (
                "GoogleGenAIDocumentEmbedder expects a list of Documents as input. "
                "In case you want to embed a string, please use the GoogleGenAITextEmbedder."
            )
            raise TypeError(error_message_documents)

        texts_to_embed = self._prepare_texts_to_embed(documents=documents)

        embeddings, meta = self._embed_batch(texts_to_embed=texts_to_embed, batch_size=self._batch_size)

        # Attach embeddings back onto the input documents in order.
        for doc, emb in zip(documents, embeddings):
            doc.embedding = emb

        return {"documents": documents, "meta": meta}
0 commit comments