From 85a7ef27fb786315cd33542a8650536359dd6077 Mon Sep 17 00:00:00 2001 From: garybadwal Date: Sat, 24 May 2025 16:24:45 +0530 Subject: [PATCH 01/14] feat: Add GoogleAITextEmbedder and GoogleAIDocumentEmbedder components --- .../embedders/google_ai/__init__.py | 3 + .../embedders/google_ai/google_embedder.py | 322 ++++++++++++++++++ 2 files changed, 325 insertions(+) create mode 100644 integrations/google_ai/src/haystack_integrations/components/embedders/google_ai/__init__.py create mode 100644 integrations/google_ai/src/haystack_integrations/components/embedders/google_ai/google_embedder.py diff --git a/integrations/google_ai/src/haystack_integrations/components/embedders/google_ai/__init__.py b/integrations/google_ai/src/haystack_integrations/components/embedders/google_ai/__init__.py new file mode 100644 index 0000000000..bbdabff160 --- /dev/null +++ b/integrations/google_ai/src/haystack_integrations/components/embedders/google_ai/__init__.py @@ -0,0 +1,3 @@ +from .google_embedder import GoogleAIDocumentEmbedder, GoogleAITextEmbedder + +__all__ = ["GoogleAIDocumentEmbedder", "GoogleAITextEmbedder"] \ No newline at end of file diff --git a/integrations/google_ai/src/haystack_integrations/components/embedders/google_ai/google_embedder.py b/integrations/google_ai/src/haystack_integrations/components/embedders/google_ai/google_embedder.py new file mode 100644 index 0000000000..901a476856 --- /dev/null +++ b/integrations/google_ai/src/haystack_integrations/components/embedders/google_ai/google_embedder.py @@ -0,0 +1,322 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +import os + +from typing import Any, Dict, List, Optional, Tuple +from more_itertools import batched +from tqdm import tqdm + +from google import genai +from google.genai import types + +from haystack import Document, component, default_from_dict, default_to_dict, logging +from haystack.utils import Secret, deserialize_secrets_inplace + +logger = logging.getLogger(__name__) + + +@component +class GoogleAITextEmbedder: + """ + Embeds strings using OpenAI models. + + You can use it to embed user query and send it to an embedding Retriever. + + ### Usage example + + ```python + from haystack.components.embedders import GoogleAITextEmbedder + + text_to_embed = "I love pizza!" + + text_embedder = GoogleAITextEmbedder() + + print(text_embedder.run(text_to_embed)) + + # {'embedding': [0.017020374536514282, -0.023255806416273117, ...], + # 'meta': {'model': 'text-embedding-004-v2', + # 'usage': {'prompt_tokens': 4, 'total_tokens': 4}}} + ``` + """ + + def __init__( # pylint: disable=too-many-positional-arguments + self, + api_key: Secret = Secret.from_env_var("GOOGLE_API_KEY"), + model: str = "text-embedding-004", + config: Optional[types.EmbedContentConfig] = types.EmbedContentConfig( + task_type="SEMANTIC_SIMILARITY"), + + ): + """ + Creates an GoogleAITextEmbedder component. + + :param api_key: + The Google API key. + You can set it with an environment variable `GOOGLE_API_KEY`, or pass with this parameter + during initialization. + :param model: + The name of the model to use for calculating embeddings. + The default model is `text-embedding-004`. + :param config: + A dictionary of keyword arguments to configure embedding content configuration `types.EmbedContentConfig`. + For more information, see the [Google AI Task types](https://ai.google.dev/gemini-api/docs/embeddings#task-types). + """ + + self._api_key = api_key + self._model_name = model + self._config = config + self._client = genai.Client(api_key=api_key.resolve_value()) + + def _get_telemetry_data(self) -> Dict[str, Any]: + """ + Data that is sent to Posthog for usage analytics. + """ + return {"model": self.model} + + def to_dict(self) -> Dict[str, Any]: + """ + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. + """ + return default_to_dict( + self, + model=self._model_name, + api_key=self._api_key.to_dict(), + config=self._config.to_json_dict() + ) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "GoogleAITextEmbedder": + """ + Deserializes the component from a dictionary. + + :param data: + Dictionary to deserialize from. + :returns: + Deserialized component. + """ + deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"]) + return default_from_dict(cls, data) + + def _prepare_input(self, text: str) -> Dict[str, Any]: + if not isinstance(text, str): + raise TypeError( + "GoogleAITextEmbedder expects a string as an input." + "In case you want to embed a list of Documents, please use the GoogleAIDocumentEmbedder." + ) + + text_to_embed = text + + kwargs: Dict[str, Any] = { + "model": self._model_name, "contents": text_to_embed} + if self._config: + kwargs["config"] = self._config + + return kwargs + + def _prepare_output(self, result: types.EmbedContentResponse) -> Dict[str, Any]: + return {"embedding": result.embeddings[0].values, "meta": {"model": self._model_name}} + + @component.output_types(embedding=List[float], meta=Dict[str, Any]) + def run(self, text: str): + """ + Embeds a single string. + + :param text: + Text to embed. + + :returns: + A dictionary with the following keys: + - `embedding`: The embedding of the input text. + - `meta`: Information about the usage of the model. + """ + create_kwargs = self._prepare_input(text=text) + response = self._client.models.embed_content(**create_kwargs) + return self._prepare_output(result=response) + + +@component +class GoogleAIDocumentEmbedder: + """ + Computes document embeddings using OpenAI models. + + ### Usage example + + ```python + from haystack import Document + from haystack.components.embedders import GoogleAIDocumentEmbedder + + doc = Document(content="I love pizza!") + + document_embedder = GoogleAIDocumentEmbedder() + + result = document_embedder.run([doc]) + print(result['documents'][0].embedding) + + # [0.017020374536514282, -0.023255806416273117, ...] + ``` + """ + + def __init__( # pylint: disable=too-many-positional-arguments + self, + api_key: Secret = Secret.from_env_var("GOOGLE_API_KEY"), + model: str = "text-embedding-004", + batch_size: int = 32, + progress_bar: bool = True, + meta_fields_to_embed: Optional[List[str]] = None, + embedding_separator: str = "\n", + config: Optional[types.EmbedContentConfig] = types.EmbedContentConfig( + task_type="SEMANTIC_SIMILARITY"), + ): + """ + Creates an GoogleAIDocumentEmbedder component. + + Before initializing the component, you can set the 'OPENAI_TIMEOUT' and 'OPENAI_MAX_RETRIES' + environment variables to override the `timeout` and `max_retries` parameters respectively + in the OpenAI client. + + :param api_key: + The OpenAI API key. + You can set it with an environment variable `OPENAI_API_KEY`, or pass with this parameter + during initialization. + :param model: + The name of the model to use for calculating embeddings. + The default model is `text-embedding-ada-002`. + :param batch_size: + Number of documents to embed at once. + :param progress_bar: + If `True`, shows a progress bar when running. + :param meta_fields_to_embed: + List of metadata fields to embed along with the document text. + :param embedding_separator: + Separator used to concatenate the metadata fields to the document text. + :param config: + A dictionary of keyword arguments to configure embedding content configuration `types.EmbedContentConfig`. + For more information, see the [Google AI Task types](https://ai.google.dev/gemini-api/docs/embeddings#task-types). + """ + self.api_key = api_key + self.model = model + self.batch_size = batch_size + self.progress_bar = progress_bar + self.meta_fields_to_embed = meta_fields_to_embed or [] + self.embedding_separator = embedding_separator + self.client = genai.Client(api_key=api_key.resolve_value()) + self.config = config + + def _get_telemetry_data(self) -> Dict[str, Any]: + """ + Data that is sent to Posthog for usage analytics. + """ + return {"model": self.model} + + def to_dict(self) -> Dict[str, Any]: + """ + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. + """ + return default_to_dict( + self, + model=self.model, + batch_size=self.batch_size, + progress_bar=self.progress_bar, + meta_fields_to_embed=self.meta_fields_to_embed, + embedding_separator=self.embedding_separator, + api_key=self.api_key.to_dict(), + config=self.config.to_json_dict() + ) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "GoogleAIDocumentEmbedder": + """ + Deserializes the component from a dictionary. + + :param data: + Dictionary to deserialize from. + :returns: + Deserialized component. + """ + deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"]) + return default_from_dict(cls, data) + + def _prepare_texts_to_embed(self, documents: List[Document]) -> Dict[str, str]: + """ + Prepare the texts to embed by concatenating the Document text with the metadata fields to embed. + """ + texts_to_embed = {} + for doc in documents: + meta_values_to_embed = [ + str(doc.meta[key]) for key in self.meta_fields_to_embed if key in doc.meta and doc.meta[key] is not None + ] + + texts_to_embed[doc.id] = ( + self.embedding_separator.join( + meta_values_to_embed + [doc.content or ""]) + ) + + return texts_to_embed + + def _embed_batch(self, texts_to_embed: Dict[str, str], batch_size: int) -> Tuple[List[List[float]], Dict[str, Any]]: + """ + Embed a list of texts in batches. + """ + + all_embeddings = [] + meta: Dict[str, Any] = {} + for batch in tqdm( + batched(texts_to_embed.items(), batch_size), disable=not self.progress_bar, desc="Calculating embeddings" + ): + args: Dict[str, Any] = {"model": self.model, + "contents": [b[1] for b in batch]} + if self.config: + args["config"] = self.config + + try: + response = self.client.models.embed_content(**args) + except Exception as exc: + ids = ", ".join(b[0] for b in batch) + msg = "Failed embedding of documents {ids} caused by {exc}" + logger.exception(msg, ids=ids, exc=exc) + continue + + embeddings = [el.values for el in response.embeddings] + all_embeddings.extend(embeddings) + + if "model" not in meta: + meta["model"] = self.model + + return all_embeddings, meta + + @component.output_types(documents=List[Document], meta=Dict[str, Any]) + def run(self, documents: List[Document]): + """ + Embeds a list of documents. + + :param documents: + A list of documents to embed. + + :returns: + A dictionary with the following keys: + - `documents`: A list of documents with embeddings. + - `meta`: Information about the usage of the model. + """ + if not isinstance(documents, list) or documents and not isinstance(documents[0], Document): + raise TypeError( + "GoogleAIDocumentEmbedder expects a list of Documents as input." + "In case you want to embed a string, please use the OpenAITextEmbedder." + ) + + texts_to_embed = self._prepare_texts_to_embed(documents=documents) + + embeddings, meta = self._embed_batch( + texts_to_embed=texts_to_embed, batch_size=self.batch_size) + + for doc, emb in zip(documents, embeddings): + doc.embedding = emb + + return {"documents": documents, "meta": meta} From b9f94c762cbbbd17e41b59e3ce8fe99316f0ab38 Mon Sep 17 00:00:00 2001 From: garybadwal Date: Sat, 24 May 2025 16:33:43 +0530 Subject: [PATCH 02/14] fix: Improve error messages for input type validation in GoogleAITextEmbedder and GoogleAIDocumentEmbedder --- .../components/embedders/google_ai/google_embedder.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/integrations/google_ai/src/haystack_integrations/components/embedders/google_ai/google_embedder.py b/integrations/google_ai/src/haystack_integrations/components/embedders/google_ai/google_embedder.py index 901a476856..9b7f43e338 100644 --- a/integrations/google_ai/src/haystack_integrations/components/embedders/google_ai/google_embedder.py +++ b/integrations/google_ai/src/haystack_integrations/components/embedders/google_ai/google_embedder.py @@ -104,11 +104,13 @@ def from_dict(cls, data: Dict[str, Any]) -> "GoogleAITextEmbedder": def _prepare_input(self, text: str) -> Dict[str, Any]: if not isinstance(text, str): - raise TypeError( - "GoogleAITextEmbedder expects a string as an input." + error_message_text = ( + "GoogleAITextEmbedder expects a string as an input. " "In case you want to embed a list of Documents, please use the GoogleAIDocumentEmbedder." ) + raise TypeError(error_message_text) + text_to_embed = text kwargs: Dict[str, Any] = { @@ -306,10 +308,11 @@ def run(self, documents: List[Document]): - `meta`: Information about the usage of the model. """ if not isinstance(documents, list) or documents and not isinstance(documents[0], Document): - raise TypeError( - "GoogleAIDocumentEmbedder expects a list of Documents as input." + error_message_documents = ( + "GoogleAIDocumentEmbedder expects a list of Documents as input. " "In case you want to embed a string, please use the OpenAITextEmbedder." ) + raise TypeError(error_message_documents) texts_to_embed = self._prepare_texts_to_embed(documents=documents) From 682a4e210b3d094d42b8056524b0c2fc786b11f0 Mon Sep 17 00:00:00 2001 From: garybadwal Date: Thu, 5 Jun 2025 10:17:39 +0530 Subject: [PATCH 03/14] feat: add Google GenAI embedder components for document and text embeddings --- .../embedders/google_ai/__init__.py | 3 - .../embedders/google_genai/__init__.py | 7 + .../google_genai/document_embedder.py} | 126 ---------------- .../embedders/google_genai/text_embedder.py | 137 ++++++++++++++++++ 4 files changed, 144 insertions(+), 129 deletions(-) delete mode 100644 integrations/google_ai/src/haystack_integrations/components/embedders/google_ai/__init__.py create mode 100644 integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/__init__.py rename integrations/{google_ai/src/haystack_integrations/components/embedders/google_ai/google_embedder.py => google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py} (63%) create mode 100644 integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/text_embedder.py diff --git a/integrations/google_ai/src/haystack_integrations/components/embedders/google_ai/__init__.py b/integrations/google_ai/src/haystack_integrations/components/embedders/google_ai/__init__.py deleted file mode 100644 index bbdabff160..0000000000 --- a/integrations/google_ai/src/haystack_integrations/components/embedders/google_ai/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .google_embedder import GoogleAIDocumentEmbedder, GoogleAITextEmbedder - -__all__ = ["GoogleAIDocumentEmbedder", "GoogleAITextEmbedder"] \ No newline at end of file diff --git a/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/__init__.py b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/__init__.py new file mode 100644 index 0000000000..3bebddbb56 --- /dev/null +++ b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 +from .document_embedder import GoogleAIDocumentEmbedder +from .text_embedder import GoogleAITextEmbedder + +__all__ = ["GoogleAIDocumentEmbedder", "GoogleAITextEmbedder"] diff --git a/integrations/google_ai/src/haystack_integrations/components/embedders/google_ai/google_embedder.py b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py similarity index 63% rename from integrations/google_ai/src/haystack_integrations/components/embedders/google_ai/google_embedder.py rename to integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py index 9b7f43e338..f2ebe507de 100644 --- a/integrations/google_ai/src/haystack_integrations/components/embedders/google_ai/google_embedder.py +++ b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py @@ -2,8 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -import os - from typing import Any, Dict, List, Optional, Tuple from more_itertools import batched from tqdm import tqdm @@ -17,130 +15,6 @@ logger = logging.getLogger(__name__) -@component -class GoogleAITextEmbedder: - """ - Embeds strings using OpenAI models. - - You can use it to embed user query and send it to an embedding Retriever. - - ### Usage example - - ```python - from haystack.components.embedders import GoogleAITextEmbedder - - text_to_embed = "I love pizza!" - - text_embedder = GoogleAITextEmbedder() - - print(text_embedder.run(text_to_embed)) - - # {'embedding': [0.017020374536514282, -0.023255806416273117, ...], - # 'meta': {'model': 'text-embedding-004-v2', - # 'usage': {'prompt_tokens': 4, 'total_tokens': 4}}} - ``` - """ - - def __init__( # pylint: disable=too-many-positional-arguments - self, - api_key: Secret = Secret.from_env_var("GOOGLE_API_KEY"), - model: str = "text-embedding-004", - config: Optional[types.EmbedContentConfig] = types.EmbedContentConfig( - task_type="SEMANTIC_SIMILARITY"), - - ): - """ - Creates an GoogleAITextEmbedder component. - - :param api_key: - The Google API key. - You can set it with an environment variable `GOOGLE_API_KEY`, or pass with this parameter - during initialization. - :param model: - The name of the model to use for calculating embeddings. - The default model is `text-embedding-004`. - :param config: - A dictionary of keyword arguments to configure embedding content configuration `types.EmbedContentConfig`. - For more information, see the [Google AI Task types](https://ai.google.dev/gemini-api/docs/embeddings#task-types). - """ - - self._api_key = api_key - self._model_name = model - self._config = config - self._client = genai.Client(api_key=api_key.resolve_value()) - - def _get_telemetry_data(self) -> Dict[str, Any]: - """ - Data that is sent to Posthog for usage analytics. - """ - return {"model": self.model} - - def to_dict(self) -> Dict[str, Any]: - """ - Serializes the component to a dictionary. - - :returns: - Dictionary with serialized data. - """ - return default_to_dict( - self, - model=self._model_name, - api_key=self._api_key.to_dict(), - config=self._config.to_json_dict() - ) - - @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "GoogleAITextEmbedder": - """ - Deserializes the component from a dictionary. - - :param data: - Dictionary to deserialize from. - :returns: - Deserialized component. - """ - deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"]) - return default_from_dict(cls, data) - - def _prepare_input(self, text: str) -> Dict[str, Any]: - if not isinstance(text, str): - error_message_text = ( - "GoogleAITextEmbedder expects a string as an input. " - "In case you want to embed a list of Documents, please use the GoogleAIDocumentEmbedder." - ) - - raise TypeError(error_message_text) - - text_to_embed = text - - kwargs: Dict[str, Any] = { - "model": self._model_name, "contents": text_to_embed} - if self._config: - kwargs["config"] = self._config - - return kwargs - - def _prepare_output(self, result: types.EmbedContentResponse) -> Dict[str, Any]: - return {"embedding": result.embeddings[0].values, "meta": {"model": self._model_name}} - - @component.output_types(embedding=List[float], meta=Dict[str, Any]) - def run(self, text: str): - """ - Embeds a single string. - - :param text: - Text to embed. - - :returns: - A dictionary with the following keys: - - `embedding`: The embedding of the input text. - - `meta`: Information about the usage of the model. - """ - create_kwargs = self._prepare_input(text=text) - response = self._client.models.embed_content(**create_kwargs) - return self._prepare_output(result=response) - - @component class GoogleAIDocumentEmbedder: """ diff --git a/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/text_embedder.py b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/text_embedder.py new file mode 100644 index 0000000000..c09e41b360 --- /dev/null +++ b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/text_embedder.py @@ -0,0 +1,137 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any, Dict, List, Optional + +from google import genai +from google.genai import types + +from haystack import component, default_from_dict, default_to_dict, logging +from haystack.utils import Secret, deserialize_secrets_inplace + +logger = logging.getLogger(__name__) + + +@component +class GoogleAITextEmbedder: + """ + Embeds strings using OpenAI models. + + You can use it to embed user query and send it to an embedding Retriever. + + ### Usage example + + ```python + from haystack.components.embedders import GoogleAITextEmbedder + + text_to_embed = "I love pizza!" + + text_embedder = GoogleAITextEmbedder() + + print(text_embedder.run(text_to_embed)) + + # {'embedding': [0.017020374536514282, -0.023255806416273117, ...], + # 'meta': {'model': 'text-embedding-004-v2', + # 'usage': {'prompt_tokens': 4, 'total_tokens': 4}}} + ``` + """ + + def __init__( # pylint: disable=too-many-positional-arguments + self, + api_key: Secret = Secret.from_env_var("GOOGLE_API_KEY"), + model: str = "text-embedding-004", + config: Optional[types.EmbedContentConfig] = types.EmbedContentConfig( + task_type="SEMANTIC_SIMILARITY"), + + ): + """ + Creates an GoogleAITextEmbedder component. + + :param api_key: + The Google API key. + You can set it with an environment variable `GOOGLE_API_KEY`, or pass with this parameter + during initialization. + :param model: + The name of the model to use for calculating embeddings. + The default model is `text-embedding-004`. + :param config: + A dictionary of keyword arguments to configure embedding content configuration `types.EmbedContentConfig`. + For more information, see the [Google AI Task types](https://ai.google.dev/gemini-api/docs/embeddings#task-types). + """ + + self._api_key = api_key + self._model_name = model + self._config = config + self._client = genai.Client(api_key=api_key.resolve_value()) + + def _get_telemetry_data(self) -> Dict[str, Any]: + """ + Data that is sent to Posthog for usage analytics. + """ + return {"model": self.model} + + def to_dict(self) -> Dict[str, Any]: + """ + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. + """ + return default_to_dict( + self, + model=self._model_name, + api_key=self._api_key.to_dict(), + config=self._config.to_json_dict() + ) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "GoogleAITextEmbedder": + """ + Deserializes the component from a dictionary. + + :param data: + Dictionary to deserialize from. + :returns: + Deserialized component. + """ + deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"]) + return default_from_dict(cls, data) + + def _prepare_input(self, text: str) -> Dict[str, Any]: + if not isinstance(text, str): + error_message_text = ( + "GoogleAITextEmbedder expects a string as an input. " + "In case you want to embed a list of Documents, please use the GoogleAIDocumentEmbedder." + ) + + raise TypeError(error_message_text) + + text_to_embed = text + + kwargs: Dict[str, Any] = { + "model": self._model_name, "contents": text_to_embed} + if self._config: + kwargs["config"] = self._config + + return kwargs + + def _prepare_output(self, result: types.EmbedContentResponse) -> Dict[str, Any]: + return {"embedding": result.embeddings[0].values, "meta": {"model": self._model_name}} + + @component.output_types(embedding=List[float], meta=Dict[str, Any]) + def run(self, text: str): + """ + Embeds a single string. + + :param text: + Text to embed. + + :returns: + A dictionary with the following keys: + - `embedding`: The embedding of the input text. + - `meta`: Information about the usage of the model. + """ + create_kwargs = self._prepare_input(text=text) + response = self._client.models.embed_content(**create_kwargs) + return self._prepare_output(result=response) From 778f702bd3937d9d49274eb9f6745cf158afc3c0 Mon Sep 17 00:00:00 2001 From: garybadwal Date: Thu, 5 Jun 2025 12:57:10 +0530 Subject: [PATCH 04/14] feat: add unit tests for GoogleAIDocumentEmbedder and GoogleAITextEmbedder --- .../tests/test_document_embedder.py | 186 ++++++++++++++++++ .../google_genai/tests/test_text_embedder.py | 152 ++++++++++++++ 2 files changed, 338 insertions(+) create mode 100644 integrations/google_genai/tests/test_document_embedder.py create mode 100644 integrations/google_genai/tests/test_text_embedder.py diff --git a/integrations/google_genai/tests/test_document_embedder.py b/integrations/google_genai/tests/test_document_embedder.py new file mode 100644 index 0000000000..8017d8d4ef --- /dev/null +++ b/integrations/google_genai/tests/test_document_embedder.py @@ -0,0 +1,186 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +import os +import random +from typing import List + +import pytest + +from haystack import Document +from haystack_integrations.components.embedders.google_genai import GoogleAIDocumentEmbedder +from haystack.utils.auth import Secret + +def mock_google_response(input: List[str], model: str = "text-embedding-004", **kwargs) -> dict: + dict_response = { + "embedding": [[random.random() for _ in range(768)] for _ in input], + "meta": { + "model": model + } + } + + return dict_response + + +class TestGoogleAIDocumentEmbedder: + def test_init_default(self, monkeypatch): + monkeypatch.setenv("GOOGLE_API_KEY", "fake-api-key") + embedder = GoogleAIDocumentEmbedder() + assert embedder.api_key.resolve_value() == "fake-api-key" + assert embedder.model == "text-embedding-004" + assert embedder.batch_size == 32 + assert embedder.progress_bar is True + assert embedder.meta_fields_to_embed == [] + assert embedder.embedding_separator == "\n" + + def test_init_with_parameters(self, monkeypatch): + embedder = GoogleAIDocumentEmbedder( + api_key=Secret.from_token("fake-api-key-2"), + model="model", + batch_size=64, + progress_bar=False, + meta_fields_to_embed=["test_field"], + embedding_separator=" | ", + ) + assert embedder.api_key.resolve_value() == "fake-api-key-2" + assert embedder.model == "model" + assert embedder.batch_size == 64 + assert embedder.progress_bar is False + assert embedder.meta_fields_to_embed == ["test_field"] + assert embedder.embedding_separator == " | " + + def test_init_with_parameters_and_env_vars(self, monkeypatch): + embedder = GoogleAIDocumentEmbedder( + api_key=Secret.from_token("fake-api-key-2"), + model="model", + batch_size=64, + progress_bar=False, + meta_fields_to_embed=["test_field"], + embedding_separator=" | ", + ) + assert embedder.api_key.resolve_value() == "fake-api-key-2" + assert embedder.model == "model" + assert embedder.batch_size == 64 + assert embedder.progress_bar is False + assert embedder.meta_fields_to_embed == ["test_field"] + assert embedder.embedding_separator == " | " + + def test_init_fail_wo_api_key(self, monkeypatch): + monkeypatch.delenv("GOOGLE_API_KEY", raising=False) + with pytest.raises(ValueError, match="None of the .* environment variables are set"): + GoogleAIDocumentEmbedder() + + def test_to_dict(self, monkeypatch): + monkeypatch.setenv("GOOGLE_API_KEY", "fake-api-key") + component = GoogleAIDocumentEmbedder() + data = component.to_dict() + assert data == { + 'type': 'haystack_integrations.components.embedders.google_genai.GoogleAIDocumentEmbedder', + 'init_parameters': { + 'model': 'text-embedding-004', + 'batch_size': 32, + 'progress_bar': True, + 'meta_fields_to_embed': [], + 'embedding_separator': '\n', + 'api_key': {'type': 'env_var', 'env_vars': ['GOOGLE_API_KEY'], 'strict': True}, + 'config': {'task_type': 'SEMANTIC_SIMILARITY'} + } + } + + def test_to_dict_with_custom_init_parameters(self, monkeypatch): + monkeypatch.setenv("ENV_VAR", "fake-api-key") + component = GoogleAIDocumentEmbedder( + api_key=Secret.from_env_var("ENV_VAR", strict=False), + model="model", + batch_size=64, + progress_bar=False, + meta_fields_to_embed=["test_field"], + embedding_separator=" | ", + ) + data = component.to_dict() + assert data == { + 'type': 'haystack_integrations.components.embedders.google_genai.GoogleAIDocumentEmbedder', + 'init_parameters': { + 'model': 'model', + 'batch_size': 64, + 'progress_bar': False, + 'meta_fields_to_embed': ['test_field'], + 'embedding_separator': ' | ', + 'api_key': {'type': 'env_var', 'env_vars': ['ENV_VAR'], 'strict': False}, + 'config': {'task_type': 'SEMANTIC_SIMILARITY'} + } + } + + def test_prepare_texts_to_embed_w_metadata(self): + documents = [ + Document(id=f"{i}", content=f"document number {i}:\ncontent", meta={ + "meta_field": f"meta_value {i}"}) + for i in range(5) + ] + + embedder = GoogleAIDocumentEmbedder( + api_key=Secret.from_token("fake-api-key"), meta_fields_to_embed=["meta_field"], embedding_separator=" | " + ) + + prepared_texts = embedder._prepare_texts_to_embed(documents) + + assert prepared_texts == { + "0": "meta_value 0 | document number 0:\ncontent", + "1": "meta_value 1 | document number 1:\ncontent", + "2": "meta_value 2 | document number 2:\ncontent", + "3": "meta_value 3 | document number 3:\ncontent", + "4": "meta_value 4 | document number 4:\ncontent", + } + + def test_run_wrong_input_format(self): + embedder = GoogleAIDocumentEmbedder( + api_key=Secret.from_token("fake-api-key")) + + # wrong formats + string_input = "text" + list_integers_input = [1, 2, 3] + + with pytest.raises(TypeError, match="GoogleAIDocumentEmbedder expects a list of Documents as input"): + embedder.run(documents=string_input) + + with pytest.raises(TypeError, match="GoogleAIDocumentEmbedder expects a list of Documents as input"): + embedder.run(documents=list_integers_input) + + def test_run_on_empty_list(self): + embedder = GoogleAIDocumentEmbedder( + api_key=Secret.from_token("fake-api-key")) + + empty_list_input = [] + result = embedder.run(documents=empty_list_input) + + assert result["documents"] is not None + assert not result["documents"] # empty list + + @pytest.mark.skipif(os.environ.get("GOOGLE_API_KEY", "") == "", reason="GOOGLE_API_KEY is not set") + @pytest.mark.integration + def test_run(self): + docs = [ + Document(content="I love cheese", meta={"topic": "Cuisine"}), + Document(content="A transformer is a deep learning architecture", meta={ + "topic": "ML"}), + ] + + model = "text-embedding-004" + + embedder = GoogleAIDocumentEmbedder(model=model, meta_fields_to_embed=[ + "topic"], embedding_separator=" | ") + + result = embedder.run(documents=docs) + documents_with_embeddings = result["documents"] + assert isinstance(documents_with_embeddings, list) + assert len(documents_with_embeddings) == len(docs) + for doc in documents_with_embeddings: + assert isinstance(doc, Document) + assert isinstance(doc.embedding, list) + assert len(doc.embedding) == 768 + assert all(isinstance(x, float) for x in doc.embedding) + + assert "text" in result["meta"]["model"] and "004" in result["meta"]["model"], ( + "The model name does not contain 'text' and '004'" + ) diff --git a/integrations/google_genai/tests/test_text_embedder.py b/integrations/google_genai/tests/test_text_embedder.py new file mode 100644 index 0000000000..85a4981d4e --- /dev/null +++ b/integrations/google_genai/tests/test_text_embedder.py @@ -0,0 +1,152 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +import os + +import pytest + +from haystack_integrations.components.embedders.google_genai import GoogleAITextEmbedder +from haystack.utils.auth import Secret + +from google.genai import types +from google.genai.types import EmbedContentResponse, ContentEmbedding, EmbedContentConfig + + +class TestGoogleAITextEmbedder: + def test_init_default(self, monkeypatch): + monkeypatch.setenv("GOOGLE_API_KEY", "fake-api-key") + embedder = GoogleAITextEmbedder() + + assert embedder._api_key.resolve_value() == "fake-api-key" + assert embedder._model_name == "text-embedding-004" + + def test_init_with_parameters(self): + embedder = GoogleAITextEmbedder( + api_key=Secret.from_token("fake-api-key"), + model="model", + ) + assert embedder._api_key.resolve_value() == "fake-api-key" + assert embedder._model_name == "model" + + def test_init_with_parameters_and_env_vars(self, monkeypatch): + embedder = GoogleAITextEmbedder( + api_key=Secret.from_token("fake-api-key"), + model="model", + config=types.EmbedContentConfig(task_type="SEMANTIC_SIMILARITY") + ) + assert embedder._api_key.resolve_value() == "fake-api-key" + assert embedder._model_name == "model" + assert embedder._config == types.EmbedContentConfig( + task_type="SEMANTIC_SIMILARITY") + + def test_init_fail_wo_api_key(self, monkeypatch): + monkeypatch.delenv("GOOGLE_API_KEY", raising=False) + with pytest.raises(ValueError, match="None of the .* environment variables are set"): + GoogleAITextEmbedder() + + def test_to_dict(self, monkeypatch): + monkeypatch.setenv("GOOGLE_API_KEY", "fake-api-key") + component = GoogleAITextEmbedder() + data = component.to_dict() + assert data == { + "type": "aystack_integrations.components.embedders.google_genai.GoogleAITextEmbedder", + "init_parameters": { + 'api_key': {'type': 'env_var', 'env_vars': ['GOOGLE_API_KEY'], 'strict': True}, + "model": "text-embedding-004", + 'config': {'task_type': 'SEMANTIC_SIMILARITY'} + }, + } + + def test_to_dict_with_custom_init_parameters(self, monkeypatch): + monkeypatch.setenv("ENV_VAR", "fake-api-key") + component = GoogleAITextEmbedder( + api_key=Secret.from_env_var("ENV_VAR", strict=False), + model="model", + config=types.EmbedContentConfig( + task_type="SEMANTIC_SIMILARITY" + ) + ) + data = component.to_dict() + assert data == { + 'type': 'aystack_integrations.components.embedders.google_genai.GoogleAITextEmbedder', + 'init_parameters': { + 'model': 'model', + 'api_key': { + 'type': 'env_var', + 'env_vars': ['ENV_VAR'], + 'strict': False + }, + 'config': {'task_type': 'SEMANTIC_SIMILARITY'} + } + } + + def test_from_dict(self, monkeypatch): + monkeypatch.setenv("GOOGLE_API_KEY", "fake-api-key") + data = { + "type": "aystack_integrations.components.embedders.google_genai.GoogleAITextEmbedder", + "init_parameters": { + "api_key": {'type': 'env_var', 'env_vars': ['GOOGLE_API_KEY'], 'strict': True}, + "model": "text-embedding-004", + }, + } + component = GoogleAITextEmbedder.from_dict(data) + assert component._api_key.resolve_value() == "fake-api-key" + assert component._model_name == "text-embedding-004" + + def test_prepare_input(self, monkeypatch): + monkeypatch.setenv("GOOGLE_API_KEY", "fake-api-key") + embedder = GoogleAITextEmbedder() + + contents = "The food was delicious" + prepared_input = embedder._prepare_input(contents) + assert prepared_input == { + "model": "text-embedding-004", + "contents": "The food was delicious", + "config": EmbedContentConfig( + http_options=None, + task_type='SEMANTIC_SIMILARITY', + title=None, + output_dimensionality=None, + mime_type=None, + auto_truncate=None + ) + } + + def test_prepare_output(self, monkeypatch): + monkeypatch.setenv("GOOGLE_API_KEY", "fake-api-key") + + response = EmbedContentResponse( + embeddings=[ContentEmbedding(values=[0.1, 0.2, 0.3])], + ) + + embedder = GoogleAITextEmbedder() + result = embedder._prepare_output(result=response) + assert result == { + "embedding": [0.1, 0.2, 0.3], + "meta": {"model": "text-embedding-004"}, + } + + def test_run_wrong_input_format(self): + embedder = GoogleAITextEmbedder( + api_key=Secret.from_token("fake-api-key")) + + list_integers_input = [1, 2, 3] + + with pytest.raises(TypeError, match="GoogleAITextEmbedder expects a string as an input"): + embedder.run(text=list_integers_input) + + @pytest.mark.skipif(os.environ.get("GOOGLE_API_KEY", "") == "", reason="GOOGLE_API_KEY is not set") + @pytest.mark.integration + def test_run(self): + model = "text-embedding-004" + + embedder = GoogleAITextEmbedder(model=model) + result = embedder.run(text="The food was delicious") + + assert len(result["embedding"]) == 768 + assert all(isinstance(x, float) for x in result["embedding"]) + + assert "text" in result["meta"]["model"] and "004" in result["meta"]["model"], ( + "The model name does not contain 'text' and '004'" + ) From 3de6d1e34dbf301da266a05f9e0a193198b51aec Mon Sep 17 00:00:00 2001 From: garybadwal Date: Thu, 5 Jun 2025 14:49:51 +0530 Subject: [PATCH 05/14] refactor: clean up imports and improve list handling in GoogleAIDocumentEmbedder and GoogleAITextEmbedder tests --- .../google_genai/document_embedder.py | 10 ++-- .../embedders/google_genai/text_embedder.py | 1 - .../tests/test_document_embedder.py | 52 +++++++++++-------- .../google_genai/tests/test_text_embedder.py | 35 ++++++------- 4 files changed, 52 insertions(+), 46 deletions(-) diff --git a/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py index f2ebe507de..58520a31b0 100644 --- a/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py +++ b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py @@ -3,14 +3,13 @@ # SPDX-License-Identifier: Apache-2.0 from typing import Any, Dict, List, Optional, Tuple -from more_itertools import batched -from tqdm import tqdm from google import genai from google.genai import types - from haystack import Document, component, default_from_dict, default_to_dict, logging from haystack.utils import Secret, deserialize_secrets_inplace +from more_itertools import batched +from tqdm import tqdm logger = logging.getLogger(__name__) @@ -132,7 +131,8 @@ def _prepare_texts_to_embed(self, documents: List[Document]) -> Dict[str, str]: texts_to_embed[doc.id] = ( self.embedding_separator.join( - meta_values_to_embed + [doc.content or ""]) + [*meta_values_to_embed, doc.content or ""] + ) ) return texts_to_embed @@ -181,7 +181,7 @@ def run(self, documents: List[Document]): - `documents`: A list of documents with embeddings. - `meta`: Information about the usage of the model. """ - if not isinstance(documents, list) or documents and not isinstance(documents[0], Document): + if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)): error_message_documents = ( "GoogleAIDocumentEmbedder expects a list of Documents as input. " "In case you want to embed a string, please use the OpenAITextEmbedder." diff --git a/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/text_embedder.py b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/text_embedder.py index c09e41b360..0c0d9fbcf9 100644 --- a/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/text_embedder.py +++ b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/text_embedder.py @@ -6,7 +6,6 @@ from google import genai from google.genai import types - from haystack import component, default_from_dict, default_to_dict, logging from haystack.utils import Secret, deserialize_secrets_inplace diff --git a/integrations/google_genai/tests/test_document_embedder.py b/integrations/google_genai/tests/test_document_embedder.py index 8017d8d4ef..534b4c9a31 100644 --- a/integrations/google_genai/tests/test_document_embedder.py +++ b/integrations/google_genai/tests/test_document_embedder.py @@ -7,14 +7,16 @@ from typing import List import pytest - from haystack import Document -from haystack_integrations.components.embedders.google_genai import GoogleAIDocumentEmbedder from haystack.utils.auth import Secret -def mock_google_response(input: List[str], model: str = "text-embedding-004", **kwargs) -> dict: +from haystack_integrations.components.embedders.google_genai import GoogleAIDocumentEmbedder + + +def mock_google_response(contents: List[str], model: str = "text-embedding-004", **kwargs) -> dict: + secure_random = random.SystemRandom() dict_response = { - "embedding": [[random.random() for _ in range(768)] for _ in input], + "embedding": [[secure_random.random() for _ in range(768)] for _ in contents], "meta": { "model": model } @@ -76,15 +78,18 @@ def test_to_dict(self, monkeypatch): component = GoogleAIDocumentEmbedder() data = component.to_dict() assert data == { - 'type': 'haystack_integrations.components.embedders.google_genai.GoogleAIDocumentEmbedder', - 'init_parameters': { - 'model': 'text-embedding-004', - 'batch_size': 32, - 'progress_bar': True, - 'meta_fields_to_embed': [], - 'embedding_separator': '\n', - 'api_key': {'type': 'env_var', 'env_vars': ['GOOGLE_API_KEY'], 'strict': True}, - 'config': {'task_type': 'SEMANTIC_SIMILARITY'} + "type": ( + "haystack_integrations.components.embedders." + "google_genai.document_embedder.GoogleAIDocumentEmbedder" + ), + "init_parameters": { + "model": "text-embedding-004", + "batch_size": 32, + "progress_bar": True, + "meta_fields_to_embed": [], + "embedding_separator": "\n", + "api_key": {"type": "env_var", "env_vars": ["GOOGLE_API_KEY"], "strict": True}, + "config": {"task_type": "SEMANTIC_SIMILARITY"} } } @@ -100,15 +105,18 @@ def test_to_dict_with_custom_init_parameters(self, monkeypatch): ) data = component.to_dict() assert data == { - 'type': 'haystack_integrations.components.embedders.google_genai.GoogleAIDocumentEmbedder', - 'init_parameters': { - 'model': 'model', - 'batch_size': 64, - 'progress_bar': False, - 'meta_fields_to_embed': ['test_field'], - 'embedding_separator': ' | ', - 'api_key': {'type': 'env_var', 'env_vars': ['ENV_VAR'], 'strict': False}, - 'config': {'task_type': 'SEMANTIC_SIMILARITY'} + "type": ( + "haystack_integrations.components.embedders." + "google_genai.document_embedder.GoogleAIDocumentEmbedder" + ), + "init_parameters": { + "model": "model", + "batch_size": 64, + "progress_bar": False, + "meta_fields_to_embed": ["test_field"], + "embedding_separator": " | ", + "api_key": {"type": "env_var", "env_vars": ["ENV_VAR"], "strict": False}, + "config": {"task_type": "SEMANTIC_SIMILARITY"} } } diff --git a/integrations/google_genai/tests/test_text_embedder.py b/integrations/google_genai/tests/test_text_embedder.py index 85a4981d4e..7327d3901e 100644 --- a/integrations/google_genai/tests/test_text_embedder.py +++ b/integrations/google_genai/tests/test_text_embedder.py @@ -5,12 +5,11 @@ import os import pytest - -from haystack_integrations.components.embedders.google_genai import GoogleAITextEmbedder +from google.genai import types +from google.genai.types import ContentEmbedding, EmbedContentConfig, EmbedContentResponse from haystack.utils.auth import Secret -from google.genai import types -from google.genai.types import EmbedContentResponse, ContentEmbedding, EmbedContentConfig +from haystack_integrations.components.embedders.google_genai import GoogleAITextEmbedder class TestGoogleAITextEmbedder: @@ -50,11 +49,11 @@ def test_to_dict(self, monkeypatch): component = GoogleAITextEmbedder() data = component.to_dict() assert data == { - "type": "aystack_integrations.components.embedders.google_genai.GoogleAITextEmbedder", + "type": "haystack_integrations.components.embedders.google_genai.text_embedder.GoogleAITextEmbedder", "init_parameters": { - 'api_key': {'type': 'env_var', 'env_vars': ['GOOGLE_API_KEY'], 'strict': True}, + "api_key": {"type": "env_var", "env_vars": ["GOOGLE_API_KEY"], "strict": True}, "model": "text-embedding-004", - 'config': {'task_type': 'SEMANTIC_SIMILARITY'} + "config": {"task_type": "SEMANTIC_SIMILARITY"} }, } @@ -69,24 +68,24 @@ def test_to_dict_with_custom_init_parameters(self, monkeypatch): ) data = component.to_dict() assert data == { - 'type': 'aystack_integrations.components.embedders.google_genai.GoogleAITextEmbedder', - 'init_parameters': { - 'model': 'model', - 'api_key': { - 'type': 'env_var', - 'env_vars': ['ENV_VAR'], - 'strict': False + "type": "haystack_integrations.components.embedders.google_genai.text_embedder.GoogleAITextEmbedder", + "init_parameters": { + "model": "model", + "api_key": { + "type": "env_var", + "env_vars": ["ENV_VAR"], + "strict": False }, - 'config': {'task_type': 'SEMANTIC_SIMILARITY'} + "config": {"task_type": "SEMANTIC_SIMILARITY"} } } def test_from_dict(self, monkeypatch): monkeypatch.setenv("GOOGLE_API_KEY", "fake-api-key") data = { - "type": "aystack_integrations.components.embedders.google_genai.GoogleAITextEmbedder", + "type": "haystack_integrations.components.embedders.google_genai.text_embedder.GoogleAITextEmbedder", "init_parameters": { - "api_key": {'type': 'env_var', 'env_vars': ['GOOGLE_API_KEY'], 'strict': True}, + "api_key": {"type": "env_var", "env_vars": ["GOOGLE_API_KEY"], "strict": True}, "model": "text-embedding-004", }, } @@ -105,7 +104,7 @@ def test_prepare_input(self, monkeypatch): "contents": "The food was delicious", "config": EmbedContentConfig( http_options=None, - task_type='SEMANTIC_SIMILARITY', + task_type="SEMANTIC_SIMILARITY", title=None, output_dimensionality=None, mime_type=None, From 9c6cb1a3b118178b542720ba6ca070144f0a39ec Mon Sep 17 00:00:00 2001 From: garybadwal Date: Thu, 5 Jun 2025 20:44:24 +0530 Subject: [PATCH 06/14] refactor: Rename classes and update imports for Google GenAI components --- .../embedders/google_genai/__init__.py | 6 +- .../google_genai/document_embedder.py | 64 +++++------ .../embedders/google_genai/text_embedder.py | 52 ++++----- .../tests/test_document_embedder.py | 91 +++++++++------- .../google_genai/tests/test_text_embedder.py | 100 ++++++++++-------- 5 files changed, 173 insertions(+), 140 deletions(-) diff --git a/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/__init__.py b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/__init__.py index 3bebddbb56..f426cd6287 100644 --- a/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/__init__.py +++ b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/__init__.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 -from .document_embedder import GoogleAIDocumentEmbedder -from .text_embedder import GoogleAITextEmbedder +from .document_embedder import GoogleGenAIDocumentEmbedder +from .text_embedder import GoogleGenAITextEmbedder -__all__ = ["GoogleAIDocumentEmbedder", "GoogleAITextEmbedder"] +__all__ = ["GoogleGenAIDocumentEmbedder", "GoogleGenAITextEmbedder"] diff --git a/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py index 58520a31b0..07c0c16f7e 100644 --- a/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py +++ b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, Union from google import genai from google.genai import types @@ -15,19 +15,19 @@ @component -class GoogleAIDocumentEmbedder: +class GoogleGenAIDocumentEmbedder: """ - Computes document embeddings using OpenAI models. + Computes document embeddings using Google AI models. ### Usage example ```python from haystack import Document - from haystack.components.embedders import GoogleAIDocumentEmbedder + from haystack_integrations.components.embedders import GoogleGenAIDocumentEmbedder doc = Document(content="I love pizza!") - document_embedder = GoogleAIDocumentEmbedder() + document_embedder = GoogleGenAIDocumentEmbedder() result = document_embedder.run([doc]) print(result['documents'][0].embedding) @@ -38,29 +38,35 @@ class GoogleAIDocumentEmbedder: def __init__( # pylint: disable=too-many-positional-arguments self, + *, api_key: Secret = Secret.from_env_var("GOOGLE_API_KEY"), model: str = "text-embedding-004", + prefix: str = "", + suffix: str = "", batch_size: int = 32, progress_bar: bool = True, meta_fields_to_embed: Optional[List[str]] = None, embedding_separator: str = "\n", - config: Optional[types.EmbedContentConfig] = types.EmbedContentConfig( - task_type="SEMANTIC_SIMILARITY"), + config: Optional[Dict[str, Any]] = None, ): """ - Creates an GoogleAIDocumentEmbedder component. + Creates an GoogleGenAIDocumentEmbedder component. - Before initializing the component, you can set the 'OPENAI_TIMEOUT' and 'OPENAI_MAX_RETRIES' + Before initializing the component, you can set the 'GoogleGenAI_TIMEOUT' and 'GoogleGenAI_MAX_RETRIES' environment variables to override the `timeout` and `max_retries` parameters respectively - in the OpenAI client. + in the GoogleGenAI client. :param api_key: - The OpenAI API key. - You can set it with an environment variable `OPENAI_API_KEY`, or pass with this parameter + The Google API key. + You can set it with the environment variable `GOOGLE_API_KEY`, or pass it via this parameter during initialization. :param model: The name of the model to use for calculating embeddings. The default model is `text-embedding-ada-002`. + :param prefix: + A string to add at the beginning of each text. + :param suffix: + A string to add at the end of each text. :param batch_size: Number of documents to embed at once. :param progress_bar: @@ -75,18 +81,14 @@ def __init__( # pylint: disable=too-many-positional-arguments """ self.api_key = api_key self.model = model + self.prefix = prefix + self.suffix = suffix self.batch_size = batch_size self.progress_bar = progress_bar self.meta_fields_to_embed = meta_fields_to_embed or [] self.embedding_separator = embedding_separator self.client = genai.Client(api_key=api_key.resolve_value()) - self.config = config - - def _get_telemetry_data(self) -> Dict[str, Any]: - """ - Data that is sent to Posthog for usage analytics. - """ - return {"model": self.model} + self.config = config if config is not None else {"task_type": "SEMANTIC_SIMILARITY"} def to_dict(self) -> Dict[str, Any]: """ @@ -98,16 +100,18 @@ def to_dict(self) -> Dict[str, Any]: return default_to_dict( self, model=self.model, + prefix=self.prefix, + suffix=self.suffix, batch_size=self.batch_size, progress_bar=self.progress_bar, meta_fields_to_embed=self.meta_fields_to_embed, embedding_separator=self.embedding_separator, api_key=self.api_key.to_dict(), - config=self.config.to_json_dict() + config=self.config, ) @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "GoogleAIDocumentEmbedder": + def from_dict(cls, data: Dict[str, Any]) -> "GoogleGenAIDocumentEmbedder": """ Deserializes the component from a dictionary. @@ -130,9 +134,7 @@ def _prepare_texts_to_embed(self, documents: List[Document]) -> Dict[str, str]: ] texts_to_embed[doc.id] = ( - self.embedding_separator.join( - [*meta_values_to_embed, doc.content or ""] - ) + self.prefix + self.embedding_separator.join([*meta_values_to_embed, doc.content or ""]) + self.suffix ) return texts_to_embed @@ -147,10 +149,9 @@ def _embed_batch(self, texts_to_embed: Dict[str, str], batch_size: int) -> Tuple for batch in tqdm( batched(texts_to_embed.items(), batch_size), disable=not self.progress_bar, desc="Calculating embeddings" ): - args: Dict[str, Any] = {"model": self.model, - "contents": [b[1] for b in batch]} + args: Dict[str, Any] = {"model": self.model, "contents": [b[1] for b in batch]} if self.config: - args["config"] = self.config + args["config"] = types.EmbedContentConfig(**self.config) if self.config else None try: response = self.client.models.embed_content(**args) @@ -169,7 +170,7 @@ def _embed_batch(self, texts_to_embed: Dict[str, str], batch_size: int) -> Tuple return all_embeddings, meta @component.output_types(documents=List[Document], meta=Dict[str, Any]) - def run(self, documents: List[Document]): + def run(self, documents: List[Document]) -> Dict[str, Union[List[Document], Dict[str, Any]]]: """ Embeds a list of documents. @@ -183,15 +184,14 @@ def run(self, documents: List[Document]): """ if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)): error_message_documents = ( - "GoogleAIDocumentEmbedder expects a list of Documents as input. " - "In case you want to embed a string, please use the OpenAITextEmbedder." + "GoogleGenAIDocumentEmbedder expects a list of Documents as input. " + "In case you want to embed a string, please use the GoogleGenAITextEmbedder." ) raise TypeError(error_message_documents) texts_to_embed = self._prepare_texts_to_embed(documents=documents) - embeddings, meta = self._embed_batch( - texts_to_embed=texts_to_embed, batch_size=self.batch_size) + embeddings, meta = self._embed_batch(texts_to_embed=texts_to_embed, batch_size=self.batch_size) for doc, emb in zip(documents, embeddings): doc.embedding = emb diff --git a/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/text_embedder.py b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/text_embedder.py index 0c0d9fbcf9..7f8608c1ec 100644 --- a/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/text_embedder.py +++ b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/text_embedder.py @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union from google import genai from google.genai import types @@ -13,20 +13,20 @@ @component -class GoogleAITextEmbedder: +class GoogleGenAITextEmbedder: """ - Embeds strings using OpenAI models. + Embeds strings using Google AI models. You can use it to embed user query and send it to an embedding Retriever. ### Usage example ```python - from haystack.components.embedders import GoogleAITextEmbedder + from haystack_integrations.components.embedders.google_genai import GoogleGenAITextEmbedder text_to_embed = "I love pizza!" - text_embedder = GoogleAITextEmbedder() + text_embedder = GoogleGenAITextEmbedder() print(text_embedder.run(text_to_embed)) @@ -38,22 +38,27 @@ class GoogleAITextEmbedder: def __init__( # pylint: disable=too-many-positional-arguments self, + *, api_key: Secret = Secret.from_env_var("GOOGLE_API_KEY"), model: str = "text-embedding-004", - config: Optional[types.EmbedContentConfig] = types.EmbedContentConfig( - task_type="SEMANTIC_SIMILARITY"), - + prefix: str = "", + suffix: str = "", + config: Optional[Dict[str, Any]] = None, ): """ - Creates an GoogleAITextEmbedder component. + Creates an GoogleGenAITextEmbedder component. :param api_key: The Google API key. - You can set it with an environment variable `GOOGLE_API_KEY`, or pass with this parameter + You can set it with the environment variable `GOOGLE_API_KEY`, or pass it via this parameter during initialization. :param model: The name of the model to use for calculating embeddings. The default model is `text-embedding-004`. + :param prefix: + A string to add at the beginning of each text to embed. + :param suffix: + A string to add at the end of each text to embed. :param config: A dictionary of keyword arguments to configure embedding content configuration `types.EmbedContentConfig`. For more information, see the [Google AI Task types](https://ai.google.dev/gemini-api/docs/embeddings#task-types). @@ -61,15 +66,11 @@ def __init__( # pylint: disable=too-many-positional-arguments self._api_key = api_key self._model_name = model - self._config = config + self._prefix = prefix + self._suffix = suffix + self._config = config if config is not None else {"task_type": "SEMANTIC_SIMILARITY"} self._client = genai.Client(api_key=api_key.resolve_value()) - def _get_telemetry_data(self) -> Dict[str, Any]: - """ - Data that is sent to Posthog for usage analytics. - """ - return {"model": self.model} - def to_dict(self) -> Dict[str, Any]: """ Serializes the component to a dictionary. @@ -81,11 +82,13 @@ def to_dict(self) -> Dict[str, Any]: self, model=self._model_name, api_key=self._api_key.to_dict(), - config=self._config.to_json_dict() + prefix=self._prefix, + suffix=self._suffix, + config=self._config, ) @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "GoogleAITextEmbedder": + def from_dict(cls, data: Dict[str, Any]) -> "GoogleGenAITextEmbedder": """ Deserializes the component from a dictionary. @@ -100,18 +103,17 @@ def from_dict(cls, data: Dict[str, Any]) -> "GoogleAITextEmbedder": def _prepare_input(self, text: str) -> Dict[str, Any]: if not isinstance(text, str): error_message_text = ( - "GoogleAITextEmbedder expects a string as an input. " + "GoogleGenAITextEmbedder expects a string as an input. " "In case you want to embed a list of Documents, please use the GoogleAIDocumentEmbedder." ) raise TypeError(error_message_text) - text_to_embed = text + text_to_embed = self._prefix + text + self._suffix - kwargs: Dict[str, Any] = { - "model": self._model_name, "contents": text_to_embed} + kwargs: Dict[str, Any] = {"model": self._model_name, "contents": text_to_embed} if self._config: - kwargs["config"] = self._config + kwargs["config"] = types.EmbedContentConfig(**self._config) return kwargs @@ -119,7 +121,7 @@ def _prepare_output(self, result: types.EmbedContentResponse) -> Dict[str, Any]: return {"embedding": result.embeddings[0].values, "meta": {"model": self._model_name}} @component.output_types(embedding=List[float], meta=Dict[str, Any]) - def run(self, text: str): + def run(self, text: str) -> Union[Dict[str, List[float]], Dict[str, Any]]: """ Embeds a single string. diff --git a/integrations/google_genai/tests/test_document_embedder.py b/integrations/google_genai/tests/test_document_embedder.py index 534b4c9a31..eedf2f13ad 100644 --- a/integrations/google_genai/tests/test_document_embedder.py +++ b/integrations/google_genai/tests/test_document_embedder.py @@ -10,129 +10,147 @@ from haystack import Document from haystack.utils.auth import Secret -from haystack_integrations.components.embedders.google_genai import GoogleAIDocumentEmbedder +from haystack_integrations.components.embedders.google_genai import GoogleGenAIDocumentEmbedder def mock_google_response(contents: List[str], model: str = "text-embedding-004", **kwargs) -> dict: secure_random = random.SystemRandom() dict_response = { "embedding": [[secure_random.random() for _ in range(768)] for _ in contents], - "meta": { - "model": model - } + "meta": {"model": model}, } return dict_response -class TestGoogleAIDocumentEmbedder: +class TestGoogleGenAIDocumentEmbedder: def test_init_default(self, monkeypatch): monkeypatch.setenv("GOOGLE_API_KEY", "fake-api-key") - embedder = GoogleAIDocumentEmbedder() + embedder = GoogleGenAIDocumentEmbedder() assert embedder.api_key.resolve_value() == "fake-api-key" assert embedder.model == "text-embedding-004" + assert embedder.prefix == "" + assert embedder.suffix == "" assert embedder.batch_size == 32 assert embedder.progress_bar is True assert embedder.meta_fields_to_embed == [] assert embedder.embedding_separator == "\n" + assert embedder.config == {"task_type": "SEMANTIC_SIMILARITY"} def test_init_with_parameters(self, monkeypatch): - embedder = GoogleAIDocumentEmbedder( + embedder = GoogleGenAIDocumentEmbedder( api_key=Secret.from_token("fake-api-key-2"), model="model", + prefix="prefix", + suffix="suffix", batch_size=64, progress_bar=False, meta_fields_to_embed=["test_field"], embedding_separator=" | ", + config={"task_type": "CLASSIFICATION"}, ) assert embedder.api_key.resolve_value() == "fake-api-key-2" assert embedder.model == "model" + assert embedder.prefix == "prefix" + assert embedder.suffix == "suffix" assert embedder.batch_size == 64 assert embedder.progress_bar is False assert embedder.meta_fields_to_embed == ["test_field"] assert embedder.embedding_separator == " | " + assert embedder.config == {"task_type": "CLASSIFICATION"} def test_init_with_parameters_and_env_vars(self, monkeypatch): - embedder = GoogleAIDocumentEmbedder( + embedder = GoogleGenAIDocumentEmbedder( api_key=Secret.from_token("fake-api-key-2"), model="model", + prefix="prefix", + suffix="suffix", batch_size=64, progress_bar=False, meta_fields_to_embed=["test_field"], embedding_separator=" | ", + config={"task_type": "CLASSIFICATION"}, ) assert embedder.api_key.resolve_value() == "fake-api-key-2" assert embedder.model == "model" + assert embedder.prefix == "prefix" + assert embedder.suffix == "suffix" assert embedder.batch_size == 64 assert embedder.progress_bar is False assert embedder.meta_fields_to_embed == ["test_field"] assert embedder.embedding_separator == " | " + assert embedder.config == {"task_type": "CLASSIFICATION"} def test_init_fail_wo_api_key(self, monkeypatch): monkeypatch.delenv("GOOGLE_API_KEY", raising=False) with pytest.raises(ValueError, match="None of the .* environment variables are set"): - GoogleAIDocumentEmbedder() + GoogleGenAIDocumentEmbedder() def test_to_dict(self, monkeypatch): monkeypatch.setenv("GOOGLE_API_KEY", "fake-api-key") - component = GoogleAIDocumentEmbedder() + component = GoogleGenAIDocumentEmbedder() data = component.to_dict() assert data == { "type": ( - "haystack_integrations.components.embedders." - "google_genai.document_embedder.GoogleAIDocumentEmbedder" + "haystack_integrations.components.embedders" + ".google_genai.document_embedder.GoogleGenAIDocumentEmbedder" ), "init_parameters": { "model": "text-embedding-004", + "prefix": "", + "suffix": "", "batch_size": 32, "progress_bar": True, "meta_fields_to_embed": [], "embedding_separator": "\n", "api_key": {"type": "env_var", "env_vars": ["GOOGLE_API_KEY"], "strict": True}, - "config": {"task_type": "SEMANTIC_SIMILARITY"} - } + "config": {"task_type": "SEMANTIC_SIMILARITY"}, + }, } def test_to_dict_with_custom_init_parameters(self, monkeypatch): monkeypatch.setenv("ENV_VAR", "fake-api-key") - component = GoogleAIDocumentEmbedder( + component = GoogleGenAIDocumentEmbedder( api_key=Secret.from_env_var("ENV_VAR", strict=False), model="model", + prefix="prefix", + suffix="suffix", batch_size=64, progress_bar=False, meta_fields_to_embed=["test_field"], embedding_separator=" | ", + config={"task_type": "CLASSIFICATION"}, ) data = component.to_dict() assert data == { "type": ( - "haystack_integrations.components.embedders." - "google_genai.document_embedder.GoogleAIDocumentEmbedder" + "haystack_integrations.components.embedders" + ".google_genai.document_embedder.GoogleGenAIDocumentEmbedder" ), "init_parameters": { "model": "model", + "prefix": "prefix", + "suffix": "suffix", "batch_size": 64, "progress_bar": False, "meta_fields_to_embed": ["test_field"], "embedding_separator": " | ", "api_key": {"type": "env_var", "env_vars": ["ENV_VAR"], "strict": False}, - "config": {"task_type": "SEMANTIC_SIMILARITY"} - } + "config": {"task_type": "CLASSIFICATION"}, + }, } def test_prepare_texts_to_embed_w_metadata(self): documents = [ - Document(id=f"{i}", content=f"document number {i}:\ncontent", meta={ - "meta_field": f"meta_value {i}"}) + Document(id=f"{i}", content=f"document number {i}:\ncontent", meta={"meta_field": f"meta_value {i}"}) for i in range(5) ] - embedder = GoogleAIDocumentEmbedder( + embedder = GoogleGenAIDocumentEmbedder( api_key=Secret.from_token("fake-api-key"), meta_fields_to_embed=["meta_field"], embedding_separator=" | " ) prepared_texts = embedder._prepare_texts_to_embed(documents) - assert prepared_texts == { "0": "meta_value 0 | document number 0:\ncontent", "1": "meta_value 1 | document number 1:\ncontent", @@ -142,22 +160,20 @@ def test_prepare_texts_to_embed_w_metadata(self): } def test_run_wrong_input_format(self): - embedder = GoogleAIDocumentEmbedder( - api_key=Secret.from_token("fake-api-key")) + embedder = GoogleGenAIDocumentEmbedder(api_key=Secret.from_token("fake-api-key")) # wrong formats string_input = "text" list_integers_input = [1, 2, 3] - with pytest.raises(TypeError, match="GoogleAIDocumentEmbedder expects a list of Documents as input"): + with pytest.raises(TypeError, match="GoogleGenAIDocumentEmbedder expects a list of Documents as input"): embedder.run(documents=string_input) - with pytest.raises(TypeError, match="GoogleAIDocumentEmbedder expects a list of Documents as input"): + with pytest.raises(TypeError, match="GoogleGenAIDocumentEmbedder expects a list of Documents as input"): embedder.run(documents=list_integers_input) def test_run_on_empty_list(self): - embedder = GoogleAIDocumentEmbedder( - api_key=Secret.from_token("fake-api-key")) + embedder = GoogleGenAIDocumentEmbedder(api_key=Secret.from_token("fake-api-key")) empty_list_input = [] result = embedder.run(documents=empty_list_input) @@ -165,19 +181,20 @@ def test_run_on_empty_list(self): assert result["documents"] is not None assert not result["documents"] # empty list - @pytest.mark.skipif(os.environ.get("GOOGLE_API_KEY", "") == "", reason="GOOGLE_API_KEY is not set") + @pytest.mark.skipif( + not os.environ.get("GOOGLE_API_KEY", None), + reason="Export an env var called GOOGLE_API_KEY containing the Google API key to run this test.", + ) @pytest.mark.integration def test_run(self): docs = [ Document(content="I love cheese", meta={"topic": "Cuisine"}), - Document(content="A transformer is a deep learning architecture", meta={ - "topic": "ML"}), + Document(content="A transformer is a deep learning architecture", meta={"topic": "ML"}), ] model = "text-embedding-004" - embedder = GoogleAIDocumentEmbedder(model=model, meta_fields_to_embed=[ - "topic"], embedding_separator=" | ") + embedder = GoogleGenAIDocumentEmbedder(model=model, meta_fields_to_embed=["topic"], embedding_separator=" | ") result = embedder.run(documents=docs) documents_with_embeddings = result["documents"] @@ -189,6 +206,6 @@ def test_run(self): assert len(doc.embedding) == 768 assert all(isinstance(x, float) for x in doc.embedding) - assert "text" in result["meta"]["model"] and "004" in result["meta"]["model"], ( - "The model name does not contain 'text' and '004'" - ) + assert ( + "text" in result["meta"]["model"] and "004" in result["meta"]["model"] + ), "The model name does not contain 'text' and '004'" diff --git a/integrations/google_genai/tests/test_text_embedder.py b/integrations/google_genai/tests/test_text_embedder.py index 7327d3901e..e5af84f664 100644 --- a/integrations/google_genai/tests/test_text_embedder.py +++ b/integrations/google_genai/tests/test_text_embedder.py @@ -5,97 +5,109 @@ import os import pytest -from google.genai import types from google.genai.types import ContentEmbedding, EmbedContentConfig, EmbedContentResponse from haystack.utils.auth import Secret -from haystack_integrations.components.embedders.google_genai import GoogleAITextEmbedder +from haystack_integrations.components.embedders.google_genai import GoogleGenAITextEmbedder -class TestGoogleAITextEmbedder: +class TestGoogleGenAITextEmbedder: def test_init_default(self, monkeypatch): monkeypatch.setenv("GOOGLE_API_KEY", "fake-api-key") - embedder = GoogleAITextEmbedder() + embedder = GoogleGenAITextEmbedder() assert embedder._api_key.resolve_value() == "fake-api-key" assert embedder._model_name == "text-embedding-004" + assert embedder._prefix == "" + assert embedder._suffix == "" + assert embedder._config == {"task_type": "SEMANTIC_SIMILARITY"} def test_init_with_parameters(self): - embedder = GoogleAITextEmbedder( + embedder = GoogleGenAITextEmbedder( api_key=Secret.from_token("fake-api-key"), model="model", + prefix="prefix", + suffix="suffix", + config={"task_type": "CLASSIFICATION"}, ) assert embedder._api_key.resolve_value() == "fake-api-key" assert embedder._model_name == "model" + assert embedder._prefix == "prefix" + assert embedder._suffix == "suffix" + assert embedder._config == {"task_type": "CLASSIFICATION"} def test_init_with_parameters_and_env_vars(self, monkeypatch): - embedder = GoogleAITextEmbedder( + embedder = GoogleGenAITextEmbedder( api_key=Secret.from_token("fake-api-key"), model="model", - config=types.EmbedContentConfig(task_type="SEMANTIC_SIMILARITY") + prefix="prefix", + suffix="suffix", + config={"task_type": "CLASSIFICATION"}, ) assert embedder._api_key.resolve_value() == "fake-api-key" assert embedder._model_name == "model" - assert embedder._config == types.EmbedContentConfig( - task_type="SEMANTIC_SIMILARITY") - - def test_init_fail_wo_api_key(self, monkeypatch): - monkeypatch.delenv("GOOGLE_API_KEY", raising=False) - with pytest.raises(ValueError, match="None of the .* environment variables are set"): - GoogleAITextEmbedder() + assert embedder._prefix == "prefix" + assert embedder._suffix == "suffix" + assert embedder._config == {"task_type": "CLASSIFICATION"} def test_to_dict(self, monkeypatch): monkeypatch.setenv("GOOGLE_API_KEY", "fake-api-key") - component = GoogleAITextEmbedder() + component = GoogleGenAITextEmbedder() data = component.to_dict() assert data == { - "type": "haystack_integrations.components.embedders.google_genai.text_embedder.GoogleAITextEmbedder", + "type": "haystack_integrations.components.embedders.google_genai.text_embedder.GoogleGenAITextEmbedder", "init_parameters": { "api_key": {"type": "env_var", "env_vars": ["GOOGLE_API_KEY"], "strict": True}, "model": "text-embedding-004", - "config": {"task_type": "SEMANTIC_SIMILARITY"} + "prefix": "", + "suffix": "", + "config": {"task_type": "SEMANTIC_SIMILARITY"}, }, } def test_to_dict_with_custom_init_parameters(self, monkeypatch): monkeypatch.setenv("ENV_VAR", "fake-api-key") - component = GoogleAITextEmbedder( + component = GoogleGenAITextEmbedder( api_key=Secret.from_env_var("ENV_VAR", strict=False), model="model", - config=types.EmbedContentConfig( - task_type="SEMANTIC_SIMILARITY" - ) + prefix="prefix", + suffix="suffix", + config={"task_type": "CLASSIFICATION"}, ) data = component.to_dict() assert data == { - "type": "haystack_integrations.components.embedders.google_genai.text_embedder.GoogleAITextEmbedder", + "type": "haystack_integrations.components.embedders.google_genai.text_embedder.GoogleGenAITextEmbedder", "init_parameters": { "model": "model", - "api_key": { - "type": "env_var", - "env_vars": ["ENV_VAR"], - "strict": False - }, - "config": {"task_type": "SEMANTIC_SIMILARITY"} - } + "api_key": {"type": "env_var", "env_vars": ["ENV_VAR"], "strict": False}, + "prefix": "prefix", + "suffix": "suffix", + "config": {"task_type": "CLASSIFICATION"}, + }, } def test_from_dict(self, monkeypatch): monkeypatch.setenv("GOOGLE_API_KEY", "fake-api-key") data = { - "type": "haystack_integrations.components.embedders.google_genai.text_embedder.GoogleAITextEmbedder", + "type": "haystack_integrations.components.embedders.google_genai.text_embedder.GoogleGenAITextEmbedder", "init_parameters": { "api_key": {"type": "env_var", "env_vars": ["GOOGLE_API_KEY"], "strict": True}, "model": "text-embedding-004", + "prefix": "", + "suffix": "", + "config": {"task_type": "CLASSIFICATION"}, }, } - component = GoogleAITextEmbedder.from_dict(data) + component = GoogleGenAITextEmbedder.from_dict(data) assert component._api_key.resolve_value() == "fake-api-key" assert component._model_name == "text-embedding-004" + assert component._prefix == "" + assert component._suffix == "" + assert component._config == {"task_type": "CLASSIFICATION"} def test_prepare_input(self, monkeypatch): monkeypatch.setenv("GOOGLE_API_KEY", "fake-api-key") - embedder = GoogleAITextEmbedder() + embedder = GoogleGenAITextEmbedder() contents = "The food was delicious" prepared_input = embedder._prepare_input(contents) @@ -108,8 +120,8 @@ def test_prepare_input(self, monkeypatch): title=None, output_dimensionality=None, mime_type=None, - auto_truncate=None - ) + auto_truncate=None, + ), } def test_prepare_output(self, monkeypatch): @@ -119,7 +131,7 @@ def test_prepare_output(self, monkeypatch): embeddings=[ContentEmbedding(values=[0.1, 0.2, 0.3])], ) - embedder = GoogleAITextEmbedder() + embedder = GoogleGenAITextEmbedder() result = embedder._prepare_output(result=response) assert result == { "embedding": [0.1, 0.2, 0.3], @@ -127,25 +139,27 @@ def test_prepare_output(self, monkeypatch): } def test_run_wrong_input_format(self): - embedder = GoogleAITextEmbedder( - api_key=Secret.from_token("fake-api-key")) + embedder = GoogleGenAITextEmbedder(api_key=Secret.from_token("fake-api-key")) list_integers_input = [1, 2, 3] - with pytest.raises(TypeError, match="GoogleAITextEmbedder expects a string as an input"): + with pytest.raises(TypeError, match="GoogleGenAITextEmbedder expects a string as an input"): embedder.run(text=list_integers_input) - @pytest.mark.skipif(os.environ.get("GOOGLE_API_KEY", "") == "", reason="GOOGLE_API_KEY is not set") + @pytest.mark.skipif( + not os.environ.get("GOOGLE_API_KEY", None), + reason="Export an env var called GOOGLE_API_KEY containing the Google API key to run this test.", + ) @pytest.mark.integration def test_run(self): model = "text-embedding-004" - embedder = GoogleAITextEmbedder(model=model) + embedder = GoogleGenAITextEmbedder(model=model) result = embedder.run(text="The food was delicious") assert len(result["embedding"]) == 768 assert all(isinstance(x, float) for x in result["embedding"]) - assert "text" in result["meta"]["model"] and "004" in result["meta"]["model"], ( - "The model name does not contain 'text' and '004'" - ) + assert ( + "text" in result["meta"]["model"] and "004" in result["meta"]["model"] + ), "The model name does not contain 'text' and '004'" From 89bb3becf0a57ef5ad742036c98dc7ebda0f467b Mon Sep 17 00:00:00 2001 From: garybadwal Date: Thu, 5 Jun 2025 20:47:23 +0530 Subject: [PATCH 07/14] feat: Add additional modules for Google GenAI embedders in config --- integrations/google_genai/pydoc/config.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/integrations/google_genai/pydoc/config.yml b/integrations/google_genai/pydoc/config.yml index e87f53cd0b..095a67c077 100644 --- a/integrations/google_genai/pydoc/config.yml +++ b/integrations/google_genai/pydoc/config.yml @@ -3,6 +3,8 @@ loaders: search_path: [../src] modules: [ "haystack_integrations.components.generators.google_genai.chat.chat_generator", + "haystack_integrations.components.embedders.google_genai.document_embedder", + "haystack_integrations.components.embedders.google_genai.text_embedder" ] ignore_when_discovered: ["__init__"] processors: From f2d2a0c1929effd8aaace4efb4de87588d65a4f9 Mon Sep 17 00:00:00 2001 From: garybadwal Date: Fri, 6 Jun 2025 18:16:33 +0530 Subject: [PATCH 08/14] chore: add 'more-itertools' to lint environment dependencies --- integrations/google_genai/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/google_genai/pyproject.toml b/integrations/google_genai/pyproject.toml index 41021b5ff6..e3c94cc601 100644 --- a/integrations/google_genai/pyproject.toml +++ b/integrations/google_genai/pyproject.toml @@ -74,7 +74,7 @@ types = "mypy --install-types --non-interactive --explicit-package-bases {args:s [tool.hatch.envs.lint] installer = "uv" detached = true -dependencies = ["pip", "black>=23.1.0", "mypy>=1.0.0", "ruff>=0.0.243"] +dependencies = ["pip", "black>=23.1.0", "mypy>=1.0.0", "ruff>=0.0.243", "more-itertools"] [tool.hatch.envs.lint.scripts] typing = "mypy --install-types --non-interactive --explicit-package-bases {args:src/ tests}" From 171ab378b4336d04c6c784a2b52bfb4889a528bd Mon Sep 17 00:00:00 2001 From: garybadwal Date: Fri, 6 Jun 2025 18:27:59 +0530 Subject: [PATCH 09/14] refactor: update GoogleGenAIDocumentEmbedder and GoogleGenAITextEmbedder to use private attributes for initialization --- .../google_genai/document_embedder.py | 78 +++++++++---------- .../embedders/google_genai/text_embedder.py | 3 +- .../tests/test_document_embedder.py | 54 ++++++------- 3 files changed, 64 insertions(+), 71 deletions(-) diff --git a/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py index 07c0c16f7e..ff457891fe 100644 --- a/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py +++ b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py @@ -36,7 +36,7 @@ class GoogleGenAIDocumentEmbedder: ``` """ - def __init__( # pylint: disable=too-many-positional-arguments + def __init__( self, *, api_key: Secret = Secret.from_env_var("GOOGLE_API_KEY"), @@ -52,10 +52,6 @@ def __init__( # pylint: disable=too-many-positional-arguments """ Creates an GoogleGenAIDocumentEmbedder component. - Before initializing the component, you can set the 'GoogleGenAI_TIMEOUT' and 'GoogleGenAI_MAX_RETRIES' - environment variables to override the `timeout` and `max_retries` parameters respectively - in the GoogleGenAI client. - :param api_key: The Google API key. You can set it with the environment variable `GOOGLE_API_KEY`, or pass it via this parameter @@ -77,18 +73,19 @@ def __init__( # pylint: disable=too-many-positional-arguments Separator used to concatenate the metadata fields to the document text. :param config: A dictionary of keyword arguments to configure embedding content configuration `types.EmbedContentConfig`. + If not specified, it defaults to {"task_type": "SEMANTIC_SIMILARITY"}. For more information, see the [Google AI Task types](https://ai.google.dev/gemini-api/docs/embeddings#task-types). """ - self.api_key = api_key - self.model = model - self.prefix = prefix - self.suffix = suffix - self.batch_size = batch_size - self.progress_bar = progress_bar - self.meta_fields_to_embed = meta_fields_to_embed or [] - self.embedding_separator = embedding_separator - self.client = genai.Client(api_key=api_key.resolve_value()) - self.config = config if config is not None else {"task_type": "SEMANTIC_SIMILARITY"} + self._api_key = api_key + self._model = model + self._prefix = prefix + self._suffix = suffix + self._batch_size = batch_size + self._progress_bar = progress_bar + self._meta_fields_to_embed = meta_fields_to_embed or [] + self._embedding_separator = embedding_separator + self._client = genai.Client(api_key=api_key.resolve_value()) + self._config = config if config is not None else {"task_type": "SEMANTIC_SIMILARITY"} def to_dict(self) -> Dict[str, Any]: """ @@ -99,15 +96,15 @@ def to_dict(self) -> Dict[str, Any]: """ return default_to_dict( self, - model=self.model, - prefix=self.prefix, - suffix=self.suffix, - batch_size=self.batch_size, - progress_bar=self.progress_bar, - meta_fields_to_embed=self.meta_fields_to_embed, - embedding_separator=self.embedding_separator, - api_key=self.api_key.to_dict(), - config=self.config, + model=self._model, + prefix=self._prefix, + suffix=self._suffix, + batch_size=self._batch_size, + progress_bar=self._progress_bar, + meta_fields_to_embed=self._meta_fields_to_embed, + embedding_separator=self._embedding_separator, + api_key=self._api_key.to_dict(), + config=self._config, ) @classmethod @@ -127,19 +124,20 @@ def _prepare_texts_to_embed(self, documents: List[Document]) -> Dict[str, str]: """ Prepare the texts to embed by concatenating the Document text with the metadata fields to embed. """ - texts_to_embed = {} + texts_to_embed: List[str] = [] for doc in documents: meta_values_to_embed = [ - str(doc.meta[key]) for key in self.meta_fields_to_embed if key in doc.meta and doc.meta[key] is not None + str(doc.meta[key]) for key in self._meta_fields_to_embed if key in doc.meta and doc.meta[key] is not None ] - texts_to_embed[doc.id] = ( - self.prefix + self.embedding_separator.join([*meta_values_to_embed, doc.content or ""]) + self.suffix + text_to_embed = ( + self._prefix + self._embedding_separator.join([*meta_values_to_embed, doc.content or ""]) + self._suffix ) + texts_to_embed.append(text_to_embed) return texts_to_embed - def _embed_batch(self, texts_to_embed: Dict[str, str], batch_size: int) -> Tuple[List[List[float]], Dict[str, Any]]: + def _embed_batch(self, texts_to_embed: List[str], batch_size: int) -> Tuple[List[List[float]], Dict[str, Any]]: """ Embed a list of texts in batches. """ @@ -147,25 +145,19 @@ def _embed_batch(self, texts_to_embed: Dict[str, str], batch_size: int) -> Tuple all_embeddings = [] meta: Dict[str, Any] = {} for batch in tqdm( - batched(texts_to_embed.items(), batch_size), disable=not self.progress_bar, desc="Calculating embeddings" + batched(texts_to_embed, batch_size), disable=not self._progress_bar, desc="Calculating embeddings" ): - args: Dict[str, Any] = {"model": self.model, "contents": [b[1] for b in batch]} - if self.config: - args["config"] = types.EmbedContentConfig(**self.config) if self.config else None - - try: - response = self.client.models.embed_content(**args) - except Exception as exc: - ids = ", ".join(b[0] for b in batch) - msg = "Failed embedding of documents {ids} caused by {exc}" - logger.exception(msg, ids=ids, exc=exc) - continue + args: Dict[str, Any] = {"model": self._model, "contents": [b[1] for b in batch]} + if self._config: + args["config"] = types.EmbedContentConfig(**self._config) if self._config else None + + response = self._client.models.embed_content(**args) embeddings = [el.values for el in response.embeddings] all_embeddings.extend(embeddings) if "model" not in meta: - meta["model"] = self.model + meta["model"] = self._model return all_embeddings, meta @@ -191,7 +183,7 @@ def run(self, documents: List[Document]) -> Dict[str, Union[List[Document], Dict texts_to_embed = self._prepare_texts_to_embed(documents=documents) - embeddings, meta = self._embed_batch(texts_to_embed=texts_to_embed, batch_size=self.batch_size) + embeddings, meta = self._embed_batch(texts_to_embed=texts_to_embed, batch_size=self._batch_size) for doc, emb in zip(documents, embeddings): doc.embedding = emb diff --git a/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/text_embedder.py b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/text_embedder.py index 7f8608c1ec..415d5fc21d 100644 --- a/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/text_embedder.py +++ b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/text_embedder.py @@ -36,7 +36,7 @@ class GoogleGenAITextEmbedder: ``` """ - def __init__( # pylint: disable=too-many-positional-arguments + def __init__( self, *, api_key: Secret = Secret.from_env_var("GOOGLE_API_KEY"), @@ -61,6 +61,7 @@ def __init__( # pylint: disable=too-many-positional-arguments A string to add at the end of each text to embed. :param config: A dictionary of keyword arguments to configure embedding content configuration `types.EmbedContentConfig`. + If not specified, it defaults to {"task_type": "SEMANTIC_SIMILARITY"}. For more information, see the [Google AI Task types](https://ai.google.dev/gemini-api/docs/embeddings#task-types). """ diff --git a/integrations/google_genai/tests/test_document_embedder.py b/integrations/google_genai/tests/test_document_embedder.py index eedf2f13ad..0d59ba462a 100644 --- a/integrations/google_genai/tests/test_document_embedder.py +++ b/integrations/google_genai/tests/test_document_embedder.py @@ -27,15 +27,15 @@ class TestGoogleGenAIDocumentEmbedder: def test_init_default(self, monkeypatch): monkeypatch.setenv("GOOGLE_API_KEY", "fake-api-key") embedder = GoogleGenAIDocumentEmbedder() - assert embedder.api_key.resolve_value() == "fake-api-key" - assert embedder.model == "text-embedding-004" - assert embedder.prefix == "" - assert embedder.suffix == "" - assert embedder.batch_size == 32 - assert embedder.progress_bar is True - assert embedder.meta_fields_to_embed == [] - assert embedder.embedding_separator == "\n" - assert embedder.config == {"task_type": "SEMANTIC_SIMILARITY"} + assert embedder._api_key.resolve_value() == "fake-api-key" + assert embedder._model == "text-embedding-004" + assert embedder._prefix == "" + assert embedder._suffix == "" + assert embedder._batch_size == 32 + assert embedder._progress_bar is True + assert embedder._meta_fields_to_embed == [] + assert embedder._embedding_separator == "\n" + assert embedder._config == {"task_type": "SEMANTIC_SIMILARITY"} def test_init_with_parameters(self, monkeypatch): embedder = GoogleGenAIDocumentEmbedder( @@ -49,15 +49,15 @@ def test_init_with_parameters(self, monkeypatch): embedding_separator=" | ", config={"task_type": "CLASSIFICATION"}, ) - assert embedder.api_key.resolve_value() == "fake-api-key-2" - assert embedder.model == "model" - assert embedder.prefix == "prefix" - assert embedder.suffix == "suffix" - assert embedder.batch_size == 64 - assert embedder.progress_bar is False - assert embedder.meta_fields_to_embed == ["test_field"] - assert embedder.embedding_separator == " | " - assert embedder.config == {"task_type": "CLASSIFICATION"} + assert embedder._api_key.resolve_value() == "fake-api-key-2" + assert embedder._model == "model" + assert embedder._prefix == "prefix" + assert embedder._suffix == "suffix" + assert embedder._batch_size == 64 + assert embedder._progress_bar is False + assert embedder._meta_fields_to_embed == ["test_field"] + assert embedder._embedding_separator == " | " + assert embedder._config == {"task_type": "CLASSIFICATION"} def test_init_with_parameters_and_env_vars(self, monkeypatch): embedder = GoogleGenAIDocumentEmbedder( @@ -71,15 +71,15 @@ def test_init_with_parameters_and_env_vars(self, monkeypatch): embedding_separator=" | ", config={"task_type": "CLASSIFICATION"}, ) - assert embedder.api_key.resolve_value() == "fake-api-key-2" - assert embedder.model == "model" - assert embedder.prefix == "prefix" - assert embedder.suffix == "suffix" - assert embedder.batch_size == 64 - assert embedder.progress_bar is False - assert embedder.meta_fields_to_embed == ["test_field"] - assert embedder.embedding_separator == " | " - assert embedder.config == {"task_type": "CLASSIFICATION"} + assert embedder._api_key.resolve_value() == "fake-api-key-2" + assert embedder._model == "model" + assert embedder._prefix == "prefix" + assert embedder._suffix == "suffix" + assert embedder._batch_size == 64 + assert embedder._progress_bar is False + assert embedder._meta_fields_to_embed == ["test_field"] + assert embedder._embedding_separator == " | " + assert embedder._config == {"task_type": "CLASSIFICATION"} def test_init_fail_wo_api_key(self, monkeypatch): monkeypatch.delenv("GOOGLE_API_KEY", raising=False) From f20bdffcd5bd8a10477ca607c0706bad42e35cff Mon Sep 17 00:00:00 2001 From: garybadwal Date: Fri, 6 Jun 2025 18:34:39 +0530 Subject: [PATCH 10/14] refactor: update _prepare_texts_to_embed to return a list instead of a dictionary --- .../embedders/google_genai/document_embedder.py | 5 ++++- .../google_genai/tests/test_document_embedder.py | 14 +++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py index ff457891fe..fcd0d9f441 100644 --- a/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py +++ b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py @@ -127,7 +127,10 @@ def _prepare_texts_to_embed(self, documents: List[Document]) -> Dict[str, str]: texts_to_embed: List[str] = [] for doc in documents: meta_values_to_embed = [ - str(doc.meta[key]) for key in self._meta_fields_to_embed if key in doc.meta and doc.meta[key] is not None + str(doc.meta[key]) + for key in + self._meta_fields_to_embed + if key in doc.meta and doc.meta[key] is not None ] text_to_embed = ( diff --git a/integrations/google_genai/tests/test_document_embedder.py b/integrations/google_genai/tests/test_document_embedder.py index 0d59ba462a..21936d2f6c 100644 --- a/integrations/google_genai/tests/test_document_embedder.py +++ b/integrations/google_genai/tests/test_document_embedder.py @@ -151,13 +151,13 @@ def test_prepare_texts_to_embed_w_metadata(self): ) prepared_texts = embedder._prepare_texts_to_embed(documents) - assert prepared_texts == { - "0": "meta_value 0 | document number 0:\ncontent", - "1": "meta_value 1 | document number 1:\ncontent", - "2": "meta_value 2 | document number 2:\ncontent", - "3": "meta_value 3 | document number 3:\ncontent", - "4": "meta_value 4 | document number 4:\ncontent", - } + assert prepared_texts == [ + 'meta_value 0 | document number 0:\ncontent', + 'meta_value 1 | document number 1:\ncontent', + 'meta_value 2 | document number 2:\ncontent', + 'meta_value 3 | document number 3:\ncontent', + 'meta_value 4 | document number 4:\ncontent' + ] def test_run_wrong_input_format(self): embedder = GoogleGenAIDocumentEmbedder(api_key=Secret.from_token("fake-api-key")) From 38525c05b63d688cd545d67b1c9f539cebd1dfc5 Mon Sep 17 00:00:00 2001 From: garybadwal Date: Fri, 6 Jun 2025 18:38:14 +0530 Subject: [PATCH 11/14] refactor: format code for better readability and consistency in document embedder --- .../google_genai/document_embedder.py | 20 +++++++++------ .../tests/test_document_embedder.py | 25 +++++++++++-------- 2 files changed, 27 insertions(+), 18 deletions(-) diff --git a/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py index fcd0d9f441..f38082b332 100644 --- a/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py +++ b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py @@ -85,7 +85,8 @@ def __init__( self._meta_fields_to_embed = meta_fields_to_embed or [] self._embedding_separator = embedding_separator self._client = genai.Client(api_key=api_key.resolve_value()) - self._config = config if config is not None else {"task_type": "SEMANTIC_SIMILARITY"} + self._config = config if config is not None else { + "task_type": "SEMANTIC_SIMILARITY"} def to_dict(self) -> Dict[str, Any]: """ @@ -127,14 +128,14 @@ def _prepare_texts_to_embed(self, documents: List[Document]) -> Dict[str, str]: texts_to_embed: List[str] = [] for doc in documents: meta_values_to_embed = [ - str(doc.meta[key]) - for key in - self._meta_fields_to_embed + str(doc.meta[key]) + for key in self._meta_fields_to_embed if key in doc.meta and doc.meta[key] is not None ] text_to_embed = ( - self._prefix + self._embedding_separator.join([*meta_values_to_embed, doc.content or ""]) + self._suffix + self._prefix + self._embedding_separator.join( + [*meta_values_to_embed, doc.content or ""]) + self._suffix ) texts_to_embed.append(text_to_embed) @@ -150,9 +151,11 @@ def _embed_batch(self, texts_to_embed: List[str], batch_size: int) -> Tuple[List for batch in tqdm( batched(texts_to_embed, batch_size), disable=not self._progress_bar, desc="Calculating embeddings" ): - args: Dict[str, Any] = {"model": self._model, "contents": [b[1] for b in batch]} + args: Dict[str, Any] = {"model": self._model, + "contents": [b[1] for b in batch]} if self._config: - args["config"] = types.EmbedContentConfig(**self._config) if self._config else None + args["config"] = types.EmbedContentConfig( + **self._config) if self._config else None response = self._client.models.embed_content(**args) @@ -186,7 +189,8 @@ def run(self, documents: List[Document]) -> Dict[str, Union[List[Document], Dict texts_to_embed = self._prepare_texts_to_embed(documents=documents) - embeddings, meta = self._embed_batch(texts_to_embed=texts_to_embed, batch_size=self._batch_size) + embeddings, meta = self._embed_batch( + texts_to_embed=texts_to_embed, batch_size=self._batch_size) for doc, emb in zip(documents, embeddings): doc.embedding = emb diff --git a/integrations/google_genai/tests/test_document_embedder.py b/integrations/google_genai/tests/test_document_embedder.py index 21936d2f6c..46da7092c2 100644 --- a/integrations/google_genai/tests/test_document_embedder.py +++ b/integrations/google_genai/tests/test_document_embedder.py @@ -142,7 +142,8 @@ def test_to_dict_with_custom_init_parameters(self, monkeypatch): def test_prepare_texts_to_embed_w_metadata(self): documents = [ - Document(id=f"{i}", content=f"document number {i}:\ncontent", meta={"meta_field": f"meta_value {i}"}) + Document(id=f"{i}", content=f"document number {i}:\ncontent", meta={ + "meta_field": f"meta_value {i}"}) for i in range(5) ] @@ -152,15 +153,16 @@ def test_prepare_texts_to_embed_w_metadata(self): prepared_texts = embedder._prepare_texts_to_embed(documents) assert prepared_texts == [ - 'meta_value 0 | document number 0:\ncontent', - 'meta_value 1 | document number 1:\ncontent', - 'meta_value 2 | document number 2:\ncontent', - 'meta_value 3 | document number 3:\ncontent', - 'meta_value 4 | document number 4:\ncontent' + "meta_value 0 | document number 0:\ncontent", + "meta_value 1 | document number 1:\ncontent", + "meta_value 2 | document number 2:\ncontent", + "meta_value 3 | document number 3:\ncontent", + "meta_value 4 | document number 4:\ncontent" ] def test_run_wrong_input_format(self): - embedder = GoogleGenAIDocumentEmbedder(api_key=Secret.from_token("fake-api-key")) + embedder = GoogleGenAIDocumentEmbedder( + api_key=Secret.from_token("fake-api-key")) # wrong formats string_input = "text" @@ -173,7 +175,8 @@ def test_run_wrong_input_format(self): embedder.run(documents=list_integers_input) def test_run_on_empty_list(self): - embedder = GoogleGenAIDocumentEmbedder(api_key=Secret.from_token("fake-api-key")) + embedder = GoogleGenAIDocumentEmbedder( + api_key=Secret.from_token("fake-api-key")) empty_list_input = [] result = embedder.run(documents=empty_list_input) @@ -189,12 +192,14 @@ def test_run_on_empty_list(self): def test_run(self): docs = [ Document(content="I love cheese", meta={"topic": "Cuisine"}), - Document(content="A transformer is a deep learning architecture", meta={"topic": "ML"}), + Document(content="A transformer is a deep learning architecture", meta={ + "topic": "ML"}), ] model = "text-embedding-004" - embedder = GoogleGenAIDocumentEmbedder(model=model, meta_fields_to_embed=["topic"], embedding_separator=" | ") + embedder = GoogleGenAIDocumentEmbedder(model=model, meta_fields_to_embed=[ + "topic"], embedding_separator=" | ") result = embedder.run(documents=docs) documents_with_embeddings = result["documents"] From f8e5f8a893b0441fd6a29859727ed22e694a6224 Mon Sep 17 00:00:00 2001 From: garybadwal Date: Fri, 6 Jun 2025 18:42:31 +0530 Subject: [PATCH 12/14] refactor: improve code formatting for consistency and readability in document embedder and tests --- .../google_genai/document_embedder.py | 15 ++++------ .../tests/test_document_embedder.py | 29 +++++++------------ .../google_genai/tests/test_text_embedder.py | 6 ++-- 3 files changed, 19 insertions(+), 31 deletions(-) diff --git a/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py index f38082b332..3da4aec8a3 100644 --- a/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py +++ b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py @@ -85,8 +85,7 @@ def __init__( self._meta_fields_to_embed = meta_fields_to_embed or [] self._embedding_separator = embedding_separator self._client = genai.Client(api_key=api_key.resolve_value()) - self._config = config if config is not None else { - "task_type": "SEMANTIC_SIMILARITY"} + self._config = config if config is not None else {"task_type": "SEMANTIC_SIMILARITY"} def to_dict(self) -> Dict[str, Any]: """ @@ -134,8 +133,7 @@ def _prepare_texts_to_embed(self, documents: List[Document]) -> Dict[str, str]: ] text_to_embed = ( - self._prefix + self._embedding_separator.join( - [*meta_values_to_embed, doc.content or ""]) + self._suffix + self._prefix + self._embedding_separator.join([*meta_values_to_embed, doc.content or ""]) + self._suffix ) texts_to_embed.append(text_to_embed) @@ -151,11 +149,9 @@ def _embed_batch(self, texts_to_embed: List[str], batch_size: int) -> Tuple[List for batch in tqdm( batched(texts_to_embed, batch_size), disable=not self._progress_bar, desc="Calculating embeddings" ): - args: Dict[str, Any] = {"model": self._model, - "contents": [b[1] for b in batch]} + args: Dict[str, Any] = {"model": self._model, "contents": [b[1] for b in batch]} if self._config: - args["config"] = types.EmbedContentConfig( - **self._config) if self._config else None + args["config"] = types.EmbedContentConfig(**self._config) if self._config else None response = self._client.models.embed_content(**args) @@ -189,8 +185,7 @@ def run(self, documents: List[Document]) -> Dict[str, Union[List[Document], Dict texts_to_embed = self._prepare_texts_to_embed(documents=documents) - embeddings, meta = self._embed_batch( - texts_to_embed=texts_to_embed, batch_size=self._batch_size) + embeddings, meta = self._embed_batch(texts_to_embed=texts_to_embed, batch_size=self._batch_size) for doc, emb in zip(documents, embeddings): doc.embedding = emb diff --git a/integrations/google_genai/tests/test_document_embedder.py b/integrations/google_genai/tests/test_document_embedder.py index 46da7092c2..31e55baf43 100644 --- a/integrations/google_genai/tests/test_document_embedder.py +++ b/integrations/google_genai/tests/test_document_embedder.py @@ -92,8 +92,7 @@ def test_to_dict(self, monkeypatch): data = component.to_dict() assert data == { "type": ( - "haystack_integrations.components.embedders" - ".google_genai.document_embedder.GoogleGenAIDocumentEmbedder" + "haystack_integrations.components.embedders.google_genai.document_embedder.GoogleGenAIDocumentEmbedder" ), "init_parameters": { "model": "text-embedding-004", @@ -124,8 +123,7 @@ def test_to_dict_with_custom_init_parameters(self, monkeypatch): data = component.to_dict() assert data == { "type": ( - "haystack_integrations.components.embedders" - ".google_genai.document_embedder.GoogleGenAIDocumentEmbedder" + "haystack_integrations.components.embedders.google_genai.document_embedder.GoogleGenAIDocumentEmbedder" ), "init_parameters": { "model": "model", @@ -142,8 +140,7 @@ def test_to_dict_with_custom_init_parameters(self, monkeypatch): def test_prepare_texts_to_embed_w_metadata(self): documents = [ - Document(id=f"{i}", content=f"document number {i}:\ncontent", meta={ - "meta_field": f"meta_value {i}"}) + Document(id=f"{i}", content=f"document number {i}:\ncontent", meta={"meta_field": f"meta_value {i}"}) for i in range(5) ] @@ -157,12 +154,11 @@ def test_prepare_texts_to_embed_w_metadata(self): "meta_value 1 | document number 1:\ncontent", "meta_value 2 | document number 2:\ncontent", "meta_value 3 | document number 3:\ncontent", - "meta_value 4 | document number 4:\ncontent" + "meta_value 4 | document number 4:\ncontent", ] def test_run_wrong_input_format(self): - embedder = GoogleGenAIDocumentEmbedder( - api_key=Secret.from_token("fake-api-key")) + embedder = GoogleGenAIDocumentEmbedder(api_key=Secret.from_token("fake-api-key")) # wrong formats string_input = "text" @@ -175,8 +171,7 @@ def test_run_wrong_input_format(self): embedder.run(documents=list_integers_input) def test_run_on_empty_list(self): - embedder = GoogleGenAIDocumentEmbedder( - api_key=Secret.from_token("fake-api-key")) + embedder = GoogleGenAIDocumentEmbedder(api_key=Secret.from_token("fake-api-key")) empty_list_input = [] result = embedder.run(documents=empty_list_input) @@ -192,14 +187,12 @@ def test_run_on_empty_list(self): def test_run(self): docs = [ Document(content="I love cheese", meta={"topic": "Cuisine"}), - Document(content="A transformer is a deep learning architecture", meta={ - "topic": "ML"}), + Document(content="A transformer is a deep learning architecture", meta={"topic": "ML"}), ] model = "text-embedding-004" - embedder = GoogleGenAIDocumentEmbedder(model=model, meta_fields_to_embed=[ - "topic"], embedding_separator=" | ") + embedder = GoogleGenAIDocumentEmbedder(model=model, meta_fields_to_embed=["topic"], embedding_separator=" | ") result = embedder.run(documents=docs) documents_with_embeddings = result["documents"] @@ -211,6 +204,6 @@ def test_run(self): assert len(doc.embedding) == 768 assert all(isinstance(x, float) for x in doc.embedding) - assert ( - "text" in result["meta"]["model"] and "004" in result["meta"]["model"] - ), "The model name does not contain 'text' and '004'" + assert "text" in result["meta"]["model"] and "004" in result["meta"]["model"], ( + "The model name does not contain 'text' and '004'" + ) diff --git a/integrations/google_genai/tests/test_text_embedder.py b/integrations/google_genai/tests/test_text_embedder.py index e5af84f664..bb700527be 100644 --- a/integrations/google_genai/tests/test_text_embedder.py +++ b/integrations/google_genai/tests/test_text_embedder.py @@ -160,6 +160,6 @@ def test_run(self): assert len(result["embedding"]) == 768 assert all(isinstance(x, float) for x in result["embedding"]) - assert ( - "text" in result["meta"]["model"] and "004" in result["meta"]["model"] - ), "The model name does not contain 'text' and '004'" + assert "text" in result["meta"]["model"] and "004" in result["meta"]["model"], ( + "The model name does not contain 'text' and '004'" + ) From 666d0d51f6b2190d8f501538b5f01f4ff1d9f2b2 Mon Sep 17 00:00:00 2001 From: garybadwal Date: Fri, 6 Jun 2025 18:46:29 +0530 Subject: [PATCH 13/14] refactor: update _prepare_texts_to_embed to return a list instead of a dictionary --- .../components/embedders/google_genai/document_embedder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py index 3da4aec8a3..4f143a07e0 100644 --- a/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py +++ b/integrations/google_genai/src/haystack_integrations/components/embedders/google_genai/document_embedder.py @@ -120,7 +120,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "GoogleGenAIDocumentEmbedder": deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"]) return default_from_dict(cls, data) - def _prepare_texts_to_embed(self, documents: List[Document]) -> Dict[str, str]: + def _prepare_texts_to_embed(self, documents: List[Document]) -> List[str]: """ Prepare the texts to embed by concatenating the Document text with the metadata fields to embed. """ From 0b8d687f3c952557a795f28a5211a50b6c8c1d2d Mon Sep 17 00:00:00 2001 From: garybadwal Date: Fri, 6 Jun 2025 19:44:21 +0530 Subject: [PATCH 14/14] feat: add new author to project metadata in pyproject.toml --- integrations/google_genai/pyproject.toml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/integrations/google_genai/pyproject.toml b/integrations/google_genai/pyproject.toml index e3c94cc601..65c8797ea1 100644 --- a/integrations/google_genai/pyproject.toml +++ b/integrations/google_genai/pyproject.toml @@ -10,7 +10,10 @@ readme = "README.md" requires-python = ">=3.9" license = "Apache-2.0" keywords = [] -authors = [{ name = "deepset GmbH", email = "info@deepset.ai" }] +authors = [ + { name = "deepset GmbH", email = "info@deepset.ai" }, + { name = "Gary Badwal", email = "gurpreet071999@gmail.com" } +] classifiers = [ "License :: OSI Approved :: Apache Software License", "Development Status :: 4 - Beta",