From f7b14195533299d94457ee4e0c425d020e9b745e Mon Sep 17 00:00:00 2001 From: "rima.hajou" Date: Fri, 2 May 2025 18:06:55 +0300 Subject: [PATCH] add text embedder --- integrations/google_ai/README.md | 21 + integrations/google_ai/pyproject.toml | 2 +- .../embedders/google_ai/text_embedder.py | 205 ++++++++++ .../tests/embedders/test_text_embedder.py | 379 ++++++++++++++++++ 4 files changed, 606 insertions(+), 1 deletion(-) create mode 100644 integrations/google_ai/src/haystack_integrations/components/embedders/google_ai/text_embedder.py create mode 100644 integrations/google_ai/tests/embedders/test_text_embedder.py diff --git a/integrations/google_ai/README.md b/integrations/google_ai/README.md index 34ddefc79..51de6cfd9 100644 --- a/integrations/google_ai/README.md +++ b/integrations/google_ai/README.md @@ -16,6 +16,27 @@ pip install google-ai-haystack ``` + +## Contributing + +`hatch` is the best way to interact with this project, to install it: + +```sh +pip install hatch +``` + +With `hatch` installed, to run all the tests: + +``` +hatch run test +``` + +To run the linters `ruff` and `mypy`: + +``` +hatch run lint:all +``` + ## License `google-ai-haystack` is distributed under the terms of the [Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) license. diff --git a/integrations/google_ai/pyproject.toml b/integrations/google_ai/pyproject.toml index 1c9d4d4ef..6534145f9 100644 --- a/integrations/google_ai/pyproject.toml +++ b/integrations/google_ai/pyproject.toml @@ -23,7 +23,7 @@ classifiers = [ "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] -dependencies = ["haystack-ai>=2.9.0", "google-generativeai>=0.3.1"] +dependencies = ["haystack-ai>=2.9.0", "google-generativeai>=0.3.1", "google-genai==1.13.0"] [project.urls] Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/google_ai_haystack#readme" diff --git a/integrations/google_ai/src/haystack_integrations/components/embedders/google_ai/text_embedder.py b/integrations/google_ai/src/haystack_integrations/components/embedders/google_ai/text_embedder.py new file mode 100644 index 000000000..25d9a1a7f --- /dev/null +++ b/integrations/google_ai/src/haystack_integrations/components/embedders/google_ai/text_embedder.py @@ -0,0 +1,205 @@ +import logging +import warnings +from typing import Any, Dict, List, Literal, Optional + +from google import genai +from google.genai import types +from haystack.core.component import component +from haystack.core.serialization import default_from_dict, default_to_dict +from haystack.utils import Secret, deserialize_secrets_inplace + +# Load environment variables from .env file, if present +logger = logging.getLogger(__name__) + + +def resolve_secret(secret: Optional[Secret]) -> Optional[str]: + return secret.resolve_value() if secret else None + + +@component +class GoogleAIGeminiTextEmbedder: + """ + A component for embedding text using Google AI models (e.g., Gemini). + + Usage example: + ```python + from haystack_integrations.components.embedders.google_ai import GoogleAIGeminiTextEmbedder + from haystack.utils.auth import Secret + + # Ensure GOOGLE_API_KEY environment variable is set + + embedder = GoogleAIGeminiTextEmbedder(model="gemini-embedding-exp-03-07", task_type="retrieval_document") + embedder.warm_up() + + text = "What is the meaning of life?" + result = embedder.run([text]) + print(result['embedding']) + print(result['meta']) + + # Example with explicit API key + embedder_explicit_key = GoogleAIGeminiTextEmbedder( + api_key=Secret.from_token("YOUR_GOOGLE_API_KEY"), + model="models/embedding-001", + task_type="retrieval_query" + ) + embedder_explicit_key.warm_up() + result_query = embedder_explicit_key.run(["How does quantum physics work?"]) + + ``` + """ + + def __init__( + self, + model: Literal[ + "gemini-embedding-exp-03-07", "text-embedding-004", "embedding-001" + ] = "gemini-embedding-exp-03-07", + api_key: Optional[Secret] = Secret.from_env_var("GEMINI_API_KEY"), # noqa: B008 + task_type: Optional[str] = "retrieval_document", + # Supported task types: "retrieval_query", "retrieval_document", "semantic_similarity", + # "classification", "clustering", "question_answering", "fact_verification" + # See: https://ai.google.dev/docs/embeddings#task_types + title: Optional[str] = None, # Relevant only for task_type="retrieval_document" + output_dimensionality: Optional[int] = None, + ): + """ + Initializes the GoogleAIGeminiTextEmbedder component. + + :param model: The name of the Google AI embedding model to use. + Defaults to "models/embedding-001". + :param api_key: The Google AI API key. It can be explicitly provided or automatically read from the + `GOOGLE_API_KEY` environment variable. + :param task_type: The task type for the embedding model. This helps the model generate embeddings tailored to + the specific use case. Defaults to "retrieval_document". + :param title: An optional title for the text, relevant only when `task_type` is "retrieval_document". + """ + if not api_key: + msg = ( + "GoogleAIGeminiTextEmbedder requires an API key. Set the GOOGLE_API_KEY environment variable " + "or provide it explicitly via the api_key parameter." + ) + raise ValueError(msg) + + self.model = model + self.api_key = api_key + self.task_type = task_type + self.title = title + self.output_dimensionality = output_dimensionality + self._api_key_resolved: Optional[str] = None # Store resolved key after warm_up + + def warm_up(self): + """ + Authenticates with Google AI using the provided API key. + """ + + if self._api_key_resolved is None: + self._api_key_resolved = resolve_secret(self.api_key) + if not self._api_key_resolved: + msg = "Could not resolve Google AI API key." + raise ValueError(msg) + try: + self.client = genai.Client(api_key=self._api_key_resolved) + except Exception as e: + # Catch potential configuration errors early + msg = f"Failed to configure Google AI client: {e}" + raise ValueError(msg) from e + # No specific client object to store for genai, configuration is module-level + + def to_dict(self) -> Dict[str, Any]: + """ + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. + """ + return default_to_dict( + self, + model=self.model, + api_key=self.api_key.to_dict() if self.api_key else None, + task_type=self.task_type, + title=self.title, + output_dimensionality=self.output_dimensionality, + ) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "GoogleAIGeminiTextEmbedder": + """ + Deserializes the component from a dictionary. + + :param data: + Dictionary to deserialize from. + :returns: + Deserialized component. + """ + # Ensure api_key is properly deserialized from Secret + deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"]) + return default_from_dict(cls, data) + + @component.output_types(embedding=List[List[float]], meta=Dict[str, Any]) + def run(self, texts: List[str]): + """ + Embeds a list of texts using the configured Google AI model. + + :param texts: A list of strings to embed. + :returns: A dictionary containing: + - `embedding`: A list of embeddings, where each embedding is a list of floats. + - `meta`: A dictionary with metadata about the operation (e.g., model name, task type). + :raises TypeError: If the input `texts` is not a list of strings. + :raises RuntimeError: If the embedding process fails. + """ + if not isinstance(texts, list) or not all(isinstance(text, str) for text in texts): + msg = "GoogleAIGeminiTextEmbedder expects a List of strings as input." + raise TypeError(msg) + if not texts: + # Return empty list if no texts are provided + return {"embedding": [], "meta": {"model": self.model, "task_type": self.task_type}} + + if self._api_key_resolved is None: + msg = "The component has not been warmed up. Please call warm_up() before running." + raise RuntimeError(msg) + + # Prepare parameters for the API call + configs = types.EmbedContentConfig() + api_params = {"model": self.model, "contents": texts, "configs": configs} + + if self.task_type: + configs.task_type = self.task_type + # Add title only if task_type is retrieval_document and title is provided + if self.task_type == "retrieval_document" and self.title: + configs.title = self.title + elif self.title and self.task_type != "retrieval_document": + warnings.warn( + UserWarning("Warning: Title 'Should Be Ignored' is ignored because task_type is 'retrieval_query'"), + stacklevel=2, + ) + if self.output_dimensionality: + configs.output_dimensionality = self.output_dimensionality + try: + # Make the API call to embed the batch of texts + result = self.client.models.embed_content(**api_params) + + except Exception as e: + # TODO: Add more specific error handling for common API errors if possible + msg = f"Google AI embedding failed: {e}" + raise RuntimeError(msg) from e + + # Extract embeddings - result.embedding should be the list of lists + embeddings = result.get("embedding") # Use .get for safety, returns None if key missing + if embeddings is None: + msg = f"Google AI API response did not contain 'embedding' key. Response: {result}" + raise RuntimeError(msg) + + if not isinstance(embeddings, list) or len(embeddings) != len(texts): + msg = ( + f"Google AI API returned an unexpected number of embeddings " + f"(expected {len(texts)}, got {len(embeddings)}). Response: {result}" + ) + raise RuntimeError(msg) + + # Prepare metadata + meta = {"model": self.model, "task_type": self.task_type} + # Google AI API (genai) doesn't seem to consistently return usage info in embed_content response object yet. + # If it does in the future, it could be added here. Example: + # if usage_metadata := getattr(result, 'usage_metadata', None): # Check if attribute exists + # meta["usage"] = usage_metadata + + return {"embedding": embeddings, "meta": meta} diff --git a/integrations/google_ai/tests/embedders/test_text_embedder.py b/integrations/google_ai/tests/embedders/test_text_embedder.py new file mode 100644 index 000000000..91879cfc7 --- /dev/null +++ b/integrations/google_ai/tests/embedders/test_text_embedder.py @@ -0,0 +1,379 @@ +from unittest.mock import ANY, MagicMock, patch # Use ANY for complex objects like configs + +import pytest +from google.api_core import exceptions as google_exceptions # Import potential exceptions +from google.genai import types # Import types for mocking config +from haystack.utils.auth import Secret + +# Assuming the file is located at: +# haystack_integrations/components/embedders/google_ai/text_embedder.py +# Adjust the path if necessary +from haystack_integrations.components.embedders.google_ai.text_embedder import GoogleAIGeminiTextEmbedder + + +# Mock the genai module before it's imported by the class +# We need to mock the Client and its methods +@pytest.fixture(autouse=True) +def mock_genai_client(): + # Create a mock client instance + mock_client_instance = MagicMock() + # Mock the embed_content method on the client's models attribute + mock_client_instance.models.embed_content = MagicMock() + + # Patch the Client class within the text_embedder module + with patch( + "haystack_integrations.components.embedders.google_ai.text_embedder.genai.Client", + return_value=mock_client_instance, + ) as mock_client_constructor: + yield mock_client_constructor, mock_client_instance # Yield both constructor and instance for assertions + + +@pytest.fixture +def embedder(monkeypatch): + """Creates a default GoogleAIGeminiTextEmbedder instance with a mocked API key.""" + monkeypatch.setenv("GEMINI_API_KEY", "test-api-key") + return GoogleAIGeminiTextEmbedder() + + +# --- Initialization Tests --- + + +def test_init_default_parameters(monkeypatch): + """Test default initialization.""" + monkeypatch.setenv("GEMINI_API_KEY", "test-api-key") + embedder = GoogleAIGeminiTextEmbedder() + assert embedder.model == "gemini-embedding-exp-03-07" # Check default model + assert embedder.api_key == Secret.from_env_var("GEMINI_API_KEY") + assert embedder.task_type == "retrieval_document" + assert embedder.title is None + assert embedder.output_dimensionality is None + assert embedder._api_key_resolved is None + assert not hasattr(embedder, "client") # Client created in warm_up + + +def test_init_explicit_parameters(): + """Test initialization with explicit parameters.""" + embedder = GoogleAIGeminiTextEmbedder( + model="embedding-001", + api_key=Secret.from_token("explicit-key"), + task_type="retrieval_query", + title="My Doc Title", + output_dimensionality=256, + ) + assert embedder.model == "embedding-001" + assert embedder.api_key == Secret.from_token("explicit-key") + assert embedder.task_type == "retrieval_query" + assert embedder.title == "My Doc Title" + assert embedder.output_dimensionality == 256 + assert embedder._api_key_resolved is None + + +def test_init_invalid_model_name(): + """Test that invalid model names are still accepted at init (validation might happen at API call).""" + # The Literal type hint provides static checking, but doesn't prevent runtime assignment + # of other strings. The API call would likely fail later. + embedder = GoogleAIGeminiTextEmbedder( + model="invalid-model-name", + api_key=Secret.from_token("explicit-key"), + ) + assert embedder.model == "invalid-model-name" + + +def test_init_no_api_key_raises(monkeypatch): + """Test that ValueError is raised if API key is not provided and env var is not set.""" + monkeypatch.delenv("GEMINI_API_KEY", raising=False) # Ensure env var is not set + with pytest.raises(ValueError, match="GoogleAIGeminiTextEmbedder requires an API key"): + # Pass api_key=None explicitly to override the default Secret.from_env_var behavior during init check + GoogleAIGeminiTextEmbedder(api_key=None) + + +# --- Warm Up Tests --- + + +def test_warm_up_resolves_key_and_creates_client(embedder, mock_genai_client): + """Test warm_up resolves API key and initializes the client.""" + mock_client_constructor, mock_client_instance = mock_genai_client + + embedder.warm_up() + + # Assert API key was resolved + assert embedder._api_key_resolved == "test-api-key" + # Assert Client constructor was called with the resolved key + # The Client constructor should be called with the actual resolved API key. + mock_client_constructor.assert_called_once_with(api_key="test-api-key") + # Assert the client instance is stored + assert embedder.client == mock_client_instance + + +def test_warm_up_already_warmed_up(embedder, mock_genai_client): + """Test that warm_up doesn't re-initialize if called multiple times.""" + mock_client_constructor, _ = mock_genai_client + + embedder.warm_up() # First call + embedder.warm_up() # Second call + + # Assert Client constructor was called only once + mock_client_constructor.assert_called_once() + + +def test_warm_up_client_instantiation_fails(embedder, mock_genai_client): + """Test warm_up raises ValueError if client creation fails.""" + mock_client_constructor, _ = mock_genai_client + # Configure the mock constructor to raise an exception + mock_client_constructor.side_effect = Exception("Client creation failed") + + with pytest.raises(ValueError, match="Failed to configure Google AI client: Client creation failed"): + embedder.warm_up() + + +# --- Run Tests --- + + +def test_run_without_warm_up_raises(embedder): + """Test run raises RuntimeError if warm_up wasn't called.""" + with pytest.raises(RuntimeError, match="The component has not been warmed up"): + embedder.run(texts=["test text"]) + + +def test_run_invalid_input_type_raises(embedder): + """Test run raises TypeError for invalid input.""" + embedder.warm_up() # Needs warm_up first + with pytest.raises(TypeError, match="GoogleAIGeminiTextEmbedder expects a List of strings"): + embedder.run(texts="not a list") # type: ignore + with pytest.raises(TypeError, match="GoogleAIGeminiTextEmbedder expects a List of strings"): + embedder.run(texts=[1, 2, 3]) # type: ignore + + +def test_run_empty_list(embedder): + """Test run with an empty list returns empty results.""" + embedder.warm_up() + result = embedder.run(texts=[]) + assert result == {"embedding": [], "meta": {"model": embedder.model, "task_type": embedder.task_type}} + + +def test_run_api_call_success(embedder, mock_genai_client): + """Test a successful run call.""" + mock_client_constructor, mock_client_instance = mock_genai_client + texts = ["text 1", "text 2"] + expected_embeddings = [[0.1, 0.2], [0.3, 0.4]] + # Configure the mock embed_content method to return a successful response + mock_client_instance.models.embed_content.return_value = {"embedding": expected_embeddings} + + embedder.warm_up() + result = embedder.run(texts=texts) + + # Assert embed_content was called correctly + mock_client_instance.models.embed_content.assert_called_once_with( + model=embedder.model, + contents=texts, + configs=ANY, # Use ANY because comparing EmbedContentConfig objects is tricky + ) + # Check the properties of the passed config object (captured via ANY) + call_args, call_kwargs = mock_client_instance.models.embed_content.call_args + called_configs = call_kwargs.get("configs") + assert isinstance(called_configs, types.EmbedContentConfig) + assert called_configs.task_type == embedder.task_type + assert called_configs.title is None # Default, no title + assert called_configs.output_dimensionality is None # Default, no dim + + # Assert the result is correct + assert result["embedding"] == expected_embeddings + assert result["meta"] == {"model": embedder.model, "task_type": embedder.task_type} + + +def test_run_with_title_and_output_dim(monkeypatch, mock_genai_client): + """Test run with title and output_dimensionality.""" + mock_client_constructor, mock_client_instance = mock_genai_client + monkeypatch.setenv("GEMINI_API_KEY", "test-api-key") + embedder = GoogleAIGeminiTextEmbedder( + task_type="retrieval_document", # Title only relevant for this task type + title="My Awesome Document", + output_dimensionality=128, + ) + texts = ["content of the document"] + expected_embeddings = [[0.5, 0.6, 0.7]] + mock_client_instance.models.embed_content.return_value = {"embedding": expected_embeddings} + + embedder.warm_up() + result = embedder.run(texts=texts) + + # Assert embed_content was called correctly + mock_client_instance.models.embed_content.assert_called_once_with(model=embedder.model, contents=texts, configs=ANY) + # Check the properties of the passed config object + call_args, call_kwargs = mock_client_instance.models.embed_content.call_args + called_configs = call_kwargs.get("configs") + assert isinstance(called_configs, types.EmbedContentConfig) + assert called_configs.task_type == "retrieval_document" + assert called_configs.title == "My Awesome Document" + assert called_configs.output_dimensionality == 128 + + assert result["embedding"] == expected_embeddings + assert result["meta"] == {"model": embedder.model, "task_type": embedder.task_type} + + +def test_run_with_title_wrong_task_type(monkeypatch, mock_genai_client): + """Test run with title but wrong task_type (should ignore title and warn).""" + mock_client_constructor, mock_client_instance = mock_genai_client + monkeypatch.setenv("GEMINI_API_KEY", "test-api-key") + embedder = GoogleAIGeminiTextEmbedder( + task_type="retrieval_query", title="Should Be Ignored" + ) # Title not relevant here + + texts = ["some query text"] + expected_embeddings = [[0.8, 0.9]] + mock_client_instance.models.embed_content.return_value = {"embedding": expected_embeddings} + + embedder.warm_up() + + with pytest.warns( + UserWarning, match="Warning: Title 'Should Be Ignored' is ignored because task_type is 'retrieval_query'" + ): + result = embedder.run(texts=texts) + + # Assert embed_content was called correctly + mock_client_instance.models.embed_content.assert_called_once_with(model=embedder.model, contents=texts, configs=ANY) + # Check the properties of the passed config object + call_args, call_kwargs = mock_client_instance.models.embed_content.call_args + called_configs = call_kwargs.get("configs") + assert isinstance(called_configs, types.EmbedContentConfig) + assert called_configs.task_type == "retrieval_query" + assert called_configs.title is None # Title should NOT be set on config + assert result["embedding"] == expected_embeddings + assert result["meta"] == {"model": embedder.model, "task_type": embedder.task_type} + + +def test_run_api_error_raises(embedder, mock_genai_client): + """Test run raises RuntimeError if the API call fails.""" + mock_client_constructor, mock_client_instance = mock_genai_client + # Configure the mock embed_content to raise a Google API error + api_error = google_exceptions.InternalServerError("API failed") + mock_client_instance.models.embed_content.side_effect = api_error + + embedder.warm_up() + with pytest.raises(RuntimeError, match=f"Google AI embedding failed: {api_error}"): + embedder.run(texts=["test text"]) + + +def test_run_bad_response_no_embedding_key(embedder, mock_genai_client): + """Test run raises RuntimeError if response lacks 'embedding' key.""" + mock_client_constructor, mock_client_instance = mock_genai_client + mock_client_instance.models.embed_content.return_value = {"wrong_key": []} # Missing 'embedding' + + embedder.warm_up() + with pytest.raises(RuntimeError, match="Google AI API response did not contain 'embedding' key"): + embedder.run(texts=["test text"]) + + +def test_run_bad_response_wrong_embedding_count(embedder, mock_genai_client): + """Test run raises RuntimeError if embedding count doesn't match text count.""" + mock_client_constructor, mock_client_instance = mock_genai_client + texts = ["text 1", "text 2"] + # Return only one embedding for two texts + mock_client_instance.models.embed_content.return_value = {"embedding": [[0.1, 0.2]]} + + embedder.warm_up() + with pytest.raises(RuntimeError, match="Google AI API returned an unexpected number of embeddings"): + embedder.run(texts=texts) + + +# --- Serialization Tests --- + + +def test_to_dict(monkeypatch): + """Test serialization to dictionary.""" + monkeypatch.setenv("GEMINI_API_KEY", "test-api-key") + embedder = GoogleAIGeminiTextEmbedder( + model="embedding-001", task_type="semantic_similarity", title="Another Title", output_dimensionality=512 + ) + data = embedder.to_dict() + + assert data == { + "type": "haystack_integrations.components.embedders.google_ai.text_embedder.GoogleAIGeminiTextEmbedder", + "init_parameters": { + "model": "embedding-001", + "api_key": {"env_vars": ["GEMINI_API_KEY"], "type": "env_var", "strict": True}, # Serialized Secret + "task_type": "semantic_similarity", + "title": "Another Title", + "output_dimensionality": 512, + }, + } + + +def test_from_dict(monkeypatch): + """Test deserialization from dictionary.""" + monkeypatch.setenv("GEMINI_API_KEY", "test-api-key") # Needed for deserializing env_var Secret + data = { + "type": "haystack_integrations.components.embedders.google_ai.text_embedder.GoogleAIGeminiTextEmbedder", + "init_parameters": { + "model": "embedding-001", + "api_key": {"env_vars": ["GEMINI_API_KEY"], "type": "env_var", "strict": True}, + "task_type": "semantic_similarity", + "title": "Another Title", + "output_dimensionality": 512, + }, + } + embedder = GoogleAIGeminiTextEmbedder.from_dict(data) + + assert embedder.model == "embedding-001" + # Check that the Secret object was correctly deserialized + assert isinstance(embedder.api_key, Secret) + assert embedder.api_key._env_vars == ("GEMINI_API_KEY",) + assert embedder.task_type == "semantic_similarity" + assert embedder.title == "Another Title" + assert embedder.output_dimensionality == 512 + + +# --- Integration Test (Serialization + Run) --- + + +def test_integration_to_from_dict_and_run(monkeypatch, mock_genai_client): + """Test serialization, deserialization, and running the component.""" + mock_client_constructor, mock_client_instance = mock_genai_client + monkeypatch.setenv("GEMINI_API_KEY", "orig-key") # Set env var for original instance + + # 1. Create and configure the original instance + original_embedder = GoogleAIGeminiTextEmbedder( + model="text-embedding-004", task_type="retrieval_document", title="Test Doc", output_dimensionality=256 + ) + + # 2. Serialize it + data = original_embedder.to_dict() + + # 3. Deserialize it (ensure env var is still set for Secret resolution) + deserialized_embedder = GoogleAIGeminiTextEmbedder.from_dict(data) + + # 4. Assert deserialized parameters are correct + assert deserialized_embedder.model == "text-embedding-004" + assert deserialized_embedder.task_type == "retrieval_document" + assert deserialized_embedder.title == "Test Doc" + assert deserialized_embedder.output_dimensionality == 256 + # Check Secret deserialization + assert isinstance(deserialized_embedder.api_key, Secret) + assert deserialized_embedder.api_key._env_vars == ("GEMINI_API_KEY",) + + # 5. Warm up the deserialized instance + # (Mock client will be used, but let's ensure the key resolution works) + deserialized_embedder.warm_up() + # Assert client was created (using the mock) with the resolved key + mock_client_constructor.assert_called_with(api_key="orig-key") # Resolved from env var + assert deserialized_embedder.client is not None + + # 6. Prepare mock response and run + texts = ["some document content"] + expected_embeddings = [[0.1, 0.9, 0.2]] + mock_client_instance.models.embed_content.return_value = {"embedding": expected_embeddings} + + result = deserialized_embedder.run(texts=texts) + + # 7. Assert the run call was successful and used correct parameters + mock_client_instance.models.embed_content.assert_called_once_with( + model="text-embedding-004", contents=texts, configs=ANY + ) + call_args, call_kwargs = mock_client_instance.models.embed_content.call_args + called_configs = call_kwargs.get("configs") + assert called_configs.task_type == "retrieval_document" + assert called_configs.title == "Test Doc" + assert called_configs.output_dimensionality == 256 + + assert result["embedding"] == expected_embeddings + assert result["meta"] == {"model": "text-embedding-004", "task_type": "retrieval_document"}