diff --git a/experimental/pyproject.toml b/experimental/pyproject.toml index 503eb1ae7..5cc852f5f 100644 --- a/experimental/pyproject.toml +++ b/experimental/pyproject.toml @@ -7,7 +7,8 @@ name = "ragas_experimental" description = "Experimental extensions for Ragas" requires-python = ">=3.9" authors = [ - {name = "jjmachan", email = "jamesjithin97@gmail.com"} + {name = "jjmachan", email = "jithin@explodinggradients.com"}, + {name = "ikka", email = "shahul@explodinggradients.com"} ] license = {text = "Apache-2.0"} keywords = ["jupyter", "notebook", "python", "evaluation", "llm", "ragas"] @@ -22,7 +23,7 @@ classifiers = [ ] dependencies = [ "fastcore", - "tqdm", + "tqdm", "langfuse", "instructor", "pydantic", @@ -40,8 +41,9 @@ readme = "README.md" all = ["pandas"] [project.entry-points."ragas.backends"] -local_csv = "ragas_experimental.project.backends.local_csv:LocalCSVProjectBackend" -platform = "ragas_experimental.project.backends.platform:PlatformProjectBackend" +"local/csv" = "ragas_experimental.project.backends.local_csv:LocalCSVProjectBackend" +"ragas/app" = "ragas_experimental.project.backends.ragas_app:RagasAppProjectBackend" +"box/csv" = "ragas_experimental.project.backends.box_csv:BoxCSVProjectBackend" [tool.setuptools.packages.find] include = ["ragas_experimental*"] @@ -58,6 +60,11 @@ dev = [ "pytest-mock>=3.10.0", "black", "ruff", + "vcrpy", + "pytest-vcr", +] +box = [ + "boxsdk[jwt]", ] test = [] diff --git a/experimental/ragas_experimental/backends/README.md b/experimental/ragas_experimental/backends/README.md new file mode 100644 index 000000000..7fdf815f0 --- /dev/null +++ b/experimental/ragas_experimental/backends/README.md @@ -0,0 +1,197 @@ +# Ragas Backends + +Backends store your project data (datasets/experiments) in different places: local files, databases, cloud APIs. You implement 2 classes: `ProjectBackend` (manages projects) and `DataTableBackend` (handles data operations). + +``` +Project → ProjectBackend → DataTableBackend → Storage +``` + +## Current State + +**Available Backends:** +- `local/csv` - Local CSV files +- `ragas/app` - Ragas cloud platform +- `box/csv` - Box cloud storage + +**Import Path:** `ragas_experimental.backends` + +**Core Classes:** +- `ProjectBackend` - Project-level operations (create datasets/experiments) +- `DataTableBackend` - Data operations (read/write entries) +- `DataTable` - Base class for `Dataset` and `Experiment` + +## Learning Roadmap + +Follow this path to build your own backend: + +``` +□ 1. Understand: Read local_csv.py (simplest example) +□ 2. Explore: Study base.py abstract methods +□ 3. Practice: Modify LocalCSVBackend to add logging +□ 4. Build: Create your own backend following the pattern +□ 5. Advanced: Study ragas_app.py for API/async patterns +□ 6. 
Package: Create plugin (see Plugin Development)
+```
+
+## Quick Usage
+
+**Using existing backends:**
+```python
+from ragas_experimental.project import Project
+
+# Local CSV
+project = Project.create("my_project", "local/csv", root_dir="./data")
+
+# Ragas platform
+project = Project.create("my_project", "ragas/app", api_key="your_key")
+```
+
+**Basic backend structure:**
+```python
+from ragas_experimental.backends.base import ProjectBackend, DataTableBackend
+
+class MyProjectBackend(ProjectBackend):
+    def create_dataset(self, name, model):
+        # Create storage space for dataset
+        pass
+
+class MyDataTableBackend(DataTableBackend):
+    def load_entries(self, model_class):
+        # Load entries from storage
+        pass
+```
+
+## Essential Methods
+
+**ProjectBackend** (project management):
+- `create_dataset()` / `create_experiment()` - Create storage
+- `get_dataset_backend()` / `get_experiment_backend()` - Get data handler
+- `list_datasets()` / `list_experiments()` - List existing ones
+
+**DataTableBackend** (data operations):
+- `initialize()` - Set up with dataset instance
+- `load_entries()` - Load all entries
+- `append_entry()` - Add new entry
+- `update_entry()` / `delete_entry()` - Modify entries
+
+See `base.py` for the complete interface.
+
+## Learn from Examples
+
+**Start here:**
+- `local_csv.py` - File-based storage, easiest to understand
+- `config.py` - Configuration patterns
+
+**Advanced patterns:**
+- `ragas_app.py` - API calls, async, error handling
+- `box_csv.py` - Cloud storage, authentication
+- `registry.py` - Backend discovery system
+
+## Quick Development
+
+**1. Copy template:**
+```bash
+cp local_csv.py my_backend.py
+```
+
+**2. Replace the CSV logic with your storage**
+
+**3. Register backend:**
+```python
+# In registry.py, inside _register_builtin_backends()
+from .my_backend import MyProjectBackend
+self.register_backend("my_storage", MyProjectBackend)
+```
+
+**4. Test:**
+```python
+project = Project.create("test", "my_storage")
+```
+
+## Plugin Development
+
+**Create separate package:**
+```
+my-backend-plugin/
+├── pyproject.toml
+├── src/my_backend/
+│   ├── __init__.py
+│   └── backend.py
+└── tests/
+```
+
+**Entry point in pyproject.toml:**
+```toml
+[project.entry-points."ragas.backends"]
+my_storage = "my_backend.backend:MyProjectBackend"
+```
+
+**Install and use:**
+```bash
+pip install my-backend-plugin
+python -c "from ragas_experimental.project import Project; Project.create('test', 'my_storage')"
+```
+
+## Common Patterns
+
+**ID Generation:**
+```python
+from .utils import create_nano_id
+dataset_id = create_nano_id()
+```
+
+**Error Handling:**
+```python
+try:
+    rows = read_rows()  # your storage operation ("read_rows" is illustrative)
+except ConnectionError:
+    rows = []  # degrade gracefully instead of crashing
+```
+
+**Testing:**
+```python
+def test_my_backend():
+    backend = MyProjectBackend()
+    backend.initialize("test_project")
+    dataset_id = backend.create_dataset("test", MyModel)  # MyModel: your pydantic model
+    assert dataset_id
+```
+
+## Troubleshooting
+
+**Backend not found?** Check the registry with:
+```python
+from ragas_experimental.backends import list_backends
+print(list_backends())
+```
+
+**Entries not loading?** Verify:
+- `initialize()` is called before other methods
+- `load_entries()` returns a list of model instances
+- Entry `_row_id` attributes are set correctly
+
+**Need help?** Study the existing backends - they handle most common patterns.
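+
+**How plugins are discovered (sketch):** backends registered under the
+`ragas.backends` entry-point group are looked up by name at runtime. The
+snippet below is a simplified sketch of entry-point loading with
+`importlib.metadata`; the actual logic lives in `registry.py` and may differ:
+
+```python
+from importlib.metadata import entry_points
+
+def discover_backends():
+    eps = entry_points()
+    if hasattr(eps, "select"):  # Python 3.10+
+        group = eps.select(group="ragas.backends")
+    else:  # Python 3.9 returns a dict-like mapping
+        group = eps.get("ragas.backends", [])
+    # Map entry-point name (e.g. "local/csv") to its backend class
+    return {ep.name: ep.load() for ep in group}
+```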
+ +## Configuration Examples + +**Local CSV:** +```python +from ragas_experimental.backends import LocalCSVConfig +config = LocalCSVConfig(root_dir="/path/to/data") +``` + +**Ragas App:** +```python +from ragas_experimental.backends import RagasAppConfig +config = RagasAppConfig(api_key="key", api_url="https://api.ragas.io") +``` + +**Box CSV:** +```python +from ragas_experimental.backends import BoxCSVConfig +config = BoxCSVConfig(client=authenticated_box_client) +``` + +--- + +**Next Steps:** Start with modifying `local_csv.py`, then build your own following the same patterns. diff --git a/experimental/ragas_experimental/backends/__init__.py b/experimental/ragas_experimental/backends/__init__.py index e69de29bb..72aa34137 100644 --- a/experimental/ragas_experimental/backends/__init__.py +++ b/experimental/ragas_experimental/backends/__init__.py @@ -0,0 +1,59 @@ +"""Backend factory and exports for all backends.""" + +from .base import DataTableBackend, ProjectBackend + +# Import concrete backends +from .local_csv import LocalCSVProjectBackend +from .ragas_app import RagasAppProjectBackend + +# Optional backends with dependencies +try: + from .box_csv import BoxCSVProjectBackend +except ImportError: + BoxCSVProjectBackend = None + +# Import configuration classes +from .config import BackendConfig, LocalCSVConfig, RagasAppConfig + +try: + from .config import BoxCSVConfig +except ImportError: + BoxCSVConfig = None + +from .registry import ( + BackendRegistry, + create_project_backend, + get_backend_info, + get_registry, + list_backend_info, + list_backends, + print_available_backends, + register_backend, +) + +# Import API client +from .ragas_api_client import RagasApiClient + +__all__ = [ + "ProjectBackend", + "DataTableBackend", + "BackendRegistry", + "get_registry", + "register_backend", + "list_backends", + "get_backend_info", + "list_backend_info", + "print_available_backends", + "create_project_backend", + # Configuration classes + "BackendConfig", + "LocalCSVConfig", + "RagasAppConfig", + "BoxCSVConfig", + # Concrete backends + "LocalCSVProjectBackend", + "RagasAppProjectBackend", + "BoxCSVProjectBackend", + # API client + "RagasApiClient", +] diff --git a/experimental/ragas_experimental/backends/base.py b/experimental/ragas_experimental/backends/base.py new file mode 100644 index 000000000..fd5433840 --- /dev/null +++ b/experimental/ragas_experimental/backends/base.py @@ -0,0 +1,113 @@ +"""Base classes for project and dataset backends.""" + +import typing as t +from abc import ABC, abstractmethod + +from ragas_experimental.model.pydantic_model import ( + ExtendedPydanticBaseModel as BaseModel, +) + + +class DataTableBackend(ABC): + """Abstract base class for datatable backends. + + All datatable storage backends must implement these methods. + Handles both datasets and experiments. 
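+
+    Implementations are bound to a concrete dataset via initialize(), which
+    must be called before any of the data methods below are used.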
+ """ + + @abstractmethod + def initialize(self, dataset: t.Any) -> None: + """Initialize the backend with dataset information""" + pass + + @abstractmethod + def get_column_mapping(self, model: t.Type[BaseModel]) -> t.Dict[str, str]: + """Get mapping between model fields and backend columns""" + pass + + @abstractmethod + def load_entries(self, model_class) -> t.List[t.Any]: + """Load all entries from storage""" + pass + + @abstractmethod + def append_entry(self, entry) -> str: + """Add a new entry to storage and return its ID""" + pass + + @abstractmethod + def update_entry(self, entry) -> bool: + """Update an existing entry in storage""" + pass + + @abstractmethod + def delete_entry(self, entry_id) -> bool: + """Delete an entry from storage""" + pass + + @abstractmethod + def get_entry_by_field( + self, field_name: str, field_value: t.Any, model_class + ) -> t.Optional[t.Any]: + """Get an entry by field value""" + pass + + +class ProjectBackend(ABC): + """Abstract base class for project backends. + + Handles project-level operations like creating/listing datasets and experiments. + """ + + @abstractmethod + def initialize(self, project_id: str, **kwargs) -> None: + """Initialize the backend with project information""" + pass + + @abstractmethod + def create_dataset(self, name: str, model: t.Type[BaseModel]) -> str: + """Create a new dataset and return its ID""" + pass + + @abstractmethod + def create_experiment(self, name: str, model: t.Type[BaseModel]) -> str: + """Create a new experiment and return its ID""" + pass + + @abstractmethod + def list_datasets(self) -> t.List[t.Dict]: + """List all datasets in the project""" + pass + + @abstractmethod + def list_experiments(self) -> t.List[t.Dict]: + """List all experiments in the project""" + pass + + @abstractmethod + def get_dataset_backend( + self, dataset_id: str, name: str, model: t.Type[BaseModel] + ) -> DataTableBackend: + """Get a DataTableBackend instance for a specific dataset""" + pass + + @abstractmethod + def get_experiment_backend( + self, experiment_id: str, name: str, model: t.Type[BaseModel] + ) -> DataTableBackend: + """Get a DataTableBackend instance for a specific experiment""" + pass + + @abstractmethod + def get_dataset_by_name( + self, name: str, model: t.Type[BaseModel] + ) -> t.Tuple[str, DataTableBackend]: + """Get dataset ID and backend by name. Returns (dataset_id, backend)""" + pass + + @abstractmethod + def get_experiment_by_name( + self, name: str, model: t.Type[BaseModel] + ) -> t.Tuple[str, DataTableBackend]: + """Get experiment ID and backend by name. 
Returns (experiment_id, backend)""" + pass diff --git a/experimental/ragas_experimental/backends/box_csv.py b/experimental/ragas_experimental/backends/box_csv.py new file mode 100644 index 000000000..87bc52c86 --- /dev/null +++ b/experimental/ragas_experimental/backends/box_csv.py @@ -0,0 +1,691 @@ +"""Box CSV backend implementation for projects and datasets.""" + +import csv +import io +import json +import logging +import os +import typing as t +import uuid +from typing import TYPE_CHECKING, Optional, get_origin + +from ragas_experimental.model.pydantic_model import ( + ExtendedPydanticBaseModel as BaseModel, +) + +from .utils import create_nano_id +from .base import DataTableBackend, ProjectBackend +from .config import BoxCSVConfig, BoxClientProtocol, BoxFolderProtocol, BoxFileProtocol + +logger = logging.getLogger(__name__) + +# Type-only imports for static analysis +if TYPE_CHECKING: + from boxsdk import BoxAPIException, Client + from boxsdk.object.folder import Folder + from boxsdk.object.file import File +else: + # Runtime imports with fallbacks + try: + from boxsdk import BoxAPIException, Client + from boxsdk.object.folder import Folder + from boxsdk.object.file import File + except ImportError: + logger.warning( + "Box SDK not available. Install with: pip install 'ragas_experimental[box]' to use Box backend." + ) + # Create placeholder types for runtime + from typing import Any + + BoxAPIException = Any + Client = Any + Folder = Any + File = Any + + +class BoxCSVDataTableBackend(DataTableBackend): + """Box CSV implementation of DataTableBackend.""" + + def __init__( + self, + box_client: BoxClientProtocol, + project_folder_id: str, + dataset_id: str, + dataset_name: str, + datatable_type: t.Literal["datasets", "experiments"], + ): + self.box_client = box_client + self.project_folder_id = project_folder_id + self.dataset_id = dataset_id + self.dataset_name = dataset_name + self.datatable_type = datatable_type + self.dataset = None + self._csv_file: Optional[BoxFileProtocol] = None + + def _is_json_serializable_type(self, field_type): + """Check if field needs JSON serialization.""" + origin = get_origin(field_type) + return origin in (list, dict) or field_type in (list, dict) + + def initialize(self, dataset: t.Any) -> None: + """Initialize the backend with dataset information.""" + self.dataset = dataset + self._ensure_csv_exists() + + def _ensure_csv_exists(self): + """Create the CSV file on Box if it doesn't exist.""" + try: + # Get or create the datatable folder (datasets/experiments) + datatable_folder = self._get_or_create_folder( + self.project_folder_id, self.datatable_type + ) + + # Check if CSV file already exists + csv_filename = f"{self.dataset_name}.csv" + self._csv_file = self._get_file_in_folder(datatable_folder, csv_filename) + + if self._csv_file is None: + # Create CSV with headers + if self.dataset is None: + raise ValueError( + "Dataset must be initialized before creating CSV headers" + ) + field_names = ["_row_id"] + list( + self.dataset.model.__annotations__.keys() + ) + + # Create CSV content with headers + csv_content = io.StringIO() + writer = csv.writer(csv_content) + writer.writerow(field_names) + csv_content.seek(0) + + # Upload to Box + self._csv_file = datatable_folder.upload_stream( + csv_content, csv_filename + ) + logger.info(f"Created CSV file on Box: {csv_filename}") + + except Exception as e: + logger.error(f"Error ensuring CSV exists on Box: {e}") + raise + + def _get_or_create_folder( + self, parent_folder_id: str, folder_name: str + ) -> 
BoxFolderProtocol: + """Get existing folder or create new one.""" + try: + parent_folder = self.box_client.folder(parent_folder_id) + + # Check if folder already exists + for item in parent_folder.get_items(): + if item.type == "folder" and item.name == folder_name: + return self.box_client.folder(item.id) + + # Create new folder + new_folder = parent_folder.create_subfolder(folder_name) + logger.info(f"Created folder on Box: {folder_name}") + return new_folder + + except Exception as e: + logger.error(f"Error creating/getting folder {folder_name}: {e}") + raise + + def _get_file_in_folder( + self, folder: BoxFolderProtocol, filename: str + ) -> Optional[BoxFileProtocol]: + """Get file by name in folder, return None if not found.""" + try: + for item in folder.get_items(): + if item.type == "file" and item.name == filename: + return self.box_client.file(item.id) + return None + except Exception as e: + logger.error(f"Error searching for file {filename}: {e}") + return None + + def get_column_mapping(self, model) -> t.Dict: + """Get mapping between model fields and CSV columns.""" + return model.model_fields + + def load_entries(self, model_class): + """Load all entries from the CSV file on Box.""" + if self._csv_file is None: + return [] + + try: + # Download CSV content + csv_content = self._csv_file.content().decode("utf-8") + csv_reader = csv.DictReader(io.StringIO(csv_content)) + + entries = [] + for row in csv_reader: + try: + # Extract row_id and remove from model data + row_id = row.get("_row_id", str(uuid.uuid4())) + + # Create a copy without _row_id for model instantiation + model_data = {k: v for k, v in row.items() if k != "_row_id"} + + # Convert types as needed + typed_row = {} + for field, value in model_data.items(): + if field in model_class.model_fields: + field_type = model_class.model_fields[field].annotation + + try: + if not value: # Handle empty strings + typed_row[field] = None + elif self._is_json_serializable_type(field_type): + # Deserialize JSON for lists/dicts + typed_row[field] = json.loads(value) + elif field_type is int: + typed_row[field] = int(value) + elif field_type is float: + typed_row[field] = float(value) + elif field_type is bool: + typed_row[field] = value.lower() in ( + "true", + "t", + "yes", + "y", + "1", + ) + else: + typed_row[field] = value + except (json.JSONDecodeError, ValueError) as e: + logger.warning( + f"Failed to convert field {field}='{value}' to {field_type}: {e}" + ) + typed_row[field] = value # Fallback to string + + # Create model instance + entry = model_class(**typed_row) + + # Set the row ID from CSV + entry._row_id = row_id + + entries.append(entry) + except Exception as e: + logger.error(f"Error loading row from CSV: {e}") + + return entries + + except Exception as e: + logger.error(f"Error loading entries from Box CSV: {e}") + return [] + + def append_entry(self, entry) -> str: + """Add a new entry to the CSV file on Box and return a generated ID.""" + try: + # Load existing entries + existing_entries = self.load_entries(entry.__class__) + + # Generate a row ID if needed + row_id = getattr(entry, "_row_id", None) or str(uuid.uuid4()) + entry._row_id = row_id + + # Add new entry + existing_entries.append(entry) + + # Write all entries back to Box + self._write_entries_to_box(existing_entries) + + return row_id + + except Exception as e: + logger.error(f"Error appending entry to Box CSV: {e}") + raise + + def update_entry(self, entry) -> bool: + """Update an existing entry in the CSV file on Box.""" + try: + # Load existing 
entries + existing_entries = self.load_entries(entry.__class__) + + # Find and update the entry + updated = False + for i, e in enumerate(existing_entries): + if ( + hasattr(e, "_row_id") + and hasattr(entry, "_row_id") + and e._row_id == entry._row_id + ): + existing_entries[i] = entry + updated = True + break + + # If entry wasn't found, append it + if not updated: + existing_entries.append(entry) + + # Write all entries back to Box + self._write_entries_to_box(existing_entries) + + return True + + except Exception as e: + logger.error(f"Error updating entry in Box CSV: {e}") + return False + + def delete_entry(self, entry_id) -> bool: + """Delete an entry from the CSV file on Box.""" + try: + if self.dataset is None: + raise ValueError("Dataset must be initialized") + + # Filter out the entry to delete + entries_to_keep = [] + for e in self.dataset._entries: + if not (hasattr(e, "_row_id") and e._row_id == entry_id): + entries_to_keep.append(e) + + # Write remaining entries back to Box + self._write_entries_to_box(entries_to_keep) + + return True + + except Exception as e: + logger.error(f"Error deleting entry from Box CSV: {e}") + return False + + def _write_entries_to_box(self, entries): + """Write all entries to the CSV file on Box.""" + if self._csv_file is None: + raise ValueError("CSV file not initialized") + + try: + # Create CSV content + csv_content = io.StringIO() + + if not entries: + # If no entries, just create headers + if self.dataset is None: + raise ValueError("Dataset must be initialized") + field_names = ["_row_id"] + list(self.dataset.model.model_fields.keys()) + writer = csv.DictWriter(csv_content, fieldnames=field_names) + writer.writeheader() + else: + # Get field names including _row_id + field_names = ["_row_id"] + list( + entries[0].__class__.model_fields.keys() + ) + writer = csv.DictWriter(csv_content, fieldnames=field_names) + writer.writeheader() + + for entry in entries: + # Create a dict with model data + row_id, handling JSON serialization + entry_dict = {} + for field_name, field_value in entry.model_dump().items(): + field_type = entry.__class__.model_fields[field_name].annotation + if self._is_json_serializable_type(field_type): + entry_dict[field_name] = ( + json.dumps(field_value) + if field_value is not None + else "" + ) + else: + entry_dict[field_name] = field_value + entry_dict["_row_id"] = getattr(entry, "_row_id", str(uuid.uuid4())) + writer.writerow(entry_dict) + + csv_content.seek(0) + + # Upload new version to Box + self._csv_file.update_contents_with_stream(csv_content) + logger.debug(f"Updated CSV file on Box with {len(entries)} entries") + + except Exception as e: + logger.error(f"Error writing entries to Box CSV: {e}") + raise + + def get_entry_by_field( + self, field_name, field_value, model_class + ) -> t.Optional[t.Any]: + """Get an entry by field value.""" + entries = self.load_entries(model_class) + + for entry in entries: + if hasattr(entry, field_name) and getattr(entry, field_name) == field_value: + return entry + + return None + + +class BoxCSVProjectBackend(ProjectBackend): + """Box CSV implementation of ProjectBackend.""" + + def __init__(self, config: BoxCSVConfig): + """Initialize Box backend with authenticated client. + + Args: + config: BoxCSVConfig object containing authenticated Box client. 
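+
+        Example (illustrative; assumes a Box JWT config file at the given path):
+            backend = BoxCSVProjectBackend.from_jwt_file("box_config.json")
+            backend.initialize("my_project")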
+ """ + self.config = config + self.box_client: BoxClientProtocol = config.client + self.project_id: Optional[str] = None + self.project_folder: Optional[BoxFolderProtocol] = None + + @classmethod + def from_jwt_file( + cls, config_file_path: str, root_folder_id: str = "0" + ) -> "BoxCSVProjectBackend": + """Convenience constructor for JWT authentication from config file. + + Args: + config_file_path: Path to Box JWT configuration file + root_folder_id: Box folder ID to use as root (defaults to "0") + + Returns: + BoxCSVProjectBackend instance with authenticated client + """ + try: + # Import here to avoid dependency issues if not available + from boxsdk.auth.jwt_auth import JWTAuth + from boxsdk import Client + except ImportError: + raise ImportError( + "Box SDK not available. Install with: pip install 'ragas_experimental[box]'" + ) + + auth = JWTAuth.from_settings_file(config_file_path) + client = Client(auth) + config = BoxCSVConfig(client=client, root_folder_id=root_folder_id) + return cls(config) + + @classmethod + def from_developer_token( + cls, token: str, root_folder_id: str = "0" + ) -> "BoxCSVProjectBackend": + """Convenience constructor for developer token (testing only). + + Args: + token: Box developer token + root_folder_id: Box folder ID to use as root (defaults to "0") + + Returns: + BoxCSVProjectBackend instance with authenticated client + """ + try: + # Import here to avoid dependency issues if not available + from boxsdk.auth.oauth2 import OAuth2 + from boxsdk import Client + except ImportError: + raise ImportError( + "Box SDK not available. Install with: pip install 'ragas_experimental[box]'" + ) + + oauth = OAuth2( + client_id="not_needed_for_dev_token", + client_secret="not_needed_for_dev_token", + access_token=token, + ) + client = Client(oauth) + config = BoxCSVConfig(client=client, root_folder_id=root_folder_id) + return cls(config) + + @classmethod + def from_oauth2( + cls, + client_id: str, + client_secret: str, + access_token: str, + refresh_token: Optional[str] = None, + root_folder_id: str = "0", + ) -> "BoxCSVProjectBackend": + """Convenience constructor for OAuth2 authentication. + + Args: + client_id: Box application client ID + client_secret: Box application client secret + access_token: User access token + refresh_token: Optional refresh token + root_folder_id: Box folder ID to use as root (defaults to "0") + + Returns: + BoxCSVProjectBackend instance with authenticated client + """ + try: + # Import here to avoid dependency issues if not available + from boxsdk.auth.oauth2 import OAuth2 + from boxsdk import Client + except ImportError: + raise ImportError( + "Box SDK not available. 
Install with: pip install 'ragas_experimental[box]'" + ) + + oauth = OAuth2( + client_id=client_id, + client_secret=client_secret, + access_token=access_token, + refresh_token=refresh_token, + ) + client = Client(oauth) + config = BoxCSVConfig(client=client, root_folder_id=root_folder_id) + return cls(config) + + def initialize(self, project_id: str, **kwargs): + """Initialize the backend with project information.""" + self.project_id = project_id + + # Get or create project folder + root_folder_id = self.config.root_folder_id + root_folder = self.box_client.folder(root_folder_id) + + # Check if project folder exists + project_folder = None + for item in root_folder.get_items(): + if item.type == "folder" and item.name == project_id: + project_folder = self.box_client.folder(item.id) + break + + # Create project folder if it doesn't exist + if project_folder is None: + project_folder = root_folder.create_subfolder(project_id) + logger.info(f"Created project folder on Box: {project_id}") + + self.project_folder = project_folder + self._create_project_structure() + + def _create_project_structure(self): + """Create the folder structure for the project on Box.""" + if self.project_folder is None: + raise ValueError("Project folder not initialized") + + # Create datasets and experiments folders + for folder_name in ["datasets", "experiments"]: + folder_exists = False + for item in self.project_folder.get_items(): + if item.type == "folder" and item.name == folder_name: + folder_exists = True + break + + if not folder_exists: + self.project_folder.create_subfolder(folder_name) + logger.info(f"Created {folder_name} folder on Box") + + def create_dataset(self, name: str, model: t.Type[BaseModel]) -> str: + """Create a new dataset and return its ID.""" + dataset_id = create_nano_id() + return dataset_id + + def create_experiment(self, name: str, model: t.Type[BaseModel]) -> str: + """Create a new experiment and return its ID.""" + experiment_id = create_nano_id() + return experiment_id + + def list_datasets(self) -> t.List[t.Dict]: + """List all datasets in the project.""" + if self.project_folder is None: + return [] + + try: + datasets = [] + + # Find datasets folder + datasets_folder = None + for item in self.project_folder.get_items(): + if item.type == "folder" and item.name == "datasets": + datasets_folder = self.box_client.folder(item.id) + break + + if datasets_folder is None: + return [] + + # List CSV files in datasets folder + for item in datasets_folder.get_items(): + if item.type == "file" and item.name.endswith(".csv"): + name = os.path.splitext(item.name)[0] + datasets.append( + { + "id": create_nano_id(), + "name": name, + } + ) + + return datasets + + except Exception as e: + logger.error(f"Error listing datasets from Box: {e}") + return [] + + def list_experiments(self) -> t.List[t.Dict]: + """List all experiments in the project.""" + if self.project_folder is None: + return [] + + try: + experiments = [] + + # Find experiments folder + experiments_folder = None + for item in self.project_folder.get_items(): + if item.type == "folder" and item.name == "experiments": + experiments_folder = self.box_client.folder(item.id) + break + + if experiments_folder is None: + return [] + + # List CSV files in experiments folder + for item in experiments_folder.get_items(): + if item.type == "file" and item.name.endswith(".csv"): + name = os.path.splitext(item.name)[0] + experiments.append( + { + "id": create_nano_id(), + "name": name, + } + ) + + return experiments + + except Exception as e: 
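+            # Non-fatal: log the failure and fall through to an empty result.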
+ logger.error(f"Error listing experiments from Box: {e}") + return [] + + def get_dataset_backend( + self, dataset_id: str, name: str, model: t.Type[BaseModel] + ) -> DataTableBackend: + """Get a DataTableBackend instance for a specific dataset.""" + if self.project_folder is None: + raise ValueError("Backend not properly initialized") + + return BoxCSVDataTableBackend( + box_client=self.box_client, + project_folder_id=self.project_folder.object_id, + dataset_id=dataset_id, + dataset_name=name, + datatable_type="datasets", + ) + + def get_experiment_backend( + self, experiment_id: str, name: str, model: t.Type[BaseModel] + ) -> DataTableBackend: + """Get a DataTableBackend instance for a specific experiment.""" + if self.project_folder is None: + raise ValueError("Backend not properly initialized") + + return BoxCSVDataTableBackend( + box_client=self.box_client, + project_folder_id=self.project_folder.object_id, + dataset_id=experiment_id, + dataset_name=name, + datatable_type="experiments", + ) + + def get_dataset_by_name( + self, name: str, model: t.Type[BaseModel] + ) -> t.Tuple[str, DataTableBackend]: + """Get dataset ID and backend by name.""" + if self.project_folder is None: + raise ValueError("Backend not initialized") + + try: + # Check if dataset exists + datasets_folder = None + for item in self.project_folder.get_items(): + if item.type == "folder" and item.name == "datasets": + datasets_folder = self.box_client.folder(item.id) + break + + if datasets_folder is None: + raise ValueError("Datasets folder not found") + + # Look for CSV file + csv_exists = False + for item in datasets_folder.get_items(): + if item.type == "file" and item.name == f"{name}.csv": + csv_exists = True + break + + if not csv_exists: + raise ValueError(f"Dataset '{name}' does not exist") + + # Create dataset instance + dataset_id = create_nano_id() + backend = self.get_dataset_backend(dataset_id, name, model) + + return dataset_id, backend + + except Exception as e: + logger.error(f"Error getting dataset by name: {e}") + raise + + def get_experiment_by_name( + self, name: str, model: t.Type[BaseModel] + ) -> t.Tuple[str, DataTableBackend]: + """Get experiment ID and backend by name.""" + if self.project_folder is None: + raise ValueError("Backend not initialized") + + try: + # Check if experiment exists + experiments_folder = None + for item in self.project_folder.get_items(): + if item.type == "folder" and item.name == "experiments": + experiments_folder = self.box_client.folder(item.id) + break + + if experiments_folder is None: + raise ValueError("Experiments folder not found") + + # Look for CSV file + csv_exists = False + for item in experiments_folder.get_items(): + if item.type == "file" and item.name == f"{name}.csv": + csv_exists = True + break + + if not csv_exists: + raise ValueError(f"Experiment '{name}' does not exist") + + # Create experiment instance + experiment_id = create_nano_id() + backend = self.get_experiment_backend(experiment_id, name, model) + + return experiment_id, backend + + except Exception as e: + logger.error(f"Error getting experiment by name: {e}") + raise diff --git a/experimental/ragas_experimental/backends/config.py b/experimental/ragas_experimental/backends/config.py new file mode 100644 index 000000000..dd0981f14 --- /dev/null +++ b/experimental/ragas_experimental/backends/config.py @@ -0,0 +1,161 @@ +"""Configuration classes for all backend types.""" + +from abc import ABC +from typing import ( + Optional, + TYPE_CHECKING, + Any, + Protocol, + runtime_checkable, + 
Annotated, +) +from ragas_experimental.model.pydantic_model import ( + ExtendedPydanticBaseModel as BaseModel, +) +from pydantic import ConfigDict, PlainValidator + +# Type-only imports for Box SDK +if TYPE_CHECKING: + from boxsdk import Client +else: + try: + from boxsdk import Client + except ImportError: + Client = None + + +# Protocol definitions for Box SDK interfaces +@runtime_checkable +class BoxUserProtocol(Protocol): + """Protocol for Box user objects.""" + + name: str + + +@runtime_checkable +class BoxUserManagerProtocol(Protocol): + """Protocol for Box user manager objects.""" + + def get(self) -> BoxUserProtocol: ... + + +@runtime_checkable +class BoxItemProtocol(Protocol): + """Protocol for Box items (files/folders) returned by get_items().""" + + type: str # "file" or "folder" + name: str + id: str + + +@runtime_checkable +class BoxFileProtocol(Protocol): + """Protocol for Box file objects.""" + + def content(self) -> bytes: ... + def update_contents_with_stream(self, stream) -> None: ... + + +@runtime_checkable +class BoxFolderProtocol(Protocol): + """Protocol for Box folder objects.""" + + object_id: str + + def get_items(self) -> list[BoxItemProtocol]: ... + def create_subfolder(self, name: str) -> "BoxFolderProtocol": ... + def upload_stream(self, stream, filename: str) -> BoxFileProtocol: ... + + +@runtime_checkable +class BoxClientProtocol(Protocol): + """Protocol for Box client objects.""" + + def user(self) -> BoxUserManagerProtocol: ... + def folder(self, folder_id: str) -> BoxFolderProtocol: ... + def file(self, file_id: str) -> BoxFileProtocol: ... + + +def validate_box_client(value: Any) -> Any: + """Validate that the value implements the BoxClientProtocol interface.""" + if value is None: + raise ValueError("Box client is required") + + # Check if the object implements the required interface + if not isinstance(value, BoxClientProtocol): + # For mocks and other objects, check if they have the required methods + required_methods = ["user", "folder", "file"] + for method in required_methods: + if not hasattr(value, method): + raise ValueError(f"Client must have {method} method") + + return value + + +# Type alias for the validated Box client +BoxClientType = Annotated[Any, PlainValidator(validate_box_client)] + + +class BackendConfig(BaseModel, ABC): + """Base configuration class for all backends.""" + + model_config = ConfigDict(validate_assignment=True) + + +class LocalCSVConfig(BackendConfig): + """Configuration for Local CSV backend. + + Stores data in local CSV files organized in folder structure: + root_dir/project_id/datasets/dataset_name.csv + root_dir/project_id/experiments/experiment_name.csv + """ + + root_dir: str = "./ragas_data" + """Root directory for storing CSV files. Defaults to './ragas_data'.""" + + +class BoxCSVConfig(BackendConfig): + """Configuration for Box CSV backend. + + Stores CSV files on Box cloud storage with same organization as local CSV. + Requires an authenticated Box client to be provided. + """ + + model_config = ConfigDict(arbitrary_types_allowed=True) + + client: BoxClientType + """Authenticated Box client. User must provide this.""" + + root_folder_id: str = "0" + """Box folder ID to use as root. 
'0' is Box root folder.""" + + def model_post_init(self, __context): + """Validate configuration after initialization.""" + # Verify the client is properly authenticated by attempting to get user info + try: + user = self.client.user().get() + # Store user info for logging/debugging if needed + self._authenticated_user = user.name + except Exception as e: + raise ValueError(f"Box client authentication failed: {e}") + + +class RagasAppConfig(BackendConfig): + """Configuration for Ragas App Platform backend. + + Connects to the official Ragas platform service for cloud-based storage. + """ + + api_url: str = "https://api.ragas.io" + """Ragas API base URL. Defaults to production API.""" + + api_key: Optional[str] = None + """API key for authentication. Can be set via RAGAS_API_KEY environment variable.""" + + timeout: int = 30 + """Request timeout in seconds. Defaults to 30.""" + + max_retries: int = 3 + """Maximum number of retry attempts for failed requests. Defaults to 3.""" + + model_config = ConfigDict(env_prefix="RAGAS_") diff --git a/experimental/ragas_experimental/backends/factory.py b/experimental/ragas_experimental/backends/factory.py deleted file mode 100644 index 3d48e1600..000000000 --- a/experimental/ragas_experimental/backends/factory.py +++ /dev/null @@ -1,40 +0,0 @@ -"""Factory class for creating the backends or mocked backends.""" - -__all__ = ["RagasApiClientFactory"] - -import os -import typing as t - -from .ragas_api_client import RagasApiClient - - -class RagasApiClientFactory: - """Factory for creating Ragas API client instances.""" - - @staticmethod - def create( - app_token: t.Optional[str] = None, - base_url: t.Optional[str] = None, - ) -> RagasApiClient: - """Create a Ragas API client. - - Args: - api_key: The API key for the Ragas API - base_url: The base URL for the Ragas API - - Returns: - RagasApiClient: A Ragas API client instance - """ - if app_token is None: - app_token = os.getenv("RAGAS_APP_TOKEN") - - if app_token is None: - raise ValueError("RAGAS_API_KEY environment variable is not set") - - if base_url is None: - base_url = os.getenv("RAGAS_API_BASE_URL") - - if base_url is None: - base_url = "https://api.dev.app.ragas.io" - - return RagasApiClient(app_token=app_token, base_url=base_url) diff --git a/experimental/ragas_experimental/backends/local_csv.py b/experimental/ragas_experimental/backends/local_csv.py new file mode 100644 index 000000000..6b5c6dd16 --- /dev/null +++ b/experimental/ragas_experimental/backends/local_csv.py @@ -0,0 +1,379 @@ +"""Local CSV backend implementation for projects and datasets.""" + +import csv +import os +import typing as t +import uuid + +from ragas_experimental.model.pydantic_model import ( + ExtendedPydanticBaseModel as BaseModel, +) + +from .utils import create_nano_id +from .base import DataTableBackend, ProjectBackend +from .config import LocalCSVConfig + + +class LocalCSVDataTableBackend(DataTableBackend): + """Local CSV implementation of DataTableBackend.""" + + def __init__( + self, + local_root_dir: str, + project_id: str, + dataset_id: str, + dataset_name: str, + datatable_type: t.Literal["datasets", "experiments"], + ): + self.local_root_dir = local_root_dir + self.project_id = project_id + self.dataset_id = dataset_id + self.dataset_name = dataset_name + self.datatable_type = datatable_type + self.dataset = None + + def initialize(self, dataset): + """Initialize the backend with the dataset instance.""" + self.dataset = dataset + self._ensure_csv_exists() + + def _ensure_csv_exists(self): + """Create the CSV 
file if it doesn't exist.""" + csv_path = self._get_csv_path() + + # Create directories if needed + os.makedirs(os.path.dirname(csv_path), exist_ok=True) + + # Create file with headers if it doesn't exist + if not os.path.exists(csv_path): + # Include _row_id in the headers + if self.dataset is None: + raise ValueError( + "Dataset must be initialized before creating CSV headers" + ) + field_names = ["_row_id"] + list(self.dataset.model.__annotations__.keys()) + + with open(csv_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(field_names) + + def _get_csv_path(self): + """Get the path to the CSV file.""" + return os.path.join( + self.local_root_dir, + self.project_id, + self.datatable_type, + f"{self.dataset_name}.csv", + ) + + def get_column_mapping(self, model) -> t.Dict: + """Get mapping between model fields and CSV columns.""" + return model.model_fields + + def load_entries(self, model_class): + """Load all entries from the CSV file.""" + csv_path = self._get_csv_path() + + if not os.path.exists(csv_path): + return [] + + entries = [] + + with open(csv_path, "r", newline="") as f: + reader = csv.DictReader(f) + + for row in reader: + try: + # Extract row_id and remove from model data + row_id = row.get("_row_id", str(uuid.uuid4())) + + # Create a copy without _row_id for model instantiation + model_data = {k: v for k, v in row.items() if k != "_row_id"} + + # Convert types as needed + typed_row = {} + for field, value in model_data.items(): + if field in model_class.model_fields: + field_type = model_class.model_fields[field].annotation + + # Handle basic type conversions + if field_type is int: + typed_row[field] = int(value) if value else 0 + elif field_type is float: + typed_row[field] = float(value) if value else 0.0 + elif field_type is bool: + typed_row[field] = value.lower() in ( + "true", + "t", + "yes", + "y", + "1", + ) + else: + typed_row[field] = value + + # Create model instance + entry = model_class(**typed_row) + + # Set the row ID from CSV + entry._row_id = row_id + + entries.append(entry) + except Exception as e: + print(f"Error loading row from CSV: {e}") + + return entries + + def append_entry(self, entry) -> str: + """Add a new entry to the CSV file and return a generated ID.""" + csv_path = self._get_csv_path() + + # Read existing rows to avoid overwriting + existing_rows = [] + if os.path.exists(csv_path) and os.path.getsize(csv_path) > 0: + with open(csv_path, "r", newline="") as f: + reader = csv.DictReader(f) + existing_rows = list(reader) + + # Generate a row ID if needed + row_id = getattr(entry, "_row_id", None) or str(uuid.uuid4()) + + # Get field names including row_id + field_names = ["_row_id"] + list(entry.__class__.model_fields.keys()) + + # Convert entry to dict + entry_dict = entry.model_dump() + + # Add row_id to the dict + entry_dict["_row_id"] = row_id + + # Write all rows back with the new entry + with open(csv_path, "w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=field_names) + writer.writeheader() + + # Write existing rows + for row in existing_rows: + writer.writerow(row) + + # Write new row + writer.writerow(entry_dict) + + # Return the row ID + return row_id + + def update_entry(self, entry) -> bool: + """Update an existing entry in the CSV file.""" + # Create a copy of entries to modify + if self.dataset is None: + raise ValueError("Dataset must be initialized") + entries_to_save = list(self.dataset._entries) # Make a copy + + # Find the entry to update + updated = False + for i, e in 
enumerate(entries_to_save):
+            if (
+                hasattr(e, "_row_id")
+                and hasattr(entry, "_row_id")
+                and e._row_id == entry._row_id
+            ):
+                # Update the entry in our copy
+                entries_to_save[i] = entry
+                updated = True
+                break
+
+        # If entry wasn't found, just append it
+        if not updated:
+            entries_to_save.append(entry)
+
+        # Write all entries back to CSV
+        self._write_entries_to_csv(entries_to_save)
+
+        return True
+
+    def delete_entry(self, entry_id) -> bool:
+        """Delete an entry from the CSV file."""
+        # Create a copy of entries to modify, excluding the one to delete
+        if self.dataset is None:
+            raise ValueError("Dataset must be initialized")
+        entries_to_save = []
+        for e in self.dataset._entries:
+            if not (hasattr(e, "_row_id") and e._row_id == entry_id):
+                entries_to_save.append(e)
+
+        # Write all entries back to CSV
+        self._write_entries_to_csv(entries_to_save)
+
+        return True
+
+    def _write_entries_to_csv(self, entries):
+        """Write all entries to the CSV file."""
+        csv_path = self._get_csv_path()
+
+        if not entries:
+            # If no entries, just create an empty CSV with headers
+            if self.dataset is None:
+                raise ValueError("Dataset must be initialized")
+            field_names = ["_row_id"] + list(self.dataset.model.model_fields.keys())
+            with open(csv_path, "w", newline="") as f:
+                writer = csv.DictWriter(f, fieldnames=field_names)
+                writer.writeheader()
+            return
+
+        # Get field names including _row_id
+        field_names = ["_row_id"] + list(entries[0].__class__.model_fields.keys())
+
+        # Write all entries
+        with open(csv_path, "w", newline="") as f:
+            writer = csv.DictWriter(f, fieldnames=field_names)
+            writer.writeheader()
+
+            for entry in entries:
+                # Create a dict with model data + row_id
+                entry_dict = entry.model_dump()
+                entry_dict["_row_id"] = getattr(entry, "_row_id", str(uuid.uuid4()))
+
+                writer.writerow(entry_dict)
+
+    def get_entry_by_field(
+        self, field_name, field_value, model_class
+    ) -> t.Optional[t.Any]:
+        """Get an entry by field value."""
+        entries = self.load_entries(model_class)
+
+        for entry in entries:
+            if hasattr(entry, field_name) and getattr(entry, field_name) == field_value:
+                return entry
+
+        return None
+
+
+class LocalCSVProjectBackend(ProjectBackend):
+    """Local CSV implementation of ProjectBackend."""
+
+    def __init__(self, config: LocalCSVConfig):
+        self.config = config
+        self.root_dir = config.root_dir
+        self.project_id: t.Optional[str] = None
+
+    def initialize(self, project_id: str, **kwargs):
+        """Initialize the backend with project information."""
+        self.project_id = project_id
+        self._project_dir = os.path.join(self.root_dir, project_id)
+        self._create_project_structure()
+
+    def _create_project_structure(self):
+        """Create the local directory structure for the project."""
+        os.makedirs(self._project_dir, exist_ok=True)
+        # Create datasets directory
+        os.makedirs(os.path.join(self._project_dir, "datasets"), exist_ok=True)
+        # Create experiments directory
+        os.makedirs(os.path.join(self._project_dir, "experiments"), exist_ok=True)
+
+    def create_dataset(self, name: str, model: t.Type[BaseModel]) -> str:
+        """Create a new dataset and return its ID."""
+        dataset_id = create_nano_id()
+        return dataset_id
+
+    def create_experiment(self, name: str, model: t.Type[BaseModel]) -> str:
+        """Create a new experiment and return its ID."""
+        experiment_id = create_nano_id()
+        return experiment_id
+
+    def list_datasets(self) -> t.List[t.Dict]:
+        """List all datasets in the project."""
+        datasets_dir = os.path.join(self._project_dir, "datasets")
+        if not 
os.path.exists(datasets_dir): + return [] + + datasets = [] + for filename in os.listdir(datasets_dir): + if filename.endswith(".csv"): + name = os.path.splitext(filename)[0] + datasets.append( + { + "id": create_nano_id(), # Generate ID for consistency + "name": name, + } + ) + return datasets + + def list_experiments(self) -> t.List[t.Dict]: + """List all experiments in the project.""" + experiments_dir = os.path.join(self._project_dir, "experiments") + if not os.path.exists(experiments_dir): + return [] + + experiments = [] + for filename in os.listdir(experiments_dir): + if filename.endswith(".csv"): + name = os.path.splitext(filename)[0] + experiments.append( + { + "id": create_nano_id(), # Generate ID for consistency + "name": name, + } + ) + return experiments + + def get_dataset_backend( + self, dataset_id: str, name: str, model: t.Type[BaseModel] + ) -> DataTableBackend: + """Get a DataTableBackend instance for a specific dataset.""" + if self.project_id is None: + raise ValueError( + "Backend must be initialized before creating dataset backend" + ) + return LocalCSVDataTableBackend( + local_root_dir=self.root_dir, + project_id=self.project_id, + dataset_id=dataset_id, + dataset_name=name, + datatable_type="datasets", + ) + + def get_experiment_backend( + self, experiment_id: str, name: str, model: t.Type[BaseModel] + ) -> DataTableBackend: + """Get a DataTableBackend instance for a specific experiment.""" + if self.project_id is None: + raise ValueError( + "Backend must be initialized before creating experiment backend" + ) + return LocalCSVDataTableBackend( + local_root_dir=self.root_dir, + project_id=self.project_id, + dataset_id=experiment_id, + dataset_name=name, + datatable_type="experiments", + ) + + def get_dataset_by_name( + self, name: str, model: t.Type[BaseModel] + ) -> t.Tuple[str, DataTableBackend]: + """Get dataset ID and backend by name.""" + # Check if the dataset file exists + dataset_path = os.path.join(self._project_dir, "datasets", f"{name}.csv") + if not os.path.exists(dataset_path): + raise ValueError(f"Dataset '{name}' does not exist") + + # Create dataset instance with a random ID + dataset_id = create_nano_id() + backend = self.get_dataset_backend(dataset_id, name, model) + + return dataset_id, backend + + def get_experiment_by_name( + self, name: str, model: t.Type[BaseModel] + ) -> t.Tuple[str, DataTableBackend]: + """Get experiment ID and backend by name.""" + # Check if the experiment file exists + experiment_path = os.path.join(self._project_dir, "experiments", f"{name}.csv") + if not os.path.exists(experiment_path): + raise ValueError(f"Experiment '{name}' does not exist") + + # Create experiment instance with a random ID + experiment_id = create_nano_id() + backend = self.get_experiment_backend(experiment_id, name, model) + + return experiment_id, backend diff --git a/experimental/ragas_experimental/backends/mock_notion.py b/experimental/ragas_experimental/backends/mock_notion.py deleted file mode 100644 index e9d4dba87..000000000 --- a/experimental/ragas_experimental/backends/mock_notion.py +++ /dev/null @@ -1,269 +0,0 @@ -"""Helps with testing `ragas_annotator` better.""" - -__all__ = [ - "MockPagesAPI", - "MockDatabasesAPI", - "MockBlocksAPI", - "MockBlockChildrenAPI", - "MockNotionClient", -] - -import uuid -from copy import deepcopy -from datetime import datetime - -from ..exceptions import NotFoundError - - -class MockPagesAPI: - """Mock implementation of notion_client.Client.pages""" - - def __init__(self, client): - self.client = client - - 
def create(self, parent, properties, **kwargs): - """Create a new page.""" - page_id = self.client._create_id() - - # Create the page object - page = { - "id": page_id, - "object": "page", - "created_time": self.client._get_timestamp(), - "last_edited_time": self.client._get_timestamp(), - "archived": False, - "properties": deepcopy(properties), - "parent": deepcopy(parent), - } - - # Add page to storage - self.client._pages[page_id] = page - - # Add child reference to parent - parent_type = parent.get("type") - parent_id = parent.get(f"{parent_type}_id") - - if parent_id: - child_block = { - "id": self.client._create_id(), - "object": "block", - "type": "child_page", - "created_time": self.client._get_timestamp(), - "last_edited_time": self.client._get_timestamp(), - "child_page": {"title": self._extract_title(properties)}, - } - - if parent_id not in self.client._children: - self.client._children[parent_id] = [] - - self.client._children[parent_id].append(child_block) - - return deepcopy(page) - - def retrieve(self, page_id): - """Retrieve a page by ID.""" - if page_id not in self.client._pages: - raise NotFoundError(f"Page {page_id} not found") - - return deepcopy(self.client._pages[page_id]) - - def update(self, page_id, properties=None, archived=None, **kwargs): - """Update a page.""" - if page_id not in self.client._pages: - raise NotFoundError(f"Page {page_id} not found") - - page = self.client._pages[page_id] - - if properties: - # Update properties - for key, value in properties.items(): - page["properties"][key] = deepcopy(value) - - if archived is not None: - page["archived"] = archived - - page["last_edited_time"] = self.client._get_timestamp() - - return deepcopy(page) - - def _extract_title(self, properties): - """Extract page title from properties.""" - for prop in properties.values(): - if prop.get("type") == "title" and prop.get("title"): - for text_obj in prop["title"]: - if text_obj.get("type") == "text" and "content" in text_obj.get( - "text", {} - ): - return text_obj["text"]["content"] - return "Untitled" - - -class MockDatabasesAPI: - """Mock implementation of notion_client.Client.databases""" - - def __init__(self, client): - self.client = client - - def create(self, parent, title, properties, **kwargs): - """Create a new database.""" - database_id = self.client._create_id() - - # Create database object - database = { - "id": database_id, - "object": "database", - "created_time": self.client._get_timestamp(), - "last_edited_time": self.client._get_timestamp(), - "title": deepcopy(title), - "properties": deepcopy(properties), - "parent": deepcopy(parent), - } - - # Add database to storage - self.client._databases[database_id] = database - - # Add child reference to parent - parent_type = parent.get("type") - parent_id = parent.get(f"{parent_type}_id") - - if parent_id: - child_block = { - "id": self.client._create_id(), - "object": "block", - "type": "child_database", - "created_time": self.client._get_timestamp(), - "last_edited_time": self.client._get_timestamp(), - "child_database": {"title": self._extract_title(title)}, - } - - if parent_id not in self.client._children: - self.client._children[parent_id] = [] - - self.client._children[parent_id].append(child_block) - - return deepcopy(database) - - def retrieve(self, database_id): - """Retrieve a database by ID.""" - if database_id not in self.client._databases: - raise NotFoundError(f"Database {database_id} not found") - - return deepcopy(self.client._databases[database_id]) - - def query( - self, - database_id, 
- filter=None, - sorts=None, - start_cursor=None, - page_size=100, - **kwargs, - ): - """Query a database.""" - if database_id not in self.client._databases: - raise NotFoundError(f"Database {database_id} not found") - - # Get all pages in the database - results = [] - for page_id, page in self.client._pages.items(): - parent = page.get("parent", {}) - if ( - parent.get("type") == "database_id" - and parent.get("database_id") == database_id - ): - results.append(deepcopy(page)) - - # TODO: Implement filtering, sorting, and pagination if needed - - return {"results": results, "has_more": False, "next_cursor": None} - - def _extract_title(self, title): - """Extract database title from title array.""" - for text_obj in title: - if text_obj.get("type") == "text" and "content" in text_obj.get("text", {}): - return text_obj["text"]["content"] - return "Untitled" - - -class MockBlocksAPI: - """Mock implementation of notion_client.Client.blocks""" - - def __init__(self, client): - self.client = client - self.children = MockBlockChildrenAPI(client) - - def retrieve(self, block_id): - """Retrieve a block by ID.""" - if block_id not in self.client._blocks: - raise NotFoundError(f"Block {block_id} not found") - - return deepcopy(self.client._blocks[block_id]) - - -class MockBlockChildrenAPI: - """Mock implementation of notion_client.Client.blocks.children""" - - def __init__(self, client): - self.client = client - - def list(self, block_id, start_cursor=None, page_size=100): - """List children of a block.""" - children = self.client._children.get(block_id, []) - - # TODO: Implement pagination if needed - - return {"results": deepcopy(children), "has_more": False, "next_cursor": None} - - -class MockNotionClient: - """Mock implementation of notion_client.Client for testing.""" - - def __init__(self, auth=None): - """Initialize the mock client with in-memory storage. 
- - Args: - auth: Ignored in mock implementation - """ - # In-memory storage - self._pages = {} # page_id -> page object - self._databases = {} # database_id -> database object - self._blocks = {} # block_id -> block object - self._children = {} # parent_id -> list of child blocks - - # Create API namespaces to match real client - self.pages = MockPagesAPI(self) - self.databases = MockDatabasesAPI(self) - self.blocks = MockBlocksAPI(self) - - def _get_timestamp(self): - """Generate a timestamp in Notion API format.""" - return datetime.utcnow().isoformat() + "Z" - - def _create_id(self): - """Generate a random ID in Notion format.""" - return str(uuid.uuid4()).replace("-", "") - - def add_page(self, page_data): - """Add a page to the mock storage.""" - self._pages[page_data["id"]] = deepcopy(page_data) - - def add_database(self, database_data): - """Add a database to the mock storage.""" - self._databases[database_data["id"]] = deepcopy(database_data) - - def add_block(self, block_data): - """Add a block to the mock storage.""" - self._blocks[block_data["id"]] = deepcopy(block_data) - - def add_children(self, parent_id, children): - """Add children to a parent.""" - if parent_id not in self._children: - self._children[parent_id] = [] - self._children[parent_id].extend(deepcopy(children)) - - def __str__(self): - return "MockNotionClient(num_pages={}, num_databases={}, num_blocks={})".format( - len(self._pages), len(self._databases), len(self._blocks) - ) - - __repr__ = __str__ diff --git a/experimental/ragas_experimental/backends/notion_backend.py b/experimental/ragas_experimental/backends/notion_backend.py deleted file mode 100644 index 38c53b722..000000000 --- a/experimental/ragas_experimental/backends/notion_backend.py +++ /dev/null @@ -1,357 +0,0 @@ -"""`Project` uses this backend to interact with the Notion API.""" - -__all__ = ["NotionBackend", "get_page_id", "get_database_id"] - -import os -import typing as t - -from fastcore.utils import patch, patch_to -from notion_client import Client as NotionClient - -from ..exceptions import DuplicateError, NotFoundError - - -class NotionBackend: - """A backend for interacting with the Notion API""" - - def __init__( - self, root_page_id: str, notion_client: t.Optional[NotionClient] = None - ): - self.root_page_id = root_page_id - if notion_client is None: - self.client = NotionClient(auth=os.getenv("NOTION_API_KEY")) - else: - self.client = notion_client - - def __repr__(self): - return f"NotionBackend(root_page_id={self.root_page_id})" - - def validate_project_structure(self, root_page_id): - """ - Validate the project structure by checking if the root page exists and has the correct sub-pages. - Structure is as follows: - - Root Page - - Datasets - - Experiments - - Comparisons - """ - # Check if root page exists - if not self.page_exists(root_page_id): - return False - - # Search for required sub-pages under root - required_pages = {"Datasets", "Experiments", "Comparisons"} - found_pages = set() - - # Search for child pages - children = self.client.blocks.children.list(root_page_id) - for block in children["results"]: - if block["type"] == "child_page": - found_pages.add(block["child_page"]["title"]) - - # Verify all required pages exist - return required_pages.issubset(found_pages) - - def create_new_page(self, parent_page_id, page_name) -> str: - """ - Create a new page inside the given parent page and return the page id. 
- - Args: - parent_page_id (str): The ID of the parent page - page_name (str): The title for the new page - - Returns: - str: The ID of the newly created page - - Raises: - ValueError: If the parent page does not exist - """ - # First check if parent page exists - if not self.page_exists(parent_page_id): - raise ValueError(f"Parent page {parent_page_id} does not exist") - - # Create a new child page - response = self.client.pages.create( - parent={"type": "page_id", "page_id": parent_page_id}, - properties={"title": [{"type": "text", "text": {"content": page_name}}]}, - ) - - # Return the ID of the newly created page - return response["id"] - - def page_exists(self, page_id): - """Check if a page exists by attempting to retrieve it.""" - try: - self.client.pages.retrieve(page_id) - return True - except Exception: - return False - - def create_new_database( - self, parent_page_id: str, title: str, properties: dict - ) -> str: - """Create a new database inside the given parent page. - - Args: - parent_page_id (str): The ID of the parent page - title (str): The title for the new database - properties (dict): The database properties definition - - Returns: - str: The ID of the newly created database - """ - response = self.client.databases.create( - parent={"type": "page_id", "page_id": parent_page_id}, - title=[{"type": "text", "text": {"content": title}}], - properties=properties, - ) - return response["id"] - - -@t.overload -def get_page_id( - self, parent_id: str, page_name: str, return_multiple: t.Literal[False] = False -) -> str: ... -@t.overload -def get_page_id( - self, parent_id: str, page_name: str, return_multiple: t.Literal[True] -) -> t.List[str]: ... -@patch_to(NotionBackend) -def get_page_id( - self, parent_id: str, page_name: str, return_multiple: bool = False -) -> t.Union[str, t.List[str]]: - """Get page ID(s) by name under a parent page. - - Args: - parent_id (str): The ID of the parent page to search under - page_name (str): The title of the page to find - return_multiple (bool): If True, returns all matching page IDs - - Returns: - Union[str, List[str]]: Single page ID or list of page IDs - - Raises: - DuplicateError: If return_multiple is False and multiple pages found - ValueError: If no pages found - """ - matching_pages = [] - next_cursor = None - - while True: - # Get page of results, using cursor if we have one - response = self.client.blocks.children.list(parent_id, start_cursor=next_cursor) - - # Check each block in current page - for block in response["results"]: - if ( - block["type"] == "child_page" - and block["child_page"]["title"] == page_name - ): - matching_pages.append(block["id"]) - - # Check if there are more results - if not response.get("has_more", False): - break - - next_cursor = response.get("next_cursor") - - if not matching_pages: - raise NotFoundError(f"No page found with name '{page_name}'") - - if return_multiple: - return matching_pages - else: - if len(matching_pages) > 1: - raise DuplicateError(f"Multiple pages found with name '{page_name}'") - return matching_pages[0] - - -@t.overload -def get_database_id( - self, parent_page_id: str, name: str, return_multiple: t.Literal[False] = False -) -> str: ... -@t.overload -def get_database_id( - self, parent_page_id: str, name: str, return_multiple: t.Literal[True] -) -> t.List[str]: ... -@patch_to(NotionBackend) -def get_database_id( - self, parent_page_id: str, name: str, return_multiple: bool = False -) -> t.Union[str, t.List[str]]: - """Get the database ID(s) by name under a parent page. 
- - Args: - parent_page_id (str): The ID of the parent page to search under - name (str): The name of the database to find - return_multiple (bool): If True, returns all matching database IDs - - Returns: - Union[str, List[str]]: Single database ID or list of database IDs - - Raises: - NotFoundError: If no database found with given name - DuplicateError: If return_multiple is False and multiple databases found - """ - matching_databases = [] - next_cursor = None - - while True: - response = self.client.blocks.children.list( - parent_page_id, start_cursor=next_cursor - ) - - for block in response["results"]: - if block["type"] == "child_database": - database = self.client.databases.retrieve(database_id=block["id"]) - if database["title"][0]["plain_text"].lower() == name.lower(): - matching_databases.append(block["id"]) - - if not response.get("has_more", False): - break - - next_cursor = response.get("next_cursor") - - if not matching_databases: - raise NotFoundError(f"No database found with name '{name}'") - - if return_multiple: - return matching_databases - else: - if len(matching_databases) > 1: - raise DuplicateError(f"Multiple databases found with name '{name}'") - return matching_databases[0] - - -@patch -def create_page_in_database( - self: NotionBackend, - database_id: str, - properties: dict, - parent: t.Optional[dict] = None, -) -> dict: - """Create a new page in a database. - - Args: - database_id: The ID of the database to create the page in - properties: The page properties - parent: Optional parent object (defaults to database parent) - - Returns: - dict: The created page object - """ - if parent is None: - parent = {"type": "database_id", "database_id": database_id} - - # Remove any unique_id properties as they cannot be updated directly - filtered_properties = { - k: v - for k, v in properties.items() - if not (isinstance(v, dict) and v.get("type") == "unique_id") - } - - response = self.client.pages.create(parent=parent, properties=filtered_properties) - - return response - - -@patch -def get_database(self: NotionBackend, database_id: str) -> dict: - """Get a database by ID. - - Args: - database_id: The ID of the database to retrieve - - Returns: - dict: The database object - """ - return self.client.databases.retrieve(database_id=database_id) - - -@patch -def query_database( - self: NotionBackend, - database_id: str, - filter: t.Optional[dict] = None, - sorts: t.Optional[t.List[dict]] = None, - archived: bool = False, -) -> dict: - """Query a database with optional filtering and sorting. - - Args: - database_id: The ID of the database to query - filter: Optional filter conditions - sorts: Optional sort conditions - archived: If True, include archived pages. 
If False, only return non-archived pages - - Returns: - dict: Query response containing all results - """ - query_params = { - "database_id": database_id, - "page_size": 100, # Maximum allowed by Notion API - } - - if filter: - query_params["filter"] = filter - if sorts: - query_params["sorts"] = sorts - - # Initialize results - all_results = [] - has_more = True - next_cursor = None - - # Fetch all pages - while has_more: - if next_cursor: - query_params["start_cursor"] = next_cursor - - response = self.client.databases.query(**query_params) - - # Filter results based on archived status - filtered_results = [ - page - for page in response["results"] - if page.get("archived", False) == archived - ] - all_results.extend(filtered_results) - - has_more = response.get("has_more", False) - next_cursor = response.get("next_cursor") - - # Return combined results - return {"results": all_results, "has_more": False, "next_cursor": None} - - -@patch -def update_page( - self: NotionBackend, - page_id: str, - properties: t.Optional[t.Dict[str, t.Any]] = None, - archived: bool = False, -) -> dict: - """Update a page's properties and/or archive status. - - Args: - page_id: The ID of the page to update - properties: Optional properties to update - archived: Whether to archive the page - - Returns: - dict: The updated page object - """ - update_params = {"page_id": page_id} - - if properties: - # Remove any unique_id properties as they cannot be updated directly - filtered_properties = { - k: v - for k, v in properties.items() - if not (isinstance(v, dict) and v.get("type") == "unique_id") - } - update_params["properties"] = filtered_properties - - if archived: - update_params["archived"] = True # type: ignore - - return self.client.pages.update(**update_params) diff --git a/experimental/ragas_experimental/backends/ragas_app.py b/experimental/ragas_experimental/backends/ragas_app.py new file mode 100644 index 000000000..979836526 --- /dev/null +++ b/experimental/ragas_experimental/backends/ragas_app.py @@ -0,0 +1,359 @@ +"""Ragas App (Platform API) backend implementation for projects and datasets.""" + +import asyncio +import typing as t + +import ragas_experimental.typing as rt +from ragas_experimental.model.pydantic_model import ( + ExtendedPydanticBaseModel as BaseModel, +) + +from .ragas_api_client import RagasApiClient +from ..utils import async_to_sync +from .utils import create_nano_id +from .base import DataTableBackend, ProjectBackend +from .config import RagasAppConfig + + +class RagasAppDataTableBackend(DataTableBackend): + """Ragas App API implementation of DataTableBackend.""" + + def __init__( + self, + ragas_api_client: RagasApiClient, + project_id: str, + dataset_id: str, + datatable_type: t.Literal["datasets", "experiments"], + ): + self.ragas_api_client = ragas_api_client + self.project_id = project_id + self.dataset_id = dataset_id + self.datatable_type = datatable_type + self.dataset = None + + def initialize(self, dataset): + """Initialize the backend with the dataset instance.""" + self.dataset = dataset + + def get_column_mapping(self, model): + """Get mapping between model fields and backend columns.""" + if self.datatable_type == "datasets": + sync_func = async_to_sync(self.ragas_api_client.list_dataset_columns) + columns = sync_func(project_id=self.project_id, dataset_id=self.dataset_id) + else: # experiments + sync_func = async_to_sync(self.ragas_api_client.list_experiment_columns) + columns = sync_func( + project_id=self.project_id, experiment_id=self.dataset_id + ) + + 
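+        # Expected response shape (illustrative -- only "id" and "name" are
+        # consumed below): {"items": [{"id": "col_1", "name": "question"}]}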
column_id_map = {column["name"]: column["id"] for column in columns["items"]} + + # Update the model's column mapping with the values from the API + column_mapping = {} + for field_name in model.__annotations__: + if field_name in column_id_map: + column_mapping[field_name] = column_id_map[field_name] + + return column_mapping + + def load_entries(self, model_class) -> t.List[t.Any]: + """Load all entries from the API.""" + # Get all rows + if self.datatable_type == "datasets": + sync_func = async_to_sync(self.ragas_api_client.list_dataset_rows) + response = sync_func(project_id=self.project_id, dataset_id=self.dataset_id) + else: # experiments + sync_func = async_to_sync(self.ragas_api_client.list_experiment_rows) + response = sync_func( + project_id=self.project_id, experiment_id=self.dataset_id + ) + + # Get column mapping (ID -> name) + column_map = {v: k for k, v in model_class.__column_mapping__.items()} + + # Process rows + entries = [] + for row in response.get("items", []): + model_data = {} + row_id = row.get("id") + + # Convert from API data format to model fields + for col_id, value in row.get("data", {}).items(): + if col_id in column_map: + field_name = column_map[col_id] + model_data[field_name] = value + + # Create model instance + entry = model_class(**model_data) + + # Store row ID for future operations + entry._row_id = row_id + + entries.append(entry) + + return entries + + def append_entry(self, entry) -> str: + """Add a new entry to the API and return its ID.""" + # Get column mapping + column_id_map = entry.__class__.__column_mapping__ + + # Create row data + row_dict_converted = rt.ModelConverter.instance_to_row(entry) + row_id = create_nano_id() + row_data = {} + + for column in row_dict_converted["data"]: + if column["column_id"] in column_id_map: + row_data[column_id_map[column["column_id"]]] = column["data"] + + # Create row in API + if self.datatable_type == "datasets": + sync_func = async_to_sync(self.ragas_api_client.create_dataset_row) + response = sync_func( + project_id=self.project_id, + dataset_id=self.dataset_id, + id=row_id, + data=row_data, + ) + else: # experiments + sync_func = async_to_sync(self.ragas_api_client.create_experiment_row) + response = sync_func( + project_id=self.project_id, + experiment_id=self.dataset_id, + id=row_id, + data=row_data, + ) + + # Return the row ID + return response["id"] + + def update_entry(self, entry) -> bool: + """Update an existing entry in the API.""" + # Get the row ID + row_id = None + if hasattr(entry, "_row_id") and entry._row_id: + row_id = entry._row_id + else: + raise ValueError("Cannot update: entry has no row ID") + + # Get column mapping and prepare data + column_id_map = entry.__class__.__column_mapping__ + row_dict = rt.ModelConverter.instance_to_row(entry)["data"] + row_data = {} + + for column in row_dict: + if column["column_id"] in column_id_map: + row_data[column_id_map[column["column_id"]]] = column["data"] + + # Update in API + if self.datatable_type == "datasets": + sync_func = async_to_sync(self.ragas_api_client.update_dataset_row) + response = sync_func( + project_id=self.project_id, + dataset_id=self.dataset_id, + row_id=row_id, + data=row_data, + ) + else: # experiments + sync_func = async_to_sync(self.ragas_api_client.update_experiment_row) + response = sync_func( + project_id=self.project_id, + experiment_id=self.dataset_id, + row_id=row_id, + data=row_data, + ) + + return response + + def delete_entry(self, entry_id) -> bool: + """Delete an entry from the API.""" + # Delete the row + if 
self.datatable_type == "datasets": + sync_func = async_to_sync(self.ragas_api_client.delete_dataset_row) + response = sync_func( + project_id=self.project_id, dataset_id=self.dataset_id, row_id=entry_id + ) + else: # experiments + sync_func = async_to_sync(self.ragas_api_client.delete_experiment_row) + response = sync_func( + project_id=self.project_id, + experiment_id=self.dataset_id, + row_id=entry_id, + ) + + return response + + def get_entry_by_field( + self, field_name, field_value, model_class + ) -> t.Optional[t.Any]: + """Get an entry by field value.""" + # We don't have direct filtering in the API, so load all and filter + entries = self.load_entries(model_class) + + # Search for matching entry + for entry in entries: + if hasattr(entry, field_name) and getattr(entry, field_name) == field_value: + return entry + + return None + + +async def create_dataset_columns( + project_id, dataset_id, columns, create_dataset_column_func +): + """Helper function to create dataset columns.""" + tasks = [] + for column in columns: + tasks.append( + create_dataset_column_func( + project_id=project_id, + dataset_id=dataset_id, + id=create_nano_id(), + name=column["name"], + type=column["type"], + settings=column["settings"], + ) + ) + return await asyncio.gather(*tasks) + + +async def create_experiment_columns( + project_id, experiment_id, columns, create_experiment_column_func +): + """Helper function to create experiment columns.""" + tasks = [] + for column in columns: + tasks.append( + create_experiment_column_func( + project_id=project_id, + experiment_id=experiment_id, + id=create_nano_id(), + name=column["name"], + type=column["type"], + settings=column["settings"], + ) + ) + return await asyncio.gather(*tasks) + + +class RagasAppProjectBackend(ProjectBackend): + """Ragas App API implementation of ProjectBackend.""" + + def __init__(self, config: RagasAppConfig): + self.config = config + self.ragas_api_client = RagasApiClient( + base_url=config.api_url, + app_token=config.api_key, + ) + self.project_id: t.Optional[str] = None + + def initialize(self, project_id: str, **kwargs): + """Initialize the backend with project information.""" + self.project_id = project_id + + def create_dataset(self, name: str, model: t.Type[BaseModel]) -> str: + """Create a new dataset and return its ID.""" + # Create the dataset + sync_version = async_to_sync(self.ragas_api_client.create_dataset) + dataset_info = sync_version( + project_id=self.project_id, + name=name, + ) + + # Create the columns for the dataset + column_types = rt.ModelConverter.model_to_columns(model) + sync_create_columns = async_to_sync(create_dataset_columns) + sync_create_columns( + project_id=self.project_id, + dataset_id=dataset_info["id"], + columns=column_types, + create_dataset_column_func=self.ragas_api_client.create_dataset_column, + ) + + return dataset_info["id"] + + def create_experiment(self, name: str, model: t.Type[BaseModel]) -> str: + """Create a new experiment and return its ID.""" + # Create the experiment in the API + sync_version = async_to_sync(self.ragas_api_client.create_experiment) + experiment_info = sync_version( + project_id=self.project_id, + name=name, + ) + + # Create the columns for the experiment + column_types = rt.ModelConverter.model_to_columns(model) + sync_version = async_to_sync(create_experiment_columns) + sync_version( + project_id=self.project_id, + experiment_id=experiment_info["id"], + columns=column_types, + create_experiment_column_func=self.ragas_api_client.create_experiment_column, + ) + + 
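+        # Both the experiment record and its columns now exist remotely;
+        # hand back the API-assigned ID so callers can build a DataTable backend.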
return experiment_info["id"] + + def list_datasets(self) -> t.List[t.Dict]: + """List all datasets in the project.""" + sync_version = async_to_sync(self.ragas_api_client.list_datasets) + datasets = sync_version(project_id=self.project_id) + return datasets.get("items", []) + + def list_experiments(self) -> t.List[t.Dict]: + """List all experiments in the project.""" + sync_version = async_to_sync(self.ragas_api_client.list_experiments) + experiments = sync_version(project_id=self.project_id) + return experiments.get("items", []) + + def get_dataset_backend( + self, dataset_id: str, name: str, model: t.Type[BaseModel] + ) -> DataTableBackend: + """Get a DataTableBackend instance for a specific dataset.""" + if self.project_id is None: + raise ValueError( + "Backend must be initialized before creating dataset backend" + ) + return RagasAppDataTableBackend( + ragas_api_client=self.ragas_api_client, + project_id=self.project_id, + dataset_id=dataset_id, + datatable_type="datasets", + ) + + def get_experiment_backend( + self, experiment_id: str, name: str, model: t.Type[BaseModel] + ) -> DataTableBackend: + """Get a DataTableBackend instance for a specific experiment.""" + if self.project_id is None: + raise ValueError( + "Backend must be initialized before creating experiment backend" + ) + return RagasAppDataTableBackend( + ragas_api_client=self.ragas_api_client, + project_id=self.project_id, + dataset_id=experiment_id, + datatable_type="experiments", + ) + + def get_dataset_by_name( + self, name: str, model: t.Type[BaseModel] + ) -> t.Tuple[str, DataTableBackend]: + """Get dataset ID and backend by name.""" + # Search for dataset with given name + sync_version = async_to_sync(self.ragas_api_client.get_dataset_by_name) + dataset_info = sync_version(project_id=self.project_id, dataset_name=name) + + backend = self.get_dataset_backend(dataset_info["id"], name, model) + return dataset_info["id"], backend + + def get_experiment_by_name( + self, name: str, model: t.Type[BaseModel] + ) -> t.Tuple[str, DataTableBackend]: + """Get experiment ID and backend by name.""" + # Search for experiment with given name + sync_version = async_to_sync(self.ragas_api_client.get_experiment_by_name) + experiment_info = sync_version(project_id=self.project_id, experiment_name=name) + + backend = self.get_experiment_backend(experiment_info["id"], name, model) + return experiment_info["id"], backend diff --git a/experimental/ragas_experimental/backends/registry.py b/experimental/ragas_experimental/backends/registry.py new file mode 100644 index 000000000..28ba2f586 --- /dev/null +++ b/experimental/ragas_experimental/backends/registry.py @@ -0,0 +1,390 @@ +"""Backend registry for managing and discovering project backends.""" + +import logging +import typing as t +from importlib import metadata + +from .base import ProjectBackend + +logger = logging.getLogger(__name__) + + +class BackendRegistry: + """Registry for managing project backends with plugin support.""" + + _instance = None + _backends: t.Dict[str, t.Type[ProjectBackend]] = {} + _aliases: t.Dict[str, str] = {} + _discovered = False + + def __new__(cls): + """Singleton pattern to ensure single registry instance.""" + if cls._instance is None: + cls._instance = super().__new__(cls) + return cls._instance + + @classmethod + def instance(cls) -> "BackendRegistry": + """Get the singleton registry instance.""" + if cls._instance is None: + cls._instance = cls() + return cls._instance + + def register_backend( + self, + name: str, + backend_class: 
t.Type[ProjectBackend], + aliases: t.Optional[t.List[str]] = None, + overwrite: bool = False, + ) -> None: + """Register a backend class with the registry. + + Args: + name: Primary name for the backend + backend_class: The backend class to register + aliases: Optional list of alternative names for the backend + overwrite: Whether to overwrite existing backends with the same name + + Raises: + TypeError: If backend_class doesn't inherit from ProjectBackend + ValueError: If backend name already exists and overwrite=False + """ + if not name or not isinstance(name, str): + raise ValueError("Backend name must be a non-empty string") + + if not issubclass(backend_class, ProjectBackend): + raise TypeError( + f"Backend class {backend_class} must inherit from ProjectBackend" + ) + + # Check for existing registration + if name in self._backends and not overwrite: + raise ValueError( + f"Backend '{name}' is already registered. Use overwrite=True to replace." + ) + + self._backends[name] = backend_class + logger.debug(f"Registered backend: {name} -> {backend_class}") + + # Register aliases + if aliases: + for alias in aliases: + if not alias or not isinstance(alias, str): + logger.warning( + f"Invalid alias '{alias}' for backend '{name}', skipping" + ) + continue + + if alias in self._aliases and not overwrite: + logger.warning(f"Alias '{alias}' already exists, skipping") + continue + + self._aliases[alias] = name + logger.debug(f"Registered backend alias: {alias} -> {name}") + + def get_backend(self, name: str) -> t.Type[ProjectBackend]: + """Get a backend class by name. + + Args: + name: Name or alias of the backend + + Returns: + The backend class + + Raises: + ValueError: If backend is not found + """ + # Ensure backends are discovered + if not self._discovered: + self.discover_backends() + + # Check if it's an alias first + if name in self._aliases: + name = self._aliases[name] + + if name not in self._backends: + available = list(self._backends.keys()) + list(self._aliases.keys()) + raise ValueError( + f"Backend '{name}' not found. Available backends: {available}" + ) + + return self._backends[name] + + def list_available_backends(self) -> t.List[str]: + """List all available backend names. + + Returns: + List of backend names (primary names only, not aliases) + """ + if not self._discovered: + self.discover_backends() + + return list(self._backends.keys()) + + def list_all_names(self) -> t.Dict[str, t.List[str]]: + """List all backend names including aliases. + + Returns: + Dictionary mapping primary names to lists of all names (including aliases) + """ + if not self._discovered: + self.discover_backends() + + result = {} + for primary_name in self._backends.keys(): + aliases = [ + alias + for alias, target in self._aliases.items() + if target == primary_name + ] + result[primary_name] = [primary_name] + aliases + + return result + + def discover_backends(self) -> t.Dict[str, t.Type[ProjectBackend]]: + """Discover and register backends from entry points and manual registration. + + Returns: + Dictionary of discovered backends + """ + if self._discovered: + return self._backends.copy() + + logger.debug("Discovering backends...") + + # First register built-in backends manually (for now) + self._register_builtin_backends() + + # Then discover from entry points + self._discover_from_entry_points() + + self._discovered = True + logger.info( + f"Backend discovery complete. Found {len(self._backends)} backends." 
+ ) + + return self._backends.copy() + + def _register_builtin_backends(self) -> None: + """Register the built-in backends.""" + try: + from .local_csv import LocalCSVProjectBackend + + self.register_backend("local/csv", LocalCSVProjectBackend) + + from .ragas_app import RagasAppProjectBackend + + self.register_backend("ragas/app", RagasAppProjectBackend) + + # Box backend (optional import) + try: + from .box_csv import BoxCSVProjectBackend + + self.register_backend("box/csv", BoxCSVProjectBackend) + except ImportError: + logger.debug("Box backend not available (optional dependency)") + + except ImportError as e: + logger.warning(f"Failed to import built-in backend: {e}") + + def _discover_from_entry_points(self) -> None: + """Discover backends from setuptools entry points.""" + try: + # Look for entry points in the 'ragas.backends' group + entry_points = metadata.entry_points().select(group="ragas.backends") + + for entry_point in entry_points: + try: + backend_class = entry_point.load() + self.register_backend(entry_point.name, backend_class) + logger.info( + f"Discovered backend from entry point: {entry_point.name}" + ) + + except Exception as e: + logger.warning(f"Failed to load backend '{entry_point.name}': {e}") + + except Exception as e: + logger.debug( + f"Entry point discovery failed (this is normal if no plugins installed): {e}" + ) + + def get_backend_info(self, name: str) -> t.Dict[str, t.Any]: + """Get detailed information about a backend. + + Args: + name: Name or alias of the backend + + Returns: + Dictionary with backend information + """ + backend_class = self.get_backend(name) + + # Resolve to primary name if it's an alias + primary_name = name + if name in self._aliases: + primary_name = self._aliases[name] + + # Get all aliases for this backend + aliases = [ + alias for alias, target in self._aliases.items() if target == primary_name + ] + + return { + "name": primary_name, + "class": backend_class, + "module": backend_class.__module__, + "aliases": aliases, + "doc": backend_class.__doc__ or "No documentation available", + } + + def list_backend_info(self) -> t.List[t.Dict[str, t.Any]]: + """List detailed information about all backends. + + Returns: + List of dictionaries with backend information + """ + if not self._discovered: + self.discover_backends() + + return [self.get_backend_info(name) for name in self._backends.keys()] + + def clear(self) -> None: + """Clear all registered backends. Mainly for testing.""" + self._backends.clear() + self._aliases.clear() + self._discovered = False + + def create_backend( + self, backend_type: str, config: t.Optional["BackendConfig"] = None, **kwargs + ) -> ProjectBackend: + """Create a backend instance. 
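+
+        Either pass a pre-built config object or plain keyword arguments;
+        kwargs are routed through the backend's config class. A minimal
+        sketch (the root_dir value is illustrative):
+
+            registry = BackendRegistry.instance()
+            backend = registry.create_backend("local/csv", root_dir="./ragas_data")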
+ + Args: + backend_type: The type of backend to create + config: Pre-built configuration object (preferred) + **kwargs: Arguments specific to the backend + + Returns: + ProjectBackend: An instance of the requested backend + """ + backend_class = self.get_backend(backend_type) + + if config is not None: + return backend_class(config) + + # Create config from kwargs using the appropriate config class + config_class = _get_config_class(backend_type) + config_instance = config_class(**kwargs) + return backend_class(config_instance) + + +# Global registry instance +_registry = BackendRegistry.instance() + + +def get_registry() -> BackendRegistry: + """Get the global backend registry instance.""" + return _registry + + +def register_backend( + name: str, + backend_class: t.Type[ProjectBackend], + aliases: t.Optional[t.List[str]] = None, +) -> None: + """Register a backend with the global registry. + + Args: + name: Primary name for the backend + backend_class: The backend class to register + aliases: Optional list of alternative names for the backend + """ + _registry.register_backend(name, backend_class, aliases) + + +def list_backends() -> t.List[str]: + """List all available backend names.""" + return _registry.list_available_backends() + + +def get_backend_info(name: str) -> t.Dict[str, t.Any]: + """Get detailed information about a specific backend.""" + return _registry.get_backend_info(name) + + +def list_backend_info() -> t.List[t.Dict[str, t.Any]]: + """List detailed information about all available backends.""" + return _registry.list_backend_info() + + +def print_available_backends() -> None: + """Print a formatted list of available backends.""" + backends = _registry.list_backend_info() + + if not backends: + print("No backends available.") + return + + print("Available backends:") + print("-" * 50) + + for backend in backends: + print(f"Name: {backend['name']}") + if backend["aliases"]: + print(f"Aliases: {', '.join(backend['aliases'])}") + print(f"Module: {backend['module']}") + print(f"Description: {backend['doc']}") + print("-" * 50) + + +def _get_config_class(backend_type: str) -> t.Type["BackendConfig"]: + """Get configuration class for backend type.""" + from .config import LocalCSVConfig, RagasAppConfig + + try: + from .config import BoxCSVConfig + except ImportError: + BoxCSVConfig = None + + config_mapping = { + "local/csv": LocalCSVConfig, + "ragas/app": RagasAppConfig, + } + + if BoxCSVConfig is not None: + config_mapping["box/csv"] = BoxCSVConfig + + if backend_type not in config_mapping: + available = list(config_mapping.keys()) + raise ValueError( + f"Unknown backend type '{backend_type}'. Available: {available}" + ) + + return config_mapping[backend_type] + + +def create_project_backend( + backend_type: str, config: t.Optional["BackendConfig"] = None, **kwargs +) -> ProjectBackend: + """Create a project backend instance with structured configuration. 
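+
+    A minimal sketch of both call styles (the root_dir value is illustrative):
+
+        backend = create_project_backend("local/csv", root_dir="./ragas_data")
+
+        from ragas_experimental.backends.config import LocalCSVConfig
+        backend = create_project_backend(
+            "local/csv", config=LocalCSVConfig(root_dir="./ragas_data")
+        )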
+ + Args: + backend_type: The type of backend to create ("local/csv", "box/csv", "ragas/app") + config: Pre-built configuration object (preferred) + **kwargs: Configuration parameters (alternative to config object) + + Returns: + ProjectBackend: An instance of the requested backend + """ + backend_class = _registry.get_backend(backend_type) + + if config is not None: + # Use provided config object + return backend_class(config) + + # Create config from kwargs + config_class = _get_config_class(backend_type) + config_instance = config_class(**kwargs) + + return backend_class(config_instance) diff --git a/experimental/ragas_experimental/backends/utils.py b/experimental/ragas_experimental/backends/utils.py new file mode 100644 index 000000000..63cebf72e --- /dev/null +++ b/experimental/ragas_experimental/backends/utils.py @@ -0,0 +1,237 @@ +"""Shared utilities for project module.""" + +import random +import string +import uuid + + +def create_nano_id(size=12): + """Create a short, URL-safe unique identifier.""" + # Define characters to use (alphanumeric) + alphabet = string.ascii_letters + string.digits + + # Generate UUID and convert to int + uuid_int = uuid.uuid4().int + + # Convert to base62 + result = "" + while uuid_int: + uuid_int, remainder = divmod(uuid_int, len(alphabet)) + result = alphabet[remainder] + result + + # Pad if necessary and return desired length + return result[:size] + + +class MemorableNames: + """Generator for memorable, unique names for experiments and datasets.""" + + def __init__(self): + # List of adjectives (similar to what Docker uses) + self.adjectives = [ + "admiring", + "adoring", + "affectionate", + "agitated", + "amazing", + "angry", + "awesome", + "blissful", + "bold", + "boring", + "brave", + "busy", + "charming", + "clever", + "cool", + "compassionate", + "competent", + "condescending", + "confident", + "cranky", + "crazy", + "dazzling", + "determined", + "distracted", + "dreamy", + "eager", + "ecstatic", + "elastic", + "elated", + "elegant", + "eloquent", + "epic", + "fervent", + "festive", + "flamboyant", + "focused", + "friendly", + "frosty", + "gallant", + "gifted", + "goofy", + "gracious", + "happy", + "hardcore", + "heuristic", + "hopeful", + "hungry", + "infallible", + "inspiring", + "jolly", + "jovial", + "keen", + "kind", + "laughing", + "loving", + "lucid", + "magical", + "mystifying", + "modest", + "musing", + "naughty", + "nervous", + "nifty", + "nostalgic", + "objective", + "optimistic", + "peaceful", + "pedantic", + "pensive", + "practical", + "priceless", + "quirky", + "quizzical", + "relaxed", + "reverent", + "romantic", + "sad", + "serene", + "sharp", + "silly", + "sleepy", + "stoic", + "stupefied", + "suspicious", + "sweet", + "tender", + "thirsty", + "trusting", + "upbeat", + "vibrant", + "vigilant", + "vigorous", + "wizardly", + "wonderful", + "xenodochial", + "youthful", + "zealous", + "zen", + ] + + # List of influential computer scientists and tech entrepreneurs + self.scientists = [ + "turing", + "hopper", + "knuth", + "torvalds", + "ritchie", + "thompson", + "dijkstra", + "kay", + "wozniak", + "gates", + "jobs", + "musk", + "bezos", + "lovelace", + "berners_lee", + "cerf", + "gosling", + "kernighan", + "lamport", + "mccarthy", + "minsky", + "rossum", + "backus", + "engelbart", + "hamilton", + "chomsky", + "shannon", + "zuckerberg", + "page", + "brin", + "matsumoto", + "stallman", + "stroustrup", + "cook", + "neumann", + "babbage", + "tanenbaum", + "rivest", + "shamir", + "adleman", + "carmack", + "andreessen", + "ullman", + 
"postel", + "huffman", + "boole", + "curry", + "liskov", + "wing", + "goldwasser", + "hoare", + "milner", + "perlis", + "sutherland", + "tarjan", + "valiant", + "yao", + "hopcroft", + "naur", + "wilkes", + "codd", + "diffie", + "hellman", + "pearl", + "thiel", + "narayen", + "nadella", + "pichai", + "dorsey", + ] + + self.used_names = set() + + def generate_name(self): + """Generate a single memorable name.""" + adjective = random.choice(self.adjectives) + scientist = random.choice(self.scientists) + return f"{adjective}_{scientist}" + + def generate_unique_name(self): + """Generate a unique memorable name.""" + attempts = 0 + max_attempts = 100 # Prevent infinite loops + + while attempts < max_attempts: + name = self.generate_name() + if name not in self.used_names: + self.used_names.add(name) + return name + attempts += 1 + + # If we exhaust our combinations, add a random suffix + base_name = self.generate_name() + unique_name = f"{base_name}_{random.randint(1000, 9999)}" + self.used_names.add(unique_name) + return unique_name + + def generate_unique_names(self, count): + """Generate multiple unique memorable names.""" + return [self.generate_unique_name() for _ in range(count)] + + +# Global instance for easy access +memorable_names = MemorableNames() diff --git a/experimental/ragas_experimental/dataset.py b/experimental/ragas_experimental/dataset.py index e36cd21cc..fcce1eeef 100644 --- a/experimental/ragas_experimental/dataset.py +++ b/experimental/ragas_experimental/dataset.py @@ -2,10 +2,12 @@ __all__ = [ "BaseModelType", + "DataTable", "Dataset", ] import typing as t +from typing import overload, Literal try: import pandas as pd @@ -16,23 +18,202 @@ ExtendedPydanticBaseModel as BaseModel, ) -from .backends.ragas_api_client import RagasApiClient -from .project.backends import ( - LocalCSVProjectBackend, - PlatformProjectBackend, -) -from .typing import SUPPORTED_BACKENDS +from .backends import DataTableBackend + +# Type-only imports +if t.TYPE_CHECKING: + from .project.core import Project BaseModelType = t.TypeVar("BaseModelType", bound=BaseModel) -class Dataset(t.Generic[BaseModelType]): - """A list-like interface for managing dataset entries with backend synchronization. +class DataTable(t.Generic[BaseModelType]): + """A list-like interface for managing datatable entries with backend synchronization. This class behaves like a Python list while synchronizing operations with the - chosen backend (Ragas API or local filesystem). + chosen backend (Ragas API or local filesystem). Base class for Dataset and Experiment. """ + # Type-safe overloads for dataset creation + @overload + @classmethod + def create( + cls, + name: str, + model: t.Type[BaseModel], + project: "Project", + dataset_type: Literal["datasets"] = "datasets", + ) -> "DataTable[BaseModel]": ... + + @overload + @classmethod + def create( + cls, + name: str, + model: t.Type[BaseModel], + project: "Project", + dataset_type: Literal["experiments"], + ) -> "DataTable[BaseModel]": ... + + @classmethod + def create( + cls, + name: str, + model: t.Type[BaseModel], + project: "Project", + dataset_type: Literal["datasets", "experiments"] = "datasets", + ) -> "DataTable[BaseModel]": + """Create a new dataset with type-safe parameters. 
+ + Args: + name: Name of the dataset + model: Pydantic model class for entries + project: Project instance to create the dataset in + dataset_type: Type of dataset ("datasets" or "experiments") + + Returns: + Dataset: A new dataset instance + + Examples: + >>> # Create a dataset + >>> dataset = Dataset.create("my_data", MyModel, project) + + >>> # Create an experiment + >>> experiment = Dataset.create("my_experiment", MyModel, project, "experiments") + """ + # Use the project's backend to create the dataset + if dataset_type == "datasets": + dataset_id = project._backend.create_dataset(name, model) + backend = project._backend.get_dataset_backend(dataset_id, name, model) + else: # experiments + dataset_id = project._backend.create_experiment(name, model) + backend = project._backend.get_experiment_backend(dataset_id, name, model) + + # Create the dataset with the simplified constructor + return cls._create_with_backend( + name=name, + model=model, + project_id=project.project_id, + dataset_id=dataset_id, + datatable_type=dataset_type, + backend=backend, + ) + + # Type-safe overloads for getting existing datasets + @overload + @classmethod + def get_dataset( + cls, + name: str, + model: t.Type[BaseModel], + project: "Project", + dataset_type: Literal["datasets"] = "datasets", + ) -> "DataTable[BaseModel]": ... + + @overload + @classmethod + def get_dataset( + cls, + name: str, + model: t.Type[BaseModel], + project: "Project", + dataset_type: Literal["experiments"], + ) -> "DataTable[BaseModel]": ... + + @classmethod + def get_dataset( + cls, + name: str, + model: t.Type[BaseModel], + project: "Project", + dataset_type: Literal["datasets", "experiments"] = "datasets", + ) -> "DataTable[BaseModel]": + """Get an existing dataset by name with type-safe parameters. + + Args: + name: Name of the dataset to retrieve + model: Pydantic model class for entries + project: Project instance containing the dataset + dataset_type: Type of dataset ("datasets" or "experiments") + + Returns: + Dataset: The existing dataset instance + + Examples: + >>> # Get a dataset + >>> dataset = Dataset.get_dataset("my_data", MyModel, project) + + >>> # Get an experiment + >>> experiment = Dataset.get_dataset("my_experiment", MyModel, project, "experiments") + """ + # Use the project's backend to get the dataset + if dataset_type == "datasets": + dataset_id, _ = project._backend.get_dataset_by_name(name, model) + backend = project._backend.get_dataset_backend(dataset_id, name, model) + else: # experiments + dataset_id, _ = project._backend.get_experiment_by_name(name, model) + backend = project._backend.get_experiment_backend(dataset_id, name, model) + + # Create the dataset with the simplified constructor + return cls._create_with_backend( + name=name, + model=model, + project_id=project.project_id, + dataset_id=dataset_id, + datatable_type=dataset_type, + backend=backend, + ) + + @classmethod + def _create_with_backend( + cls, + name: str, + model: t.Type[BaseModel], + project_id: str, + dataset_id: str, + datatable_type: t.Literal["datasets", "experiments"], + backend: DataTableBackend, + ) -> "DataTable[BaseModel]": + """Internal helper to create a dataset with a backend instance. 
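+
+        Note: this bypasses ``__init__`` via ``cls.__new__`` so that
+        subclasses with different constructor signatures (e.g. ``Experiment``)
+        can share a single construction path.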
+ + Args: + name: Dataset name + model: Pydantic model class + project_id: Project ID + dataset_id: Dataset ID + datatable_type: Dataset or experiment type + backend: Backend instance + + Returns: + DataTable: New datatable instance + """ + # Create the instance without calling __init__ + instance = cls.__new__(cls) + + # Set basic properties + instance.name = name + instance.model = model + instance.project_id = project_id + instance.dataset_id = dataset_id + instance.backend_type = getattr(backend, "backend_type", "unknown") + instance.datatable_type = datatable_type + instance._entries = [] + instance._backend = backend + + # Initialize the backend with this dataset + instance._backend.initialize(instance) + + # Initialize column mapping if it doesn't exist yet + if not hasattr(instance.model, "__column_mapping__"): + instance.model.__column_mapping__ = {} + + # Get column mappings from backend and update the model's mapping + column_mapping = instance._backend.get_column_mapping(model) + for field_name, column_id in column_mapping.items(): + instance.model.__column_mapping__[field_name] = column_id + + return instance + def __init__( self, name: str, @@ -40,11 +221,12 @@ def __init__( project_id: str, dataset_id: str, datatable_type: t.Literal["datasets", "experiments"], - ragas_api_client: t.Optional[RagasApiClient] = None, - backend: SUPPORTED_BACKENDS = "local/csv", - local_root_dir: t.Optional[str] = None, + backend: DataTableBackend, ): - """Initialize a Dataset with the specified backend. + """Initialize a Dataset with a backend instance. + + Note: This constructor is primarily for internal use. + For new code, prefer using Dataset.create() or Dataset.get() class methods. Args: name: The name of the dataset @@ -52,55 +234,17 @@ def __init__( project_id: The ID of the parent project dataset_id: The ID of this dataset datatable_type: Whether this is for "datasets" or "experiments" - ragas_api_client: Required for ragas/app backend - backend: The storage backend to use (ragas/app or local/csv) - local_root_dir: Required for local backend + backend: The backend instance to use """ # Store basic properties self.name = name self.model = model self.project_id = project_id self.dataset_id = dataset_id - self.backend_type = backend + self.backend_type = getattr(backend, "backend_type", "unknown") self.datatable_type = datatable_type self._entries: t.List[BaseModelType] = [] - - # Create the appropriate backend using the project backend system - if backend == "ragas/app": - if ragas_api_client is None: - raise ValueError("ragas_api_client is required for ragas/app backend") - - # Create a platform project backend and get dataset backend from it - project_backend = PlatformProjectBackend(ragas_api_client) - project_backend.initialize(project_id) - - if datatable_type == "datasets": - self._backend = project_backend.get_dataset_backend( - dataset_id, name, model - ) - else: # experiments - self._backend = project_backend.get_experiment_backend( - dataset_id, name, model - ) - - elif backend == "local/csv": - if local_root_dir is None: - raise ValueError("local_root_dir is required for local/csv backend") - - # Create a local CSV project backend and get dataset backend from it - project_backend = LocalCSVProjectBackend(local_root_dir) - project_backend.initialize(project_id) - - if datatable_type == "datasets": - self._backend = project_backend.get_dataset_backend( - dataset_id, name, model - ) - else: # experiments - self._backend = project_backend.get_experiment_backend( - dataset_id, name, 
model - ) - else: - raise ValueError(f"Unsupported backend: {backend}") + self._backend = backend # Initialize the backend with this dataset self._backend.initialize(self) @@ -334,3 +478,13 @@ def get( return self._backend.get_entry_by_field(field_name, field_value, self.model) return None + + +class Dataset(DataTable[BaseModelType]): + """Dataset class for managing dataset entries. + + Inherits all functionality from DataTable. This class represents + datasets specifically (as opposed to experiments). + """ + + pass diff --git a/experimental/ragas_experimental/experiment.py b/experimental/ragas_experimental/experiment.py index 7fc54eb21..19451a097 100644 --- a/experimental/ragas_experimental/experiment.py +++ b/experimental/ragas_experimental/experiment.py @@ -8,20 +8,17 @@ ExtendedPydanticBaseModel as BaseModel, ) -from .backends.ragas_api_client import RagasApiClient -from .dataset import Dataset +from .dataset import DataTable -class Experiment(Dataset): +class Experiment(DataTable): def __init__( self, name: str, model: t.Type[BaseModel], project_id: str, experiment_id: str, - ragas_api_client: t.Optional[RagasApiClient] = None, - backend: t.Literal["ragas/app", "local/csv"] = "ragas/app", - local_root_dir: t.Optional[str] = None, + backend, # DataTableBackend instance ): self.experiment_id = experiment_id super().__init__( @@ -29,10 +26,8 @@ def __init__( model=model, project_id=project_id, dataset_id=experiment_id, - ragas_api_client=ragas_api_client, - backend=backend, - local_root_dir=local_root_dir, datatable_type="experiments", + backend=backend, ) def __str__(self): diff --git a/experimental/ragas_experimental/project/__init__.py b/experimental/ragas_experimental/project/__init__.py index 483221e10..29b1cdaa5 100644 --- a/experimental/ragas_experimental/project/__init__.py +++ b/experimental/ragas_experimental/project/__init__.py @@ -4,8 +4,8 @@ multiple backend storage options including local CSV files and the Ragas app. """ -from .backends import ( - DatasetBackend, +from ..backends import ( + DataTableBackend, ProjectBackend, create_project_backend, list_backends, @@ -17,60 +17,13 @@ __all__ = [ "Project", - "create_project", - "get_project", "MemorableNames", "memorable_names", "create_nano_id", "ProjectBackend", - "DatasetBackend", + "DataTableBackend", "create_project_backend", "list_backends", "print_available_backends", "register_backend", ] - - -def create_project( - name: str, description: str = "", backend: str = "local/csv", **kwargs -) -> Project: - """Create a new project with the specified backend. - - Args: - name: Name of the project - description: Description of the project - backend: Backend type ("local/csv" or "ragas/app") - **kwargs: Additional backend-specific arguments - - Returns: - Project: A new project instance - - Examples: - >>> # Create a local project - >>> project = create_project("my_project", backend="local/csv", root_dir="/path/to/projects") - - >>> # Create a ragas/app project - >>> project = create_project("my_project", backend="ragas/app", ragas_api_client=client) - """ - return Project.create(name=name, description=description, backend=backend, **kwargs) - - -def get_project(name: str, backend: str = "local/csv", **kwargs) -> Project: - """Get an existing project by name. 
- - Args: - name: Name of the project to retrieve - backend: Backend type ("local/csv" or "ragas/app") - **kwargs: Additional backend-specific arguments - - Returns: - Project: The existing project instance - - Examples: - >>> # Get a local project - >>> project = get_project("my_project", backend="local/csv", root_dir="/path/to/projects") - - >>> # Get a ragas/app project - >>> project = get_project("my_project", backend="ragas/app", ragas_api_client=client) - """ - return Project.get(name=name, backend=backend, **kwargs) diff --git a/experimental/ragas_experimental/project/core.py b/experimental/ragas_experimental/project/core.py index e2c0ae114..c1c7c4fff 100644 --- a/experimental/ragas_experimental/project/core.py +++ b/experimental/ragas_experimental/project/core.py @@ -5,22 +5,34 @@ import os import shutil import typing as t +from typing import overload, Literal, Optional -import ragas_experimental.typing as rt from ragas_experimental.model.pydantic_model import ( ExtendedPydanticBaseModel as BaseModel, ) -from ..backends.factory import RagasApiClientFactory -from ..backends.ragas_api_client import RagasApiClient from ..dataset import Dataset from ..experiment import Experiment -from ..utils import async_to_sync -from .backends import ProjectBackend -from .backends.local_csv import LocalCSVProjectBackend -from .backends.platform import PlatformProjectBackend +from ..backends import ProjectBackend, create_project_backend from .decorators import add_experiment_decorators +# Type-only imports for Box client protocol +if t.TYPE_CHECKING: + from ..backends.config import BoxClientProtocol + from ..backends.ragas_api_client import RagasApiClient + from ..backends.local_csv import LocalCSVProjectBackend + from ..backends.ragas_app import RagasAppProjectBackend +else: + # Runtime imports for isinstance checks + try: + from ..backends.local_csv import LocalCSVProjectBackend + except ImportError: + LocalCSVProjectBackend = None + try: + from ..backends.ragas_app import RagasAppProjectBackend + except ImportError: + RagasAppProjectBackend = None + class Project: """Represents an AI project for managing datasets and experiments.""" @@ -51,122 +63,189 @@ def __init__( # Add experiment decorator methods add_experiment_decorators(self) + # Type-safe overloads for different backend types + @overload + @classmethod + def create( + cls, + name: str, + backend_type: Literal["local/csv"], + *, + description: str = "", + root_dir: str = "./ragas_data", + ) -> "Project": ... + + @overload + @classmethod + def create( + cls, + name: str, + backend_type: Literal["ragas/app"], + *, + description: str = "", + api_key: Optional[str] = None, + api_url: str = "https://api.ragas.io", + timeout: int = 30, + max_retries: int = 3, + ragas_api_client: Optional["RagasApiClient"] = None, + ) -> "Project": ... + + @overload @classmethod def create( cls, name: str, + backend_type: Literal["box/csv"], + *, description: str = "", - backend: rt.SUPPORTED_BACKENDS = "local/csv", - root_dir: t.Optional[str] = None, - ragas_api_client: t.Optional[RagasApiClient] = None, + client: "BoxClientProtocol", + root_folder_id: str = "0", + ) -> "Project": ... + + @classmethod + def create( + cls, name: str, backend_type: str, *, description: str = "", **kwargs ) -> "Project": - """Create a new project. + """Create a new project with the specified backend. 
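+
+        Keyword arguments are validated by the selected backend's config
+        class (e.g. root_dir for "local/csv", api_key for "ragas/app").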
Args: name: Name of the project + backend_type: Backend type ("local/csv", "ragas/app", or "box/csv") description: Description of the project - backend: Backend type ("local/csv" or "ragas/app") - root_dir: Root directory for local backends - ragas_api_client: API client for ragas/app backend + **kwargs: Backend-specific configuration parameters Returns: Project: A new project instance + + Examples: + >>> # Create a local project with type-safe parameters + >>> project = Project.create( + ... "my_project", + ... backend_type="local/csv", + ... root_dir="/path/to/projects" + ... ) + + >>> # Create a ragas/app project + >>> project = Project.create( + ... "my_project", + ... backend_type="ragas/app", + ... api_key="your_api_key" + ... ) + + >>> # Create a Box project + >>> project = Project.create( + ... "my_project", + ... backend_type="box/csv", + ... client=authenticated_box_client, + ... root_folder_id="123456" + ... ) """ - if backend == "ragas/app": - ragas_api_client = ragas_api_client or RagasApiClientFactory.create() - sync_version = async_to_sync(ragas_api_client.create_project) - new_project = sync_version(title=name, description=description) - - project_backend = PlatformProjectBackend(ragas_api_client) - return cls( - project_id=new_project["id"], - project_backend=project_backend, - name=new_project["title"], - description=new_project["description"], - ) - elif backend == "local/csv": - if root_dir is None: - raise ValueError("root_dir is required for local/csv backend") - - project_backend = LocalCSVProjectBackend(root_dir) - return cls( - project_id=name, # Use name as project_id for local - project_backend=project_backend, - name=name, - description=description, - ) - else: - raise ValueError(f"Unsupported backend: {backend}") + # Use the registry-based approach for backend creation + backend = create_project_backend(backend_type, **kwargs) + # Create and return the Project instance + return cls( + project_id=name, # Use name as project_id for simplicity + project_backend=backend, + name=name, + description=description, + ) + + # Type-safe overloads for get_project + @overload @classmethod def get( cls, name: str, - backend: rt.SUPPORTED_BACKENDS = "local/csv", - root_dir: t.Optional[str] = None, - ragas_api_client: t.Optional[RagasApiClient] = None, - ) -> "Project": + backend_type: Literal["local/csv"], + *, + root_dir: str = "./ragas_data", + ) -> "Project": ... + + @overload + @classmethod + def get( + cls, + name: str, + backend_type: Literal["ragas/app"], + *, + api_key: Optional[str] = None, + api_url: str = "https://api.ragas.io", + timeout: int = 30, + max_retries: int = 3, + ragas_api_client: Optional["RagasApiClient"] = None, + ) -> "Project": ... + + @overload + @classmethod + def get( + cls, + name: str, + backend_type: Literal["box/csv"], + *, + client: "BoxClientProtocol", + root_folder_id: str = "0", + ) -> "Project": ... + + @classmethod + def get(cls, name: str, backend_type: str, **kwargs) -> "Project": """Get an existing project by name. 
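+
+        Only the "local/csv" backend verifies existence eagerly; for remote
+        backends a missing project is typically only detected on first access.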
Args: - name: The name of the project to get - backend: The backend to use ("local/csv" or "ragas/app") - root_dir: The root directory for local backends - ragas_api_client: Optional custom Ragas API client + name: Name of the project to retrieve + backend_type: Backend type ("local/csv", "ragas/app", or "box/csv") + **kwargs: Backend-specific configuration parameters Returns: - Project: The project instance + Project: The existing project instance + + Examples: + >>> # Get a local project + >>> project = Project.get("my_project", backend_type="local/csv", root_dir="/path/to/projects") + + >>> # Get a ragas/app project + >>> project = Project.get("my_project", backend_type="ragas/app", api_key="your_api_key") + + >>> # Get a Box project + >>> project = Project.get("my_project", backend_type="box/csv", client=authenticated_box_client) """ - if backend == "ragas/app": - if ragas_api_client is None: - ragas_api_client = RagasApiClientFactory.create() - - # Get the project by name - sync_version = async_to_sync(ragas_api_client.get_project_by_name) - project_info = sync_version(project_name=name) - - project_backend = PlatformProjectBackend(ragas_api_client) - return cls( - project_id=project_info["id"], - project_backend=project_backend, - name=project_info["title"], - description=project_info["description"], - ) - elif backend == "local/csv": - if root_dir is None: - raise ValueError("root_dir is required for local/csv backend") - - # For local backend, check if project directory exists - project_path = os.path.join(root_dir, name) - if not os.path.exists(project_path): - raise ValueError( - f"Local project '{name}' does not exist at {project_path}" - ) - - project_backend = LocalCSVProjectBackend(root_dir) - return cls( - project_id=name, - project_backend=project_backend, - name=name, - description="", - ) - else: - raise ValueError(f"Unsupported backend: {backend}") + # Use the registry-based approach for backend creation + backend = create_project_backend(backend_type, **kwargs) + + # For local backend, check if project actually exists + if backend_type == "local/csv": + import os + + root_dir = kwargs.get("root_dir", "./ragas_data") + project_dir = os.path.join(root_dir, name) + if not os.path.exists(project_dir): + raise ValueError(f"Local project '{name}' does not exist in {root_dir}") + + # Get the existing project using the backend + return cls( + project_id=name, + project_backend=backend, + name=name, + description="", # Description will be loaded from backend if available + ) def delete(self): """Delete the project and all its data.""" - if isinstance(self._backend, PlatformProjectBackend): - sync_version = async_to_sync(self._backend.ragas_api_client.delete_project) - sync_version(project_id=self.project_id) - print("Project deleted from Ragas platform!") - elif isinstance(self._backend, LocalCSVProjectBackend): - # Caution: this deletes the entire project directory + # Check if backend has a delete method, otherwise handle basic deletion + if hasattr(self._backend, "delete_project"): + # Backend provides its own deletion logic + self._backend.delete_project(self.project_id) + print("Project deleted!") + elif hasattr(self._backend, "root_dir"): + # Local backend - delete project directory project_dir = os.path.join(self._backend.root_dir, self.project_id) if os.path.exists(project_dir): shutil.rmtree(project_dir) print(f"Local project at {project_dir} deleted!") else: print(f"Local project at {project_dir} does not exist") + else: + print("Project deletion not supported by this 
backend") # Dataset operations def create_dataset( @@ -186,24 +265,8 @@ def create_dataset( if name is None: name = model.__name__ - dataset_id = self._backend.create_dataset(name, model) - - backend_name = ( - "ragas/app" - if isinstance(self._backend, PlatformProjectBackend) - else "local/csv" - ) - - return Dataset( - name=name, - model=model, - project_id=self.project_id, - dataset_id=dataset_id, - datatable_type="datasets", - ragas_api_client=getattr(self._backend, "ragas_api_client", None), - backend=backend_name, - local_root_dir=getattr(self._backend, "root_dir", None), - ) + # Use the new Dataset.create() method for cleaner interface + return Dataset.create(name, model, self, "datasets") def get_dataset( self, @@ -219,26 +282,8 @@ def get_dataset( Returns: Dataset: The retrieved dataset """ - dataset_id, dataset_backend = self._backend.get_dataset_by_name( - dataset_name, model - ) - - backend_name = ( - "ragas/app" - if isinstance(self._backend, PlatformProjectBackend) - else "local/csv" - ) - - return Dataset( - name=dataset_name, - model=model, - project_id=self.project_id, - dataset_id=dataset_id, - datatable_type="datasets", - ragas_api_client=getattr(self._backend, "ragas_api_client", None), - backend=backend_name, - local_root_dir=getattr(self._backend, "root_dir", None), - ) + # Use the new Dataset.get_dataset() method for cleaner interface + return Dataset.get_dataset(dataset_name, model, self, "datasets") def list_datasets(self) -> t.List[str]: """List all datasets in the project. @@ -264,22 +309,17 @@ def create_experiment( Returns: Experiment: An experiment object for managing results """ + # Create experiment using backend experiment_id = self._backend.create_experiment(name, model) + backend = self._backend.get_experiment_backend(experiment_id, name, model) - backend_name = ( - "ragas/app" - if isinstance(self._backend, PlatformProjectBackend) - else "local/csv" - ) - + # Return Experiment object for better UX return Experiment( name=name, model=model, project_id=self.project_id, experiment_id=experiment_id, - ragas_api_client=getattr(self._backend, "ragas_api_client", None), - backend=backend_name, - local_root_dir=getattr(self._backend, "root_dir", None), + backend=backend, ) def get_experiment( @@ -296,24 +336,18 @@ def get_experiment( Returns: Experiment: The retrieved experiment """ + # Get experiment using backend experiment_id, experiment_backend = self._backend.get_experiment_by_name( experiment_name, model ) - backend_name = ( - "ragas/app" - if isinstance(self._backend, PlatformProjectBackend) - else "local/csv" - ) - + # Return Experiment object for better UX return Experiment( name=experiment_name, model=model, project_id=self.project_id, experiment_id=experiment_id, - ragas_api_client=getattr(self._backend, "ragas_api_client", None), - backend=backend_name, - local_root_dir=getattr(self._backend, "root_dir", None), + backend=experiment_backend, ) def list_experiments(self) -> t.List[str]: @@ -338,7 +372,9 @@ def get_dataset_path(self, dataset_name: str) -> str: Raises: ValueError: If not using local backend """ - if not isinstance(self._backend, LocalCSVProjectBackend): + if LocalCSVProjectBackend is None or not isinstance( + self._backend, LocalCSVProjectBackend + ): raise ValueError("This method is only available for local/csv backend") return os.path.join( self._backend._project_dir, "datasets", f"{dataset_name}.csv" @@ -356,7 +392,9 @@ def get_experiment_path(self, experiment_name: str) -> str: Raises: ValueError: If not using local backend """ - if 
not isinstance(self._backend, LocalCSVProjectBackend): + if LocalCSVProjectBackend is None or not isinstance( + self._backend, LocalCSVProjectBackend + ): raise ValueError("This method is only available for local/csv backend") return os.path.join( self._backend._project_dir, "experiments", f"{experiment_name}.csv" @@ -366,7 +404,8 @@ def __repr__(self) -> str: """String representation of the project.""" backend_name = ( "ragas/app" - if isinstance(self._backend, PlatformProjectBackend) + if RagasAppProjectBackend is not None + and isinstance(self._backend, RagasAppProjectBackend) else "local/csv" ) return f"Project(name='{self.name}', backend='{backend_name}')" diff --git a/experimental/ragas_experimental/typing.py b/experimental/ragas_experimental/typing.py index c7e2339a2..78045cb66 100644 --- a/experimental/ragas_experimental/typing.py +++ b/experimental/ragas_experimental/typing.py @@ -29,7 +29,7 @@ from .metric.result import MetricResult # Define supported backends -SUPPORTED_BACKENDS = t.Literal["ragas/app", "local/csv"] +SUPPORTED_BACKENDS = t.Literal["local/csv", "ragas/app", "box/csv"] class ColumnType(str, Enum): diff --git a/experimental/tests/backends/test_box_csv.py b/experimental/tests/backends/test_box_csv.py new file mode 100644 index 000000000..c6a5aae00 --- /dev/null +++ b/experimental/tests/backends/test_box_csv.py @@ -0,0 +1,513 @@ +"""Tests for Box CSV backend implementation.""" + +import io +import pytest +from unittest.mock import MagicMock, Mock, patch + +# Skip all tests if box dependencies not available + +try: + from ragas_experimental.backends.box_csv import ( + BoxCSVDataTableBackend, + BoxCSVProjectBackend, + ) + from ragas_experimental.backends.config import BoxCSVConfig + from boxsdk import BoxAPIException, Client + box_available = True +except ImportError: + box_available = False + +from ragas_experimental.model.pydantic_model import ( + ExtendedPydanticBaseModel as BaseModel, +) + + +# Test model for dataset entries +class TestEntry(BaseModel): + name: str + age: int + active: bool = True + + +@pytest.mark.skipif(not box_available, reason="Box SDK not available") +class TestBoxCSVDataTableBackend: + """Test BoxCSVDataTableBackend functionality.""" + + @pytest.fixture + def mock_box_client(self): + """Mock Box client for testing.""" + client = MagicMock(spec=Client) + + # Mock folder structure + mock_folder = MagicMock() + mock_folder.id = "test_folder_id" + mock_folder.get_items.return_value = [] + mock_folder.create_subfolder.return_value = mock_folder + mock_folder.upload_stream.return_value = MagicMock() + + client.folder.return_value = mock_folder + return client + + @pytest.fixture + def mock_dataset(self): + """Mock dataset for testing.""" + dataset = MagicMock() + dataset.model = TestEntry + dataset._entries = [] + return dataset + + @pytest.fixture + def backend(self, mock_box_client): + """Create backend instance for testing.""" + return BoxCSVDataTableBackend( + box_client=mock_box_client, + project_folder_id="project_123", + dataset_id="dataset_456", + dataset_name="test_dataset", + datatable_type="datasets", + ) + + def test_initialize(self, backend, mock_dataset, mock_box_client): + """Test backend initialization.""" + backend.initialize(mock_dataset) + + assert backend.dataset == mock_dataset + # Should call folder operations to ensure CSV exists + mock_box_client.folder.assert_called() + + def test_get_column_mapping(self, backend): + """Test column mapping retrieval.""" + mapping = backend.get_column_mapping(TestEntry) + assert mapping == 
TestEntry.model_fields + + @patch('ragas_experimental.backends.box_csv.csv') + def test_load_entries_empty_file(self, mock_csv, backend, mock_box_client): + """Test loading entries from empty CSV file.""" + # Mock empty CSV content + mock_file = MagicMock() + mock_file.content.return_value = b"_row_id,name,age,active\n" + backend._csv_file = mock_file + + mock_csv.DictReader.return_value = [] + + entries = backend.load_entries(TestEntry) + assert entries == [] + + @patch('ragas_experimental.backends.box_csv.csv') + def test_load_entries_with_data(self, mock_csv, backend, mock_box_client): + """Test loading entries from CSV with data.""" + # Mock CSV content with data + mock_file = MagicMock() + csv_content = "_row_id,name,age,active\nrow1,John,30,true\nrow2,Jane,25,false\n" + mock_file.content.return_value = csv_content.encode() + backend._csv_file = mock_file + + # Mock CSV reader + mock_reader = [ + {"_row_id": "row1", "name": "John", "age": "30", "active": "true"}, + {"_row_id": "row2", "name": "Jane", "age": "25", "active": "false"} + ] + mock_csv.DictReader.return_value = mock_reader + + entries = backend.load_entries(TestEntry) + + assert len(entries) == 2 + # Note: This test would need actual TestEntry instances to verify properly + # The mock would need to be more sophisticated to test type conversion + + def test_append_entry(self, backend, mock_box_client): + """Test appending new entry to CSV.""" + # Mock existing entries + backend.load_entries = MagicMock(return_value=[]) + backend._write_entries_to_box = MagicMock() + + entry = TestEntry(name="Alice", age=28) + row_id = backend.append_entry(entry) + + assert row_id is not None + assert hasattr(entry, "_row_id") + backend._write_entries_to_box.assert_called_once() + + def test_update_entry(self, backend): + """Test updating existing entry.""" + # Mock existing entries + existing_entry = TestEntry(name="Bob", age=35) + existing_entry._row_id = "test_id" + backend.load_entries = MagicMock(return_value=[existing_entry]) + backend._write_entries_to_box = MagicMock() + + # Update entry + updated_entry = TestEntry(name="Robert", age=36) + updated_entry._row_id = "test_id" + + result = backend.update_entry(updated_entry) + + assert result is True + backend._write_entries_to_box.assert_called_once() + + def test_delete_entry(self, backend, mock_dataset): + """Test deleting entry from CSV.""" + # Mock dataset entries + entry1 = TestEntry(name="Carol", age=40) + entry1._row_id = "keep_id" + entry2 = TestEntry(name="Dave", age=45) + entry2._row_id = "delete_id" + + mock_dataset._entries = [entry1, entry2] + backend.dataset = mock_dataset + backend._write_entries_to_box = MagicMock() + + result = backend.delete_entry("delete_id") + + assert result is True + backend._write_entries_to_box.assert_called_once() + + def test_get_entry_by_field(self, backend): + """Test finding entry by field value.""" + # Mock entries + entries = [ + TestEntry(name="Eve", age=50), + TestEntry(name="Frank", age=55), + ] + backend.load_entries = MagicMock(return_value=entries) + + found_entry = backend.get_entry_by_field("name", "Eve", TestEntry) + assert found_entry.name == "Eve" + + not_found = backend.get_entry_by_field("name", "Unknown", TestEntry) + assert not_found is None + + +@pytest.mark.skipif(not box_available, reason="Box SDK not available") +class TestBoxCSVProjectBackend: + """Test BoxCSVProjectBackend functionality.""" + + @pytest.fixture + def mock_client_config(self): + """Mock Box client configuration for testing.""" + mock_client = 
MagicMock(spec=Client) + # Mock successful authentication verification + mock_user = MagicMock() + mock_user.name = "Test User" + mock_client.user().get.return_value = mock_user + + return BoxCSVConfig(client=mock_client) + + @pytest.fixture + def mock_box_client_for_backend(self): + """Mock Box client for backend testing.""" + mock_client = MagicMock(spec=Client) + # Mock successful authentication verification + mock_user = MagicMock() + mock_user.name = "Test User" + mock_client.user().get.return_value = mock_user + return mock_client + + def test_backend_with_authenticated_client(self, mock_box_client_for_backend): + """Test creating backend with authenticated client.""" + config = BoxCSVConfig(client=mock_box_client_for_backend) + backend = BoxCSVProjectBackend(config) + + assert backend.box_client == mock_box_client_for_backend + assert backend.config.client == mock_box_client_for_backend + + def test_client_verification_success(self, mock_box_client_for_backend): + """Test that client verification works with valid client.""" + # This should not raise an exception + config = BoxCSVConfig(client=mock_box_client_for_backend) + assert config._authenticated_user == "Test User" + + def test_client_verification_failure(self): + """Test that client verification fails with invalid client.""" + mock_client = MagicMock(spec=Client) + # Make the user().get() call fail to simulate auth failure + mock_client.user().get.side_effect = Exception("Authentication failed") + + with pytest.raises(ValueError, match="Box client authentication failed"): + BoxCSVConfig(client=mock_client) + + def test_initialize(self, mock_client_config): + """Test backend initialization.""" + backend = BoxCSVProjectBackend(mock_client_config) + + # Mock folder operations + mock_root_folder = MagicMock() + mock_project_folder = MagicMock() + mock_project_folder.id = "project_folder_id" + + backend.box_client.folder.return_value = mock_root_folder + mock_root_folder.get_items.return_value = [] + mock_root_folder.create_subfolder.return_value = mock_project_folder + mock_project_folder.get_items.return_value = [] + mock_project_folder.create_subfolder.return_value = MagicMock() + + backend.initialize("test_project") + + assert backend.project_id == "test_project" + assert backend.project_folder == mock_project_folder + + def test_create_dataset(self, mock_client_config): + """Test dataset creation.""" + backend = BoxCSVProjectBackend(mock_client_config) + dataset_id = backend.create_dataset("test_dataset", TestEntry) + + assert dataset_id is not None + assert isinstance(dataset_id, str) + + def test_create_experiment(self, mock_client_config): + """Test experiment creation.""" + backend = BoxCSVProjectBackend(mock_client_config) + experiment_id = backend.create_experiment("test_experiment", TestEntry) + + assert experiment_id is not None + assert isinstance(experiment_id, str) + + def test_list_datasets(self, mock_client_config): + """Test listing datasets.""" + backend = BoxCSVProjectBackend(mock_client_config) + + mock_project_folder = MagicMock() + backend.project_folder = mock_project_folder + + # Mock datasets folder with CSV files + mock_datasets_folder = MagicMock() + mock_csv_file = MagicMock() + mock_csv_file.type = "file" + mock_csv_file.name = "dataset1.csv" + + # Mock the item returned from project folder + mock_folder_item = MagicMock() + mock_folder_item.type = "folder" + mock_folder_item.name = "datasets" + mock_folder_item.id = "datasets_id" + + mock_project_folder.get_items.return_value = [mock_folder_item] + 
backend.box_client.folder.return_value = mock_datasets_folder + mock_datasets_folder.get_items.return_value = [mock_csv_file] + + datasets = backend.list_datasets() + + assert len(datasets) == 1 + assert datasets[0]["name"] == "dataset1" + + def test_get_dataset_backend(self, mock_client_config): + """Test getting dataset backend instance.""" + backend = BoxCSVProjectBackend(mock_client_config) + backend.project_folder = MagicMock() + backend.project_folder.object_id = "project_id" + + dataset_backend = backend.get_dataset_backend("ds_123", "test_dataset", TestEntry) + + assert isinstance(dataset_backend, BoxCSVDataTableBackend) + assert dataset_backend.dataset_name == "test_dataset" + assert dataset_backend.datatable_type == "datasets" + + @patch('boxsdk.auth.jwt_auth.JWTAuth') + @patch('boxsdk.Client') + def test_from_jwt_file_factory(self, mock_client_class, mock_jwt_auth): + """Test JWT file factory method.""" + mock_auth = MagicMock() + mock_jwt_auth.from_settings_file.return_value = mock_auth + mock_client = MagicMock() + mock_client_class.return_value = mock_client + + # Mock successful authentication verification + mock_user = MagicMock() + mock_user.name = "Test User" + mock_client.user().get.return_value = mock_user + + backend = BoxCSVProjectBackend.from_jwt_file("config.json") + + mock_jwt_auth.from_settings_file.assert_called_once_with("config.json") + mock_client_class.assert_called_once_with(mock_auth) + assert backend.config.client == mock_client + + @patch('boxsdk.auth.oauth2.OAuth2') + @patch('boxsdk.Client') + def test_from_developer_token_factory(self, mock_client_class, mock_oauth2): + """Test developer token factory method.""" + mock_auth = MagicMock() + mock_oauth2.return_value = mock_auth + mock_client = MagicMock() + mock_client_class.return_value = mock_client + + # Mock successful authentication verification + mock_user = MagicMock() + mock_user.name = "Test User" + mock_client.user().get.return_value = mock_user + + backend = BoxCSVProjectBackend.from_developer_token("test_token") + + mock_oauth2.assert_called_once() + mock_client_class.assert_called_once_with(mock_auth) + assert backend.config.client == mock_client + + @patch('boxsdk.auth.oauth2.OAuth2') + @patch('boxsdk.Client') + def test_from_oauth2_factory(self, mock_client_class, mock_oauth2): + """Test OAuth2 factory method.""" + mock_auth = MagicMock() + mock_oauth2.return_value = mock_auth + mock_client = MagicMock() + mock_client_class.return_value = mock_client + + # Mock successful authentication verification + mock_user = MagicMock() + mock_user.name = "Test User" + mock_client.user().get.return_value = mock_user + + backend = BoxCSVProjectBackend.from_oauth2( + client_id="test_id", + client_secret="test_secret", + access_token="test_token" + ) + + mock_oauth2.assert_called_once_with( + client_id="test_id", + client_secret="test_secret", + access_token="test_token", + refresh_token=None + ) + mock_client_class.assert_called_once_with(mock_auth) + assert backend.config.client == mock_client + + +@pytest.mark.skipif(not box_available, reason="Box SDK not available") +class TestBoxCSVIntegration: + """Integration tests using VCR.py for Box API interactions.""" + + @pytest.mark.vcr() + def test_authentication_flow(self): + """Test Box authentication flow (recorded).""" + # This test would require actual Box credentials for initial recording + # For now, it serves as a placeholder for VCR integration + pass + + @pytest.mark.vcr() + def test_create_project_structure(self): + """Test creating project 
folder structure on Box (recorded).""" + # This would test the actual API calls for folder creation + pass + + @pytest.mark.vcr() + def test_upload_download_csv(self): + """Test uploading and downloading CSV files (recorded).""" + # This would test the actual file upload/download operations + pass + + @pytest.mark.vcr() + def test_error_handling_network_failure(self): + """Test error handling for network failures (recorded).""" + # This would test how the backend handles various API errors + pass + + +# Example VCR configuration for Box API testing +@pytest.fixture(scope="module") +def vcr_config(): + """VCR configuration for Box API tests.""" + return { + # Sanitize sensitive data in cassettes + "filter_headers": [ + ("authorization", "Bearer [REDACTED]"), + ("box-device-id", "[REDACTED]"), + ], + "filter_query_parameters": [ + ("access_token", "[REDACTED]"), + ], + # Record new interactions only once + "record_mode": "once", + # Match requests by method, uri, and body + "match_on": ["method", "uri", "body"], + # Cassette naming + "cassette_library_dir": "tests/cassettes/box", + "path_transformer": lambda path: path + ".yaml", + } + + +# Mock Box API responses for testing without VCR +@pytest.fixture +def mock_box_responses(): + """Mock Box API responses for comprehensive testing.""" + return { + "auth_success": { + "access_token": "mock_token", + "token_type": "bearer", + "expires_in": 3600, + }, + "folder_create": { + "type": "folder", + "id": "123456789", + "name": "test_folder", + }, + "file_upload": { + "type": "file", + "id": "987654321", + "name": "test.csv", + }, + "file_content": "name,age,active\nJohn,30,true\n", + } + + +@pytest.mark.skipif(not box_available, reason="Box SDK not available") +def test_backend_registration(): + """Test that Box backend is properly registered.""" + from ragas_experimental.backends.registry import get_registry + + registry = get_registry() + available_backends = registry.list_available_backends() + + # Check if box/csv backend is discoverable + # Note: This might fail in test environment without proper entry point setup + if "box/csv" in available_backends: + backend_info = registry.get_backend_info("box/csv") + assert "BoxCSVProjectBackend" in str(backend_info["class"]) + + +# Performance and stress tests +@pytest.mark.skipif(not box_available, reason="Box SDK not available") +class TestBoxCSVPerformance: + """Performance tests for Box CSV backend.""" + + def test_large_dataset_operations(self): + """Test operations with large datasets.""" + # This would test performance with many entries + pass + + def test_concurrent_operations(self): + """Test concurrent read/write operations.""" + # This would test thread safety and concurrent access + pass + + def test_memory_usage_streaming(self): + """Test memory usage during streaming operations.""" + # This would verify that streaming is used for large files + pass + + +# Error scenarios and edge cases +@pytest.mark.skipif(not box_available, reason="Box SDK not available") +class TestBoxCSVErrorHandling: + """Test error handling and edge cases.""" + + def test_invalid_client(self): + """Test handling of invalid Box client.""" + # This should raise Pydantic validation error + from pydantic import ValidationError + with pytest.raises(ValidationError): + BoxCSVConfig(client=None) + + def test_network_timeout(self): + """Test handling of network timeouts.""" + # This would test timeout scenarios + pass + + def test_rate_limiting(self): + """Test handling of Box API rate limits.""" + # This would test rate 
limit responses + pass + + def test_insufficient_permissions(self): + """Test handling of insufficient permissions.""" + # This would test permission denied scenarios + pass \ No newline at end of file diff --git a/experimental/tests/backends/test_config.py b/experimental/tests/backends/test_config.py new file mode 100644 index 000000000..df5c8dce2 --- /dev/null +++ b/experimental/tests/backends/test_config.py @@ -0,0 +1,140 @@ +"""Tests for backend configuration classes.""" + +import pytest +from ragas_experimental.backends.config import ( + LocalCSVConfig, + RagasAppConfig, +) + +# Import BoxCSVConfig if available +try: + from ragas_experimental.backends.config import BoxCSVConfig + HAS_BOX_CONFIG = True +except ImportError: + HAS_BOX_CONFIG = False + + +def test_local_csv_config(): + """Test LocalCSV configuration.""" + config = LocalCSVConfig(root_dir="/custom/path") + assert config.root_dir == "/custom/path" + + # Test defaults + default_config = LocalCSVConfig() + assert default_config.root_dir == "./ragas_data" + + +@pytest.mark.skipif(not HAS_BOX_CONFIG, reason="Box SDK not available") +def test_box_csv_config_with_client(): + """Test Box CSV configuration with authenticated client.""" + from unittest.mock import MagicMock + + # Mock an authenticated client + mock_client = MagicMock() + mock_user = MagicMock() + mock_user.name = "Test User" + mock_client.user().get.return_value = mock_user + + config = BoxCSVConfig(client=mock_client) + + assert config.client == mock_client + assert config.root_folder_id == "0" # default + assert config._authenticated_user == "Test User" + + +@pytest.mark.skipif(not HAS_BOX_CONFIG, reason="Box SDK not available") +def test_box_csv_config_with_custom_folder(): + """Test Box CSV configuration with custom root folder.""" + from unittest.mock import MagicMock + + mock_client = MagicMock() + mock_user = MagicMock() + mock_user.name = "Test User" + mock_client.user().get.return_value = mock_user + + config = BoxCSVConfig(client=mock_client, root_folder_id="123456") + + assert config.client == mock_client + assert config.root_folder_id == "123456" + + +@pytest.mark.skipif(not HAS_BOX_CONFIG, reason="Box SDK not available") +def test_box_csv_config_validation_missing_client(): + """Test Box CSV configuration validation for missing client.""" + from pydantic import ValidationError + + # Should raise error when client is not provided + with pytest.raises(ValidationError): + BoxCSVConfig() + + +@pytest.mark.skipif(not HAS_BOX_CONFIG, reason="Box SDK not available") +def test_box_csv_config_validation_invalid_client(): + """Test Box CSV configuration validation for invalid client.""" + from unittest.mock import MagicMock + + # Mock client that fails authentication + mock_client = MagicMock() + mock_client.user().get.side_effect = Exception("Authentication failed") + + with pytest.raises(ValueError, match="Box client authentication failed"): + BoxCSVConfig(client=mock_client) + + +@pytest.mark.skipif(not HAS_BOX_CONFIG, reason="Box SDK not available") +def test_box_csv_config_none_client(): + """Test Box CSV configuration with None client.""" + from pydantic import ValidationError + + with pytest.raises(ValidationError): + BoxCSVConfig(client=None) + + +@pytest.mark.skipif(not HAS_BOX_CONFIG, reason="Box SDK not available") +def test_box_csv_config_defaults(): + """Test Box CSV configuration defaults.""" + from unittest.mock import MagicMock + + mock_client = MagicMock() + mock_user = MagicMock() + mock_user.name = "Test User" + 
mock_client.user().get.return_value = mock_user + + config = BoxCSVConfig(client=mock_client) + + # Should default to root folder "0" + assert config.root_folder_id == "0" + + +def test_ragas_app_config(): + """Test Ragas App configuration.""" + config = RagasAppConfig(api_key="test_key") + assert config.api_key == "test_key" + assert config.api_url == "https://api.ragas.io" # default + assert config.timeout == 30 # default + assert config.max_retries == 3 # default + + +def test_ragas_app_config_custom_values(): + """Test Ragas App configuration with custom values.""" + config = RagasAppConfig( + api_url="https://custom.api.com", + api_key="custom_key", + timeout=60, + max_retries=5 + ) + + assert config.api_url == "https://custom.api.com" + assert config.api_key == "custom_key" + assert config.timeout == 60 + assert config.max_retries == 5 + + +def test_ragas_app_config_defaults(): + """Test Ragas App configuration defaults.""" + config = RagasAppConfig() + + assert config.api_url == "https://api.ragas.io" + assert config.api_key is None + assert config.timeout == 30 + assert config.max_retries == 3 \ No newline at end of file diff --git a/experimental/tests/e2e/test_integration.py b/experimental/tests/e2e/test_integration.py index 923590974..fc7cfd104 100644 --- a/experimental/tests/e2e/test_integration.py +++ b/experimental/tests/e2e/test_integration.py @@ -39,7 +39,7 @@ def temp_project(): project = Project.create( name="integration_test_project", description="Project for integration testing", - backend="local/csv", + backend_type="local/csv", root_dir=temp_dir ) yield project diff --git a/experimental/tests/unit/test_dataset.py b/experimental/tests/unit/test_dataset.py index c8d47d35b..d5fb957c8 100644 --- a/experimental/tests/unit/test_dataset.py +++ b/experimental/tests/unit/test_dataset.py @@ -29,7 +29,7 @@ def temp_dir(): @pytest.fixture def test_project(temp_dir): """Create a test project.""" - return Project.create(name="test_project", backend="local/csv", root_dir=temp_dir) + return Project.create(name="test_project", backend_type="local/csv", root_dir=temp_dir) @pytest.fixture diff --git a/experimental/tests/unit/test_project_core.py b/experimental/tests/unit/test_project_core.py index 823a2f91b..f779a6746 100644 --- a/experimental/tests/unit/test_project_core.py +++ b/experimental/tests/unit/test_project_core.py @@ -2,16 +2,17 @@ import tempfile import pytest -from ragas_experimental.project.core import Project +from ragas_experimental.project import Project def test_local_backend_creation(): """Test creating a project with local backend creates proper directory structure.""" + with tempfile.TemporaryDirectory() as temp_dir: local_project = Project.create( name="test_local_project", + backend_type="local/csv", description="A test project using local backend", - backend="local/csv", root_dir=temp_dir ) @@ -23,11 +24,12 @@ def test_local_backend_creation(): def test_local_backend_deletion(): """Test deleting a local backend project removes the directory.""" + with tempfile.TemporaryDirectory() as temp_dir: local_project = Project.create( name="test_local_project", + backend_type="local/csv", description="A test project using local backend", - backend="local/csv", root_dir=temp_dir ) @@ -44,21 +46,21 @@ def test_project_get_existing(): # Create a project local_project = Project.create( name="test_local_project", + backend_type="local/csv", description="A test project using local backend", - backend="local/csv", root_dir=temp_dir ) # Get the project retrieved_project = 
Project.get( name="test_local_project", - backend="local/csv", + backend_type="local/csv", root_dir=temp_dir ) assert retrieved_project.name == "test_local_project" # Check backend type by checking if it's a LocalCSVProjectBackend - from ragas_experimental.project.backends.local_csv import LocalCSVProjectBackend + from ragas_experimental.backends.local_csv import LocalCSVProjectBackend assert isinstance(retrieved_project._backend, LocalCSVProjectBackend) @@ -68,7 +70,7 @@ def test_project_get_nonexistent(): with pytest.raises(ValueError, match="Local project 'nonexistent' does not exist"): Project.get( name="nonexistent", - backend="local/csv", + backend_type="local/csv", root_dir=temp_dir ) @@ -78,8 +80,8 @@ def test_project_paths(): with tempfile.TemporaryDirectory() as temp_dir: local_project = Project.create( name="test_local_project", + backend_type="local/csv", description="A test project using local backend", - backend="local/csv", root_dir=temp_dir ) @@ -99,8 +101,8 @@ def test_project_repr(): with tempfile.TemporaryDirectory() as temp_dir: local_project = Project.create( name="test_local_project", + backend_type="local/csv", description="A test project using local backend", - backend="local/csv", root_dir=temp_dir )
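
The test updates above all track the same public API change: `Project.create()` and `Project.get()` now take `backend_type=` instead of `backend=`, and `LocalCSVProjectBackend` moved from `ragas_experimental.project.backends.local_csv` to `ragas_experimental.backends.local_csv`. A minimal end-to-end sketch of the renamed keyword, mirroring the updated tests (project name and description are illustrative):

```python
import tempfile

from ragas_experimental.project import Project
from ragas_experimental.backends.local_csv import LocalCSVProjectBackend

with tempfile.TemporaryDirectory() as temp_dir:
    # `backend_type=` replaces the old `backend=` keyword
    project = Project.create(
        name="demo_project",
        backend_type="local/csv",
        description="Demo of the renamed keyword",
        root_dir=temp_dir,
    )
    # The project should be wired to the relocated local CSV backend
    assert isinstance(project._backend, LocalCSVProjectBackend)
```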
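The factory tests (`test_from_jwt_file_factory`, `test_from_developer_token_factory`, `test_from_oauth2_factory`) pin down three ways to construct `BoxCSVProjectBackend`. A hedged sketch of the three flows follows; the credential values are placeholders, and since `BoxCSVConfig` verifies authentication eagerly via a user lookup (per `test_client_verification_*`), real credentials are needed outside of mocked tests:

```python
from ragas_experimental.backends.box_csv import BoxCSVProjectBackend

# JWT service-account flow (settings file from the Box developer console)
backend = BoxCSVProjectBackend.from_jwt_file("box_jwt_config.json")

# Short-lived developer-token flow, convenient for local experimentation
backend = BoxCSVProjectBackend.from_developer_token("your_dev_token")

# OAuth2 flow; refresh_token defaults to None per the test's assertion
backend = BoxCSVProjectBackend.from_oauth2(
    client_id="your_client_id",
    client_secret="your_client_secret",
    access_token="your_access_token",
)
```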
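`test_config.py` also documents the configuration defaults: `LocalCSVConfig.root_dir` is `"./ragas_data"`, `RagasAppConfig` defaults to `https://api.ragas.io` with `timeout=30` and `max_retries=3`, and `BoxCSVConfig` requires a pre-authenticated client (rejecting `None` with a pydantic `ValidationError`) and defaults `root_folder_id` to `"0"`. A sketch assembling all three; the boxsdk credentials are placeholders:

```python
from boxsdk import Client, OAuth2

from ragas_experimental.backends.config import (
    BoxCSVConfig,
    LocalCSVConfig,
    RagasAppConfig,
)

local_cfg = LocalCSVConfig()             # root_dir -> "./ragas_data"
app_cfg = RagasAppConfig(api_key="key")  # api_url -> "https://api.ragas.io"

# BoxCSVConfig validates the client on construction by looking up the
# current user, so this line performs a real Box API call with real creds.
client = Client(OAuth2(
    client_id="your_client_id",
    client_secret="your_client_secret",
    access_token="your_access_token",
))
box_cfg = BoxCSVConfig(client=client)    # root_folder_id -> "0" (Box root)
```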