Skip to content

Commit 0773595

Browse files
authored
feat: refactored out Project (#2106)
Removes the concept of a project because, in a local context, the codebase you are working on is the project. How you use it: ```py from ragas_experimental import Dataset, experiment from pydantic import BaseModel import typing as t from pydantic import BaseModel class TestDataRow(BaseModel): id: t.Optional[int] query: str persona: t.List[t.Literal["opt1", "opt2", "opt3"]] # build the dataset dataset = Dataset( name="test_data_row", backend="local/csv", # here we providing it the "csv" name root_dir="./.logs", data_model=TestDataRow, ) # load your data for i in range(3): row = TestDataRow(id=i, query=f"query_{i}", persona=["opt1"]) dataset.append(row) # make sure to save it dataset.save() class ExperimentDataRow(TestDataRow): response: str metrics: t.List[MetricResult] @experiment(ExperimentDataRow) async def run_experiment(row: TestDataRow): response = "test" score1 = MetricResult(result=1, reason="score 1") score2 = MetricResult(result=0, reason="score 2") experiment_view = ExperimentDataRow( id=row.id, query=row.query, persona=["opt1"], response=response, metrics=[score1, score2], ) return experiment_view # run the experiment in async await run_experiment.run_async(dataset) ```
1 parent 71c6918 commit 0773595

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

49 files changed

+3050
-6114
lines changed

.gitignore

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -165,19 +165,12 @@ cython_debug/
165165
.idea/
166166

167167
# Ragas specific
168-
experiments/
168+
_experiments/
169169
**/fil-result/
170170
ragas/src/ragas/_version.py
171171
experimental/ragas_experimental/_version.py
172172
.vscode
173173
.envrc
174174
uv.lock
175175
.cache/
176-
177-
# nbdev
178-
_proc/
179-
site/
180-
_version.py
181-
test_resources
182176
.claude
183-
**/old_nbs/*.md

CLAUDE.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -191,5 +191,5 @@ analytics_logger.addHandler(console_handler)
191191

192192
## Memories
193193

194-
- whenever you create such docs put in in /experiments because that is gitignored and you can use it as a scratchpad or tmp directory for storing these
195-
- always use uv to run python and python related commandline tools like isort, ruff, pyright ect. This is because we are using uv to manage the .venv and dependencies.
194+
- whenever you create such docs, put them in /_experiments because that is gitignored and you can use it as a scratchpad or tmp directory for storing these
195+
- always use uv to run python and python-related command-line tools like isort, ruff, pyright, etc. This is because we are using uv to manage the .venv and dependencies.

experimental/pyproject.toml

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@ name = "ragas_experimental"
77
description = "Experimental extensions for Ragas"
88
requires-python = ">=3.9"
99
authors = [
10-
{name = "jjmachan", email = "jamesjithin97@gmail.com"}
10+
{name = "jjmachan", email = "jithin@explodinggradients.com"},
11+
{name = "ikka", email = "shahul@explodinggradients.com"}
1112
]
1213
license = {text = "Apache-2.0"}
1314
keywords = ["jupyter", "notebook", "python", "evaluation", "llm", "ragas"]
@@ -22,7 +23,7 @@ classifiers = [
2223
]
2324
dependencies = [
2425
"fastcore",
25-
"tqdm",
26+
"tqdm",
2627
"langfuse",
2728
"instructor",
2829
"pydantic",
@@ -40,8 +41,8 @@ readme = "README.md"
4041
all = ["pandas"]
4142

4243
[project.entry-points."ragas.backends"]
43-
local_csv = "ragas_experimental.project.backends.local_csv:LocalCSVProjectBackend"
44-
platform = "ragas_experimental.project.backends.platform:PlatformProjectBackend"
44+
"local/csv" = "ragas_experimental.backends.local_csv:LocalCSVBackend"
45+
"local/jsonl" = "ragas_experimental.backends.local_jsonl:LocalJSONLBackend"
4546

4647
[tool.setuptools.packages.find]
4748
include = ["ragas_experimental*"]
@@ -61,6 +62,11 @@ dev = [
6162
"pytest-mock>=3.10.0",
6263
"black",
6364
"ruff",
65+
"vcrpy",
66+
"pytest-vcr",
67+
]
68+
box = [
69+
"boxsdk[jwt]",
6470
]
6571
test = []
6672

experimental/ragas_experimental/__init__.py

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,7 @@
1111
except PackageNotFoundError:
1212
__version__ = "unknown"
1313

14-
from ragas_experimental.model.pydantic_model import (
15-
ExtendedPydanticBaseModel as BaseModel,
16-
)
14+
from ragas_experimental.dataset import Dataset
15+
from ragas_experimental.experiment import experiment, Experiment
1716

18-
from .project.core import Project
19-
20-
# Import the main Project class - decorators are added automatically in core.py
21-
22-
__all__ = ["Project", "BaseModel"]
17+
__all__ = ["Dataset", "experiment", "Experiment"]
Lines changed: 229 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,229 @@
1+
# Backend Architecture Guide
2+
3+
Simple plugin architecture for data storage backends. Implement one abstract class, register via entry points.
4+
5+
## Architecture
6+
7+
```
8+
Registry (dict-like) → Backend (implements BaseBackend) → Storage
9+
```
10+
11+
**Key Files:**
12+
- `base.py` - Abstract interface (6 methods)
13+
- `registry.py` - Plugin discovery & dict-like access
14+
- `local_csv.py`, `local_jsonl.py` - Reference implementations
15+
16+
## Quick Start
17+
18+
**1. Implement BaseBackend:**
19+
```python
20+
from ragas_experimental.backends.base import BaseBackend
21+
22+
class MyBackend(BaseBackend):
23+
def __init__(self, connection_string: str):
24+
self.conn = connection_string
25+
26+
def load_dataset(self, name: str) -> List[Dict[str, Any]]:
27+
# Load dataset from your storage
28+
return [{"id": 1, "text": "example"}]
29+
30+
def save_dataset(self, name: str, data: List[Dict], model: Optional[Type[BaseModel]]):
31+
# Save dataset to your storage
32+
pass
33+
34+
# ... implement other 4 methods (see base.py)
35+
```
36+
37+
**2. Register via entry points:**
38+
```toml
39+
# pyproject.toml
40+
[project.entry-points."ragas.backends"]
41+
"my_storage" = "my_package.backend:MyBackend"
42+
```
43+
44+
**3. Use:**
45+
```python
46+
from ragas_experimental.backends import get_registry
47+
registry = get_registry()
48+
backend = registry["my_storage"](connection_string="...")
49+
```
50+
51+
## Required Methods
52+
53+
**BaseBackend (6 methods):**
54+
```python
55+
# Data loading
56+
def load_dataset(name: str) -> List[Dict[str, Any]]
57+
def load_experiment(name: str) -> List[Dict[str, Any]]
58+
59+
# Data saving
60+
def save_dataset(name: str, data: List[Dict], model: Optional[Type[BaseModel]])
61+
def save_experiment(name: str, data: List[Dict], model: Optional[Type[BaseModel]])
62+
63+
# Listing
64+
def list_datasets() -> List[str]
65+
def list_experiments() -> List[str]
66+
```
67+
68+
## Registry Usage
69+
70+
**Dict-like interface:**
71+
```python
72+
from ragas_experimental.backends import get_registry
73+
74+
registry = get_registry()
75+
print(registry) # {'local/csv': <class 'LocalCSVBackend'>, ...}
76+
77+
# Access backend classes
78+
backend_class = registry["local/csv"]
79+
backend = backend_class(root_dir="./data")
80+
81+
# Check availability
82+
if "my_backend" in registry:
83+
backend = registry["my_backend"]()
84+
```
85+
86+
## Reference Implementations
87+
88+
**LocalCSVBackend** (`local_csv.py`):
89+
- **Pattern:** File-based storage with CSV format
90+
- **Init:** `LocalCSVBackend(root_dir="./data")`
91+
- **Storage:** `{root_dir}/datasets/{name}.csv`, `{root_dir}/experiments/{name}.csv`
92+
- **Features:** Directory auto-creation, UTF-8 encoding, proper CSV escaping
93+
94+
**LocalJSONLBackend** (`local_jsonl.py`):
95+
- **Pattern:** File-based storage with JSONL format
96+
- **Init:** `LocalJSONLBackend(root_dir="./data")`
97+
- **Storage:** `{root_dir}/datasets/{name}.jsonl`, `{root_dir}/experiments/{name}.jsonl`
98+
- **Features:** Handles complex nested data, preserves types
99+
100+
## Implementation Patterns
101+
102+
**Common backend structure:**
103+
```python
104+
class MyBackend(BaseBackend):
105+
def __init__(self, **config):
106+
# Initialize connection/client
107+
108+
def _get_storage_path(self, data_type: str, name: str):
109+
# Generate storage location
110+
111+
def _load(self, data_type: str, name: str):
112+
# Generic load implementation
113+
114+
def _save(self, data_type: str, name: str, data, model):
115+
# Generic save implementation
116+
117+
# Implement required methods using _load/_save
118+
def load_dataset(self, name): return self._load("datasets", name)
119+
def save_dataset(self, name, data, model): self._save("datasets", name, data, model)
120+
# ... etc
121+
```
122+
123+
**Error handling:**
124+
```python
125+
def load_dataset(self, name: str):
126+
try:
127+
return self._load("datasets", name)
128+
except FileNotFoundError:
129+
raise FileNotFoundError(f"Dataset '{name}' not found")
130+
except ConnectionError:
131+
raise RuntimeError(f"Storage connection failed")
132+
```
133+
134+
**Pydantic model handling:**
135+
```python
136+
def save_dataset(self, name: str, data: List[Dict], model: Optional[Type[BaseModel]]):
137+
if model:
138+
# Validate data against model if provided
139+
validated_data = [model(**item).model_dump() for item in data]
140+
self._save(name, validated_data)
141+
else:
142+
self._save(name, data)
143+
```
144+
145+
## Testing Your Backend
146+
147+
```python
148+
def test_backend():
149+
backend = MyBackend(config="test")
150+
151+
# Test save/load cycle
152+
test_data = [{"id": 1, "text": "test"}]
153+
backend.save_dataset("test_dataset", test_data, None)
154+
loaded = backend.load_dataset("test_dataset")
155+
assert loaded == test_data
156+
157+
# Test listing
158+
datasets = backend.list_datasets()
159+
assert "test_dataset" in datasets
160+
```
161+
162+
## Plugin Development
163+
164+
**Full plugin structure:**
165+
```
166+
my-backend-plugin/
167+
├── pyproject.toml # Entry point configuration
168+
├── src/my_backend/
169+
│ ├── __init__.py # Export backend class
170+
│ └── backend.py # Backend implementation
171+
└── tests/
172+
└── test_backend.py # Integration tests
173+
```
174+
175+
**Entry point registration:**
176+
```toml
177+
[project.entry-points."ragas.backends"]
178+
"s3" = "my_backend.backend:S3Backend"
179+
"postgres" = "my_backend.backend:PostgresBackend"
180+
```
181+
182+
**Install & use:**
183+
```bash
184+
pip install my-backend-plugin
185+
python -c "from ragas_experimental.backends import get_registry; print(get_registry())"
186+
```
187+
188+
## Registry Internals
189+
190+
**Discovery process:**
191+
1. Registry loads entry points from group `"ragas.backends"`
192+
2. Each entry point maps `name -> backend_class`
193+
3. Lazy loading - backends loaded on first access
194+
4. Dict-like interface for easy access
195+
196+
**Debugging:**
197+
```python
198+
from ragas_experimental.backends import get_registry
199+
registry = get_registry()
200+
201+
# Check what's available
202+
print(f"Available backends: {list(registry.keys())}")
203+
204+
# Get backend info
205+
for name in registry:
206+
backend_class = registry[name]
207+
print(f"{name}: {backend_class.__module__}.{backend_class.__name__}")
208+
```
209+
210+
## Design Decisions
211+
212+
**Why BaseBackend instead of separate Project/DataTable backends?**
213+
- Simpler: One interface to implement vs. two
214+
- Clearer: Backend owns both storage and operations
215+
- Flexible: Backends can optimize cross-operation concerns
216+
217+
**Why entry points vs. manual registration?**
218+
- Extensible: Third-party backends without code changes
219+
- Standard: Follows Python packaging conventions
220+
- Discoverable: Automatic registration on install
221+
222+
**Why dict-like registry?**
223+
- Intuitive: Familiar `registry["name"]` access pattern
224+
- Debuggable: Shows available backends in repr
225+
- Flexible: Supports `in`, `keys()`, iteration
226+
227+
---
228+
229+
**Quick Start:** Copy `local_csv.py`, replace CSV logic with your storage, add entry point, done.
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
"""Backend factory and exports for all backends."""
2+
3+
from .base import BaseBackend
4+
from .registry import (
5+
BackendRegistry,
6+
BACKEND_REGISTRY,
7+
get_registry,
8+
print_available_backends,
9+
register_backend,
10+
)
11+
12+
# concrete backends
13+
from .local_csv import LocalCSVBackend
14+
from .local_jsonl import LocalJSONLBackend
15+
16+
17+
__all__ = [
18+
"BaseBackend",
19+
"BackendRegistry",
20+
"LocalCSVBackend",
21+
"LocalJSONLBackend",
22+
"get_registry",
23+
"register_backend",
24+
"print_available_backends",
25+
"BACKEND_REGISTRY",
26+
]

0 commit comments

Comments
 (0)