Skip to content

Commit 90c60c7

Browse files
Add interactive view of catalog (#723)
* Add interactive view of catalog * Add itables to environment-upstream-dev.yml too * update docs to include interactive catalog demo - should also function as a test - doc failure <=> broken * Make itables work better: - Fix columns_with_iterables - Add widgets for multi filtering * Explode iterable columns one at a time * Set cascading panes to closed by default * Add MinimalExploder to 'jointly explode' compatible iterable columns - this looks for iterable columns which look to be over the same lists & explodes them together
1 parent be72fe5 commit 90c60c7

File tree

7 files changed

+134
-1
lines changed

7 files changed

+134
-1
lines changed

ci/environment-docs.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ dependencies:
1010
- fsspec >=2024.12
1111
- gcsfs >=2024.12
1212
- intake >=2.0
13+
- itables
1314
- jupyterlab
1415
- matplotlib
1516
- myst-nb

ci/environment-upstream-dev.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ dependencies:
1212
- gcsfs >=2024.12
1313
- h5netcdf >=0.8.1
1414
- ipython
15+
- itables
1516
- matplotlib
1617
- netcdf4 >=1.5.5,!=1.6.1
1718
- pandas >=2.1.0

ci/environment.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ dependencies:
1313
- h5netcdf >=0.8.1
1414
- intake >=2.0
1515
- ipython
16+
- itables
1617
- matplotlib
1718
- netcdf4 >=1.5.5,!=1.6.1
1819
- pandas >=2.1.0

docs/source/how-to/use-catalogs-with-assets-containing-multiple-variables.md

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,13 +63,21 @@ cat.esmcat.has_multiple_variable_assets
6363

6464
## Search for datasets
6565

66-
The search functionatilty works in the same way:
66+
The search functionality works in the same way:
6767

6868
```{code-cell} ipython3
6969
cat_subset =cat.search(variable=["O2", "SiO3"])
7070
cat_subset.df
7171
```
7272

73+
### Interactively search the catalog
74+
75+
We can also use the `interactive` attribute of a catalog to interactively search the catalog. This will not save any searches, but allows you to explore the catalog in a quick and intuitive way.
76+
77+
```{code-cell} ipython3
78+
cat.interactive
79+
```
80+
7381
## Load assets into xarray datasets
7482

7583
When loading the data files into xarray datasets, `intake-esm` will load only **data variables** that were requested. For example, if a data file contains ten data variables and the user requests for two variables, intake-esm will load the two requested variables plus necessary coordinates information.

intake_esm/core.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,17 @@
1717
_DATATREE_AVAILABLE = True
1818
except ImportError:
1919
_DATATREE_AVAILABLE = False
20+
import itables
2021
import pandas as pd
22+
import polars as pl
2123
import pydantic
2224
from fastprogress.fastprogress import progress_bar
2325
from intake.catalog import Catalog
2426

2527
from .cat import ESMCatalogModel
2628
from .derived import DerivedVariableRegistry, default_registry
2729
from .source import ESMDataSource
30+
from .utils import MinimalExploder
2831

2932

3033
class esm_datastore(Catalog):
@@ -125,6 +128,7 @@ def __init__(
125128
self.derivedcat = registry or default_registry
126129
self._entries = {}
127130
self._requested_variables = []
131+
self._columns_with_iterables = columns_with_iterables or []
128132
self.datasets = {}
129133
self._validate_derivedcat()
130134

@@ -212,6 +216,36 @@ def df(self) -> pd.DataFrame:
212216
"""
213217
return self.esmcat.df
214218

219+
@property
def interactive(self) -> None:
    """
    Show the catalog as an interactive, searchable table using itables.

    The polars frame held by the catalog is used when it can be reached;
    otherwise the pandas frame from ``self.df`` is converted to polars.

    List-valued columns are exploded before display. Without that step,
    JavaScript stringification of the iterables can render ellipses
    directly into the table, dropping real data and inserting junk.
    """
    try:
        frame = self.esmcat._frames.polars  # type:ignore[union-attr]
    except AttributeError:
        # No polars frame reachable: build one from the pandas view.
        frame = pl.from_pandas(self.df)

    table = MinimalExploder(frame)()

    # One search pane per column, addressed by positional index.
    pane_options = {
        'layout': 'columns-3',
        'cascadePanes': True,
        'columns': list(range(len(frame.columns))),
    }
    search_options = {'regex': True, 'caseInsensitive': True}
    layout_options = {'top1': 'searchPanes'}

    return itables.show(
        table,
        search=search_options,
        layout=layout_options,
        searchPanes=pane_options,
        maxBytes=0,
    )
248+
215249
def __len__(self) -> int:
216250
return len(self.keys())
217251

intake_esm/utils.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22

33
import importlib
44
import sys
5+
from collections import defaultdict
6+
7+
import polars as pl
58

69

710
def show_versions(file=sys.stdout): # pragma: no cover
@@ -119,3 +122,87 @@ def _update(self, kwargs):
119122
def __exit__(self, type, value, traceback):
120123
"""Context management."""
121124
self._update(self.old)
125+
126+
127+
class MinimalExploder:
    """
    Analyse a DataFrame's list columns and explode them in as few
    operations as possible.

    List columns whose per-row list lengths are identical are grouped and
    exploded together in a single ``explode`` call; each remaining group
    is exploded in its own call.
    """

    def __init__(self, df: pl.DataFrame):
        self.df = df
        # Lazily computed caches; ``None`` means "not computed yet".
        self._list_cols: list[str] | None = None
        self._length_patterns: dict[str, tuple[int, ...]] | None = None
        self._explodable_groups: list[list[str]] | None = None

    @property
    def list_columns(self) -> list[str]:
        """Names of every list-typed column in the DataFrame."""
        if self._list_cols is not None:
            return self._list_cols
        frame = self.df
        self._list_cols = [name for name in frame.columns if frame[name].dtype == pl.List]
        return self._list_cols

    @property
    def length_patterns(self) -> dict[str, tuple[int, ...]]:
        """Per-column tuple of list lengths, one entry per row.

        For example::

            'a' : (1,3,2),
            'b' : (2,2,2),
        """
        if self._length_patterns is not None:
            return self._length_patterns
        self._length_patterns = self._analyze_patterns()
        return self._length_patterns

    @property
    def explodable_groups(self) -> list[list[str]]:
        """Groups of column names that can be exploded in one call."""
        if self._explodable_groups is not None:
            return self._explodable_groups
        self._explodable_groups = self._compute_groups()
        return self._explodable_groups

    def _analyze_patterns(self) -> dict[str, tuple[int, ...]]:
        """Build the length pattern of every list column.

        The mapping is returned (not assigned to ``self._length_patterns``)
        so that the caller's cache attribute type-checks cleanly.
        """
        return {
            name: tuple(self.df.select(pl.col(name).list.len()).to_series().to_list())
            for name in self.list_columns
        }

    def _compute_groups(self):
        """Bucket list columns by identical length pattern.

        The groups are returned (not assigned to ``self._explodable_groups``)
        so that the caller's cache attribute type-checks cleanly.
        """
        by_pattern = {}
        for name, lengths in self.length_patterns.items():
            by_pattern.setdefault(lengths, []).append(name)
        return list(by_pattern.values())

    @property
    def summary(self) -> dict:
        """Summary statistics describing the planned explosion."""
        n_total = len(self.df.columns)
        n_list = len(self.list_columns)
        n_patterns = len(set(self.length_patterns.values()))
        groups = self.explodable_groups
        n_groups = len(groups)
        return {
            'total_columns': n_total,
            'list_columns': n_list,
            'unique_patterns': n_patterns,
            'explodable_groups': n_groups,
            'explosion_operations_needed': n_groups,
            'groups': groups,
        }

    def __call__(self) -> pl.DataFrame:
        """Run the minimal explosion and return the resulting DataFrame."""
        exploded = self.df
        if not self.list_columns:
            # Nothing to explode: hand back the original frame untouched.
            return exploded

        for columns in self.explodable_groups:
            exploded = exploded.explode(*columns)
        return exploded

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ dask[complete]>=2024.12
22
fastprogress>=1.0.0
33
fsspec>=2024.12
44
intake>=2.0.0
5+
itables
56
netCDF4>=1.5.5
67
pandas>=2.1.0
78
polars>=1.24.0

0 commit comments

Comments
 (0)