Skip to content

Commit d20f21c

Browse files
Defer conversion between polars & pandas
1 parent f3aaecb commit d20f21c

File tree

1 file changed

+48
-16
lines changed

1 file changed

+48
-16
lines changed

intake_esm/cat.py

Lines changed: 48 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -109,8 +109,8 @@ class ESMCatalogModel(pydantic.BaseModel):
109109
description: pydantic.StrictStr | None = None
110110
title: pydantic.StrictStr | None = None
111111
last_updated: datetime.datetime | datetime.date | None = None
112-
_df: pd.DataFrame = pydantic.PrivateAttr()
113-
_pl_df: pl.DataFrame = pydantic.PrivateAttr()
112+
_df: pd.DataFrame | None = pydantic.PrivateAttr()
113+
_pl_df: pl.DataFrame | None = pydantic.PrivateAttr()
114114

115115
model_config = ConfigDict(arbitrary_types_allowed=True, validate_assignment=True)
116116

@@ -256,29 +256,50 @@ def load(
256256
return cat
257257

258258
def _df_from_file(
259-
self, cat, _mapper, storage_options, read_csv_kwargs
260-
) -> tuple[pd.DataFrame, pl.DataFrame]:
259+
self,
260+
cat: 'ESMCatalogModel',
261+
_mapper: fsspec.FSMap,
262+
storage_options: dict[str, typing.Any],
263+
read_csv_kwargs: dict[str, typing.Any],
264+
) -> tuple[pd.DataFrame | None, pl.DataFrame | None]:
261265
"""
262266
Reading the catalog from disk is a bit messy right now, as polars doesn't support reading
263267
bz2 compressed files directly. So we need to screw around a bit to get what we want.
268+
269+
We return a tuple of (pd.DataFrame | None, pl.DataFrame | None) so that
270+
we can defer evaluation of the conversion to/from a polars dataframe until
271+
we need to.
272+
273+
Parameters
274+
----------
275+
cat: ESMCatalogModel
276+
The catalog model
277+
_mapper: fsspec mapper
278+
A fsspec mapper object
279+
storage_options: dict
280+
fsspec parameters passed to the backend file-system such as Google Cloud Storage,
281+
Amazon Web Service S3.
282+
read_csv_kwargs: dict
283+
Additional keyword arguments passed through to the :py:func:`~pandas.read_csv` function.
284+
285+
Returns
286+
-------
287+
pd.DataFrame | None
288+
A pandas dataframe, or None if the catalog file was read using polars
289+
pl.DataFrame | None
290+
A polars dataframe, or None if the catalog file was read using pandas
264291
"""
265292
if _mapper.fs.exists(cat.catalog_file):
266293
csv_path = cat.catalog_file
267294
else:
268295
csv_path = f'{os.path.dirname(_mapper.root)}/{cat.catalog_file}'
269296
cat.catalog_file = csv_path
270297
converters = read_csv_kwargs.pop('converters', {}) # Hack
271-
if cat.catalog_file.endswith('.csv.bz2'):
272-
df = pd.read_csv(
273-
cat.catalog_file,
274-
storage_options=storage_options,
275-
**read_csv_kwargs,
276-
)
277-
return df, pl.from_pandas(df)
278-
else:
298+
if not cat.catalog_file.endswith('.csv.bz2'): # type: ignore[union-attr]
299+
# See https://github.com/pola-rs/polars/issues/13040 - can't use read_csv.
279300
pl_df = (
280-
pl.scan_csv( # See https://github.com/pola-rs/polars/issues/13040 - can't use read_csv.
281-
cat.catalog_file,
301+
pl.scan_csv(
302+
cat.catalog_file, # type: ignore[arg-type]
282303
storage_options=storage_options,
283304
**read_csv_kwargs,
284305
)
@@ -297,7 +318,14 @@ def _df_from_file(
297318
)
298319
.collect()
299320
)
300-
return pl_df.to_pandas(), pl_df
321+
return None, pl_df
322+
else:
323+
df = pd.read_csv(
324+
cat.catalog_file,
325+
storage_options=storage_options,
326+
**read_csv_kwargs,
327+
)
328+
return df, None
301329

302330
@property
303331
def columns_with_iterables(self) -> set[str]:
@@ -310,7 +338,11 @@ def columns_with_iterables(self) -> set[str]:
310338

311339
@property
312340
def df(self) -> pd.DataFrame:
313-
"""Return the dataframe."""
341+
"""Return the dataframe, performing pandas <=> polars conversion if necessary."""
342+
if self._df is None:
343+
return self._pl_df.to_pandas(use_pyarrow_extension_array=True)
344+
elif self._pl_df is None:
345+
self._pl_df = pl.from_pandas(self._df)
314346
return self._df
315347

316348
@property

0 commit comments

Comments
 (0)