@@ -109,8 +109,8 @@ class ESMCatalogModel(pydantic.BaseModel):
109
109
description : pydantic .StrictStr | None = None
110
110
title : pydantic .StrictStr | None = None
111
111
last_updated : datetime .datetime | datetime .date | None = None
112
- _df : pd .DataFrame = pydantic .PrivateAttr ()
113
- _pl_df : pl .DataFrame = pydantic .PrivateAttr ()
112
+ _df : pd .DataFrame | None = pydantic .PrivateAttr ()
113
+ _pl_df : pl .DataFrame | None = pydantic .PrivateAttr ()
114
114
115
115
model_config = ConfigDict (arbitrary_types_allowed = True , validate_assignment = True )
116
116
@@ -256,29 +256,50 @@ def load(
256
256
return cat
257
257
258
258
def _df_from_file (
259
- self , cat , _mapper , storage_options , read_csv_kwargs
260
- ) -> tuple [pd .DataFrame , pl .DataFrame ]:
259
+ self ,
260
+ cat : 'ESMCatalogModel' ,
261
+ _mapper : fsspec .FSMap ,
262
+ storage_options : dict [str , typing .Any ],
263
+ read_csv_kwargs : dict [str , typing .Any ],
264
+ ) -> tuple [pd .DataFrame | None , pl .DataFrame | None ]:
261
265
"""
262
266
Reading the catalog from disk is a bit messy right now, as polars doesn't support reading
263
267
bz2 compressed files directly. So we need to screw around a bit to get what we want.
268
+
269
+ We return a tuple of (pd.DataFrame | None, pl.DataFrame | None ) so that
270
+ we can defer evaluation of the conversion to/from a polars dataframe until
271
+ we need to.
272
+
273
+ Parameters
274
+ ----------
275
+ cat: ESMCatalogModel
276
+ The catalog model
277
+ _mapper: fsspec mapper
278
+ A fsspec mapper object
279
+ storage_options: dict
280
+ fsspec parameters passed to the backend file-system such as Google Cloud Storage,
281
+ Amazon Web Service S3.
282
+ read_csv_kwargs: dict
283
+ Additional keyword arguments passed through to the :py:func:`~pandas.read_csv` function.
284
+
285
+ Returns
286
+ -------
287
+ pd.DataFrame | None
288
+ A pandas dataframe, or None if the catalog file was read using polars
289
+ pl.DataFrame
290
+ A polars dataframe, or None if the catalog file was read using pandas
264
291
"""
265
292
if _mapper .fs .exists (cat .catalog_file ):
266
293
csv_path = cat .catalog_file
267
294
else :
268
295
csv_path = f'{ os .path .dirname (_mapper .root )} /{ cat .catalog_file } '
269
296
cat .catalog_file = csv_path
270
297
converters = read_csv_kwargs .pop ('converters' , {}) # Hack
271
- if cat .catalog_file .endswith ('.csv.bz2' ):
272
- df = pd .read_csv (
273
- cat .catalog_file ,
274
- storage_options = storage_options ,
275
- ** read_csv_kwargs ,
276
- )
277
- return df , pl .from_pandas (df )
278
- else :
298
+ if not cat .catalog_file .endswith ('.csv.bz2' ): # type: ignore[union-attr]
299
+ # See https://github.com/pola-rs/polars/issues/13040 - can't use read_csv.
279
300
pl_df = (
280
- pl .scan_csv ( # See https://github.com/pola-rs/polars/issues/13040 - can't use read_csv.
281
- cat .catalog_file ,
301
+ pl .scan_csv (
302
+ cat .catalog_file , # type: ignore[arg-type]
282
303
storage_options = storage_options ,
283
304
** read_csv_kwargs ,
284
305
)
@@ -297,7 +318,14 @@ def _df_from_file(
297
318
)
298
319
.collect ()
299
320
)
300
- return pl_df .to_pandas (), pl_df
321
+ return None , pl_df
322
+ else :
323
+ df = pd .read_csv (
324
+ cat .catalog_file ,
325
+ storage_options = storage_options ,
326
+ ** read_csv_kwargs ,
327
+ )
328
+ return df , None
301
329
302
330
@property
303
331
def columns_with_iterables (self ) -> set [str ]:
@@ -310,7 +338,11 @@ def columns_with_iterables(self) -> set[str]:
310
338
311
339
@property
312
340
def df (self ) -> pd .DataFrame :
313
- """Return the dataframe."""
341
+ """Return the dataframe, performing pandas <=> polars conversion if necessary."""
342
+ if self ._df is None :
343
+ return self ._pl_df .to_pandas (use_pyarrow_extension_array = True )
344
+ elif self ._pl_df is None :
345
+ self ._pl_df = pl .from_pandas (self ._df )
314
346
return self ._df
315
347
316
348
@property
0 commit comments