Skip to content

Commit f3aaecb

Browse files
All tests passing
1 parent 207ed90 commit f3aaecb

File tree

3 files changed

+68
-34
lines changed

3 files changed

+68
-34
lines changed

intake_esm/cat.py

Lines changed: 48 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -245,41 +245,60 @@ def load(
245245
data['last_updated'] = None
246246
cat = cls.model_validate(data)
247247
if cat.catalog_file:
248-
if _mapper.fs.exists(cat.catalog_file):
249-
csv_path = cat.catalog_file
250-
else:
251-
csv_path = f'{os.path.dirname(_mapper.root)}/{cat.catalog_file}'
252-
cat.catalog_file = csv_path
253-
converters = read_csv_kwargs.pop('converters', {}) # Hack
254-
pl_df = (
255-
pl.scan_csv( # See https://github.com/pola-rs/polars/issues/13040 - can't use read_csv.
256-
cat.catalog_file,
257-
storage_options=storage_options,
258-
**read_csv_kwargs,
259-
)
260-
.with_columns(
261-
[
262-
pl.col(colname)
263-
.str.replace('^.', '[') # Replace first/last chars with [ or ].
264-
.str.replace('.$', ']') # set/tuple => list
265-
.str.replace_all(
266-
"'",
267-
'"',
268-
)
269-
.str.json_decode() # This is to do with the way polars reads json - single versus double quotes
270-
for colname in converters.keys()
271-
]
272-
)
273-
.collect()
248+
cat._df, cat._pl_df = cat._df_from_file(
249+
cat, _mapper, storage_options, read_csv_kwargs
274250
)
275251
else:
276-
pl_df = pl.DataFrame(cat.catalog_dict)
252+
cat._pl_df = pl.DataFrame(cat.catalog_dict)
253+
cat._df = cat._pl_df.to_pandas()
277254

278-
cat._df = pl_df.to_pandas()
279-
cat._pl_df = pl_df
280255
cat._cast_agg_columns_with_iterables()
281256
return cat
282257

258+
def _df_from_file(
259+
self, cat, _mapper, storage_options, read_csv_kwargs
260+
) -> tuple[pd.DataFrame, pl.DataFrame]:
261+
"""
262+
Reading the catalog from disk is a bit messy right now, as polars doesn't support reading
263+
bz2 compressed files directly. So we need to screw around a bit to get what we want.
264+
"""
265+
if _mapper.fs.exists(cat.catalog_file):
266+
csv_path = cat.catalog_file
267+
else:
268+
csv_path = f'{os.path.dirname(_mapper.root)}/{cat.catalog_file}'
269+
cat.catalog_file = csv_path
270+
converters = read_csv_kwargs.pop('converters', {}) # Hack
271+
if cat.catalog_file.endswith('.csv.bz2'):
272+
df = pd.read_csv(
273+
cat.catalog_file,
274+
storage_options=storage_options,
275+
**read_csv_kwargs,
276+
)
277+
return df, pl.from_pandas(df)
278+
else:
279+
pl_df = (
280+
pl.scan_csv( # See https://github.com/pola-rs/polars/issues/13040 - can't use read_csv.
281+
cat.catalog_file,
282+
storage_options=storage_options,
283+
**read_csv_kwargs,
284+
)
285+
.with_columns(
286+
[
287+
pl.col(colname)
288+
.str.replace('^.', '[') # Replace first/last chars with [ or ].
289+
.str.replace('.$', ']') # set/tuple => list
290+
.str.replace_all(
291+
"'",
292+
'"',
293+
)
294+
.str.json_decode() # This is to do with the way polars reads json - single versus double quotes
295+
for colname in converters.keys()
296+
]
297+
)
298+
.collect()
299+
)
300+
return pl_df.to_pandas(), pl_df
301+
283302
@property
284303
def columns_with_iterables(self) -> set[str]:
285304
"""Return a set of columns that have iterables."""

intake_esm/source.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ def _open_dataset(
8383
if preprocess is not None:
8484
ds = preprocess(ds)
8585

86-
if varname and isinstance(varname, str):
86+
if varname is not None and isinstance(varname, str):
8787
varname = [varname]
8888

8989
if requested_variables:
@@ -102,7 +102,7 @@ def _open_dataset(
102102
ds = ds.set_coords(scalar_variables)
103103
ds = ds[variables]
104104
ds.attrs[OPTIONS['vars_key']] = variables
105-
elif varname:
105+
elif varname is not None:
106106
ds.attrs[OPTIONS['vars_key']] = varname
107107

108108
ds = _expand_dims(expand_dims, ds)

tests/test_core.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import intake
55
import packaging.version
66
import pandas as pd
7+
import polars as pl
78
import pydantic
89
import pytest
910
import xarray as xr
@@ -262,10 +263,24 @@ def test_catalog_serialize(catalog_type, to_csv_kwargs, json_dump_kwargs, direct
262263
if directory is None:
263264
directory = os.getcwd()
264265
cat = intake.open_esm_datastore(f'{directory}/{name}.json')
266+
subset_df = cat_subset.esmcat._pl_df.with_columns(
267+
[
268+
pl.col(colname).cast(pl.Null)
269+
for colname in cat_subset.esmcat._pl_df.columns
270+
if cat_subset.esmcat._pl_df.get_column(colname).is_null().all()
271+
]
272+
)
273+
274+
df = cat.esmcat._pl_df.with_columns(
275+
[
276+
pl.col(colname).cast(pl.Null)
277+
for colname in cat.esmcat._pl_df.columns
278+
if cat.esmcat._pl_df.get_column(colname).is_null().all()
279+
]
280+
)
265281
assert_frame_equal_pl(
266-
cat_subset.esmcat._pl_df,
267-
cat.esmcat._pl_df,
268-
check_dtypes=False, # lNull types are chaging ty
282+
subset_df,
283+
df,
269284
)
270285
assert cat.esmcat.id == name
271286

0 commit comments

Comments
 (0)