@@ -245,41 +245,60 @@ def load(
245
245
data ['last_updated' ] = None
246
246
cat = cls .model_validate (data )
247
247
if cat .catalog_file :
248
- if _mapper .fs .exists (cat .catalog_file ):
249
- csv_path = cat .catalog_file
250
- else :
251
- csv_path = f'{ os .path .dirname (_mapper .root )} /{ cat .catalog_file } '
252
- cat .catalog_file = csv_path
253
- converters = read_csv_kwargs .pop ('converters' , {}) # Hack
254
- pl_df = (
255
- pl .scan_csv ( # See https://github.com/pola-rs/polars/issues/13040 - can't use read_csv.
256
- cat .catalog_file ,
257
- storage_options = storage_options ,
258
- ** read_csv_kwargs ,
259
- )
260
- .with_columns (
261
- [
262
- pl .col (colname )
263
- .str .replace ('^.' , '[' ) # Replace first/last chars with [ or ].
264
- .str .replace ('.$' , ']' ) # set/tuple => list
265
- .str .replace_all (
266
- "'" ,
267
- '"' ,
268
- )
269
- .str .json_decode () # This is to do with the way polars reads json - single versus double quotes
270
- for colname in converters .keys ()
271
- ]
272
- )
273
- .collect ()
248
+ cat ._df , cat ._pl_df = cat ._df_from_file (
249
+ cat , _mapper , storage_options , read_csv_kwargs
274
250
)
275
251
else :
276
- pl_df = pl .DataFrame (cat .catalog_dict )
252
+ cat ._pl_df = pl .DataFrame (cat .catalog_dict )
253
+ cat ._df = cat ._pl_df .to_pandas ()
277
254
278
- cat ._df = pl_df .to_pandas ()
279
- cat ._pl_df = pl_df
280
255
cat ._cast_agg_columns_with_iterables ()
281
256
return cat
282
257
258
def _df_from_file(
    self, cat, _mapper, storage_options, read_csv_kwargs
) -> tuple[pd.DataFrame, pl.DataFrame]:
    """
    Read the catalog CSV from disk and return it as both pandas and polars frames.

    Reading the catalog is a bit messy right now: polars doesn't support
    reading bz2-compressed files directly, so ``.csv.bz2`` catalogs go
    through ``pandas.read_csv`` and everything else through polars' lazy
    CSV scanner.

    Parameters
    ----------
    cat :
        Catalog instance whose ``catalog_file`` attribute points at the CSV;
        ``cat.catalog_file`` is rewritten here to the resolved path.
    _mapper :
        fsspec mapper used to resolve a relative ``catalog_file`` against
        the directory containing ``_mapper.root``.
    storage_options :
        Passed through unchanged to the underlying CSV reader.
    read_csv_kwargs :
        Extra keyword arguments for the CSV reader. A ``'converters'`` entry
        names the columns holding serialized iterables and is *not* forwarded
        to the reader. This dict is no longer mutated by this method.

    Returns
    -------
    tuple[pd.DataFrame, pl.DataFrame]
        The catalog as a pandas frame and as a polars frame.
    """
    # Resolve the catalog path: use it as-is if it exists, otherwise treat
    # it as relative to the directory holding the catalog JSON.
    if _mapper.fs.exists(cat.catalog_file):
        csv_path = cat.catalog_file
    else:
        csv_path = f'{os.path.dirname(_mapper.root)}/{cat.catalog_file}'
    cat.catalog_file = csv_path

    # 'converters' marks columns containing stringified iterables; neither
    # reader may receive it. Read it with .get and filter a copy of the
    # kwargs instead of pop() so the caller's dict is left untouched
    # (the previous code mutated the shared dict in place).  # Hack
    converters = read_csv_kwargs.get('converters', {})
    reader_kwargs = {k: v for k, v in read_csv_kwargs.items() if k != 'converters'}

    if cat.catalog_file.endswith('.csv.bz2'):
        # Polars can't read bz2 directly, so round-trip through pandas.
        # NOTE(review): converters are not applied on this path, so iterable
        # columns stay as raw strings — confirm downstream handles that.
        df = pd.read_csv(
            cat.catalog_file,
            storage_options=storage_options,
            **reader_kwargs,
        )
        return df, pl.from_pandas(df)

    pl_df = (
        pl.scan_csv(  # See https://github.com/pola-rs/polars/issues/13040 - can't use read_csv.
            cat.catalog_file,
            storage_options=storage_options,
            **reader_kwargs,
        )
        .with_columns(
            [
                pl.col(colname)
                .str.replace('^.', '[')  # Replace first/last chars with [ or ].
                .str.replace('.$', ']')  # set/tuple => list
                .str.replace_all(
                    "'",
                    '"',
                )
                .str.json_decode()  # This is to do with the way polars reads json - single versus double quotes
                for colname in converters.keys()
            ]
        )
        .collect()
    )
    return pl_df.to_pandas(), pl_df
301
+
283
302
@property
284
303
def columns_with_iterables (self ) -> set [str ]:
285
304
"""Return a set of columns that have iterables."""
0 commit comments