@@ -27,6 +27,7 @@ def parse_stac_items_to_arrow(
27
27
* ,
28
28
chunk_size : int = 8192 ,
29
29
schema : Optional [pa .Schema ] = None ,
30
+ downcast : bool = True ,
30
31
) -> pa .Table :
31
32
"""Parse a collection of STAC Items to a :class:`pyarrow.Table`.
32
33
@@ -41,6 +42,7 @@ def parse_stac_items_to_arrow(
41
42
schema: The schema of the input data. If provided, can improve memory use;
42
43
otherwise all items need to be parsed into a single array for schema
43
44
inference. Defaults to None.
45
+ downcast: if True, store bbox as float32 for memory and disk saving.
44
46
45
47
Returns:
46
48
a pyarrow Table with the STAC-GeoParquet representation of items.
@@ -53,22 +55,23 @@ def parse_stac_items_to_arrow(
53
55
for chunk in _chunks (items , chunk_size ):
54
56
batches .append (_stac_items_to_arrow (chunk , schema = schema ))
55
57
56
- stac_table = pa .Table .from_batches (batches , schema = schema )
58
+ table = pa .Table .from_batches (batches , schema = schema )
57
59
else :
58
60
# If schema is _not_ provided, then we must convert to Arrow all at once, or
59
61
# else it would be possible for a STAC item late in the collection (after the
60
62
# first chunk) to have a different schema and not match the schema inferred for
61
63
# the first chunk.
62
- stac_table = pa .Table .from_batches ([_stac_items_to_arrow (items )])
64
+ table = pa .Table .from_batches ([_stac_items_to_arrow (items )])
63
65
64
- return _process_arrow_table (stac_table )
66
+ return _process_arrow_table (table , downcast = downcast )
65
67
66
68
67
69
def parse_stac_ndjson_to_arrow (
68
70
path : Union [str , Path ],
69
71
* ,
70
72
chunk_size : int = 8192 ,
71
73
schema : Optional [pa .Schema ] = None ,
74
+ downcast : bool = True ,
72
75
) -> pa .Table :
73
76
# Define outside of if/else to make mypy happy
74
77
items : List [dict ] = []
@@ -98,14 +101,14 @@ def parse_stac_ndjson_to_arrow(
98
101
if len (items ) > 0 :
99
102
batches .append (_stac_items_to_arrow (items , schema = schema ))
100
103
101
- stac_table = pa .Table .from_batches (batches , schema = schema )
102
- return _process_arrow_table (stac_table )
104
+ table = pa .Table .from_batches (batches , schema = schema )
105
+ return _process_arrow_table (table , downcast = downcast )
103
106
104
107
105
def _process_arrow_table(table: pa.Table, *, downcast: bool = True) -> pa.Table:
    """Run the standard post-processing pipeline on a freshly parsed STAC table.

    Hoists `properties` to top-level columns, normalizes timestamp columns, and
    rewrites the `bbox` list column as a struct column.

    Args:
        table: Raw Arrow table produced from STAC items.
        downcast: Forwarded to the bbox conversion; if True, bbox values are
            stored as float32.

    Returns:
        The processed pyarrow Table.
    """
    hoisted = _bring_properties_to_top_level(table)
    with_timestamps = _convert_timestamp_columns(hoisted)
    return _convert_bbox_to_struct(with_timestamps, downcast=downcast)
110
113
111
114
@@ -192,11 +195,21 @@ def _convert_timestamp_columns(table: pa.Table) -> pa.Table:
192
195
except KeyError :
193
196
continue
194
197
198
+ field_index = table .schema .get_field_index (column_name )
199
+
195
200
if pa .types .is_timestamp (column .type ):
196
201
continue
202
+
203
+ # STAC allows datetimes to be null. If all rows are null, the column type may be
204
+ # inferred as null. We cast this to a timestamp column.
205
+ elif pa .types .is_null (column .type ):
206
+ table = table .set_column (
207
+ field_index , column_name , column .cast (pa .timestamp ("us" ))
208
+ )
209
+
197
210
elif pa .types .is_string (column .type ):
198
- table = table .drop ( column_name ). append_column (
199
- column_name , _convert_timestamp_column (column )
211
+ table = table .set_column (
212
+ field_index , column_name , _convert_timestamp_column (column )
200
213
)
201
214
else :
202
215
raise ValueError (
@@ -224,7 +237,26 @@ def _convert_timestamp_column(column: pa.ChunkedArray) -> pa.ChunkedArray:
224
237
return pa .chunked_array (chunks )
225
238
226
239
227
- def _convert_bbox_to_struct (table : pa .Table , * , downcast : bool = True ) -> pa .Table :
240
def is_bbox_3d(bbox_col: pa.ChunkedArray) -> bool:
    """Infer whether the bounding box column represents 2d or 3d bounding boxes.

    A flat bbox of length 4 is 2d (xmin, ymin, xmax, ymax); length 6 is 3d
    (with zmin/zmax). All rows must agree on the dimensionality.

    Args:
        bbox_col: A chunked (possibly fixed-size) list array of bbox values.

    Returns:
        True for 3d bounding boxes, False for 2d.

    Raises:
        ValueError: If the column is empty, mixes 2d and 3d boxes, or contains
            boxes of an unexpected length.
    """
    offsets_set = set()
    for chunk in bbox_col.chunks:
        if pa.types.is_fixed_size_list(chunk.type):
            # Fixed-size list arrays have no offsets buffer; the element count
            # is carried on the type itself.
            offsets_set.add(chunk.type.list_size)
        else:
            offsets = chunk.offsets.to_numpy()
            offsets_set.update(np.unique(offsets[1:] - offsets[:-1]))

    if len(offsets_set) > 1:
        raise ValueError("Mixed 2d-3d bounding boxes not yet supported")

    if not offsets_set:
        # Avoid an opaque IndexError below when the column has no rows.
        raise ValueError("Cannot infer bbox dimensionality from an empty column")

    offset = next(iter(offsets_set))
    if offset == 6:
        return True
    elif offset == 4:
        return False
    else:
        raise ValueError(f"Unexpected bbox offset: {offset=}")
257
+
258
+
259
def _convert_bbox_to_struct(table: pa.Table, *, downcast: bool) -> pa.Table:
    """Convert bbox column to a struct representation.

    Since the bbox in JSON is stored as an array, pyarrow automatically converts
    the bbox column to a list type. A struct column with named min/max fields is
    preferred for GeoParquet so that readers can use the bounds for predicate
    pushdown. Both 2d and 3d bounding boxes are supported (mixing is not).

    Args:
        table: STAC table whose ``bbox`` column is a (large/fixed-size) list.
        downcast: If True, store bbox as float32, rounding each min down and
            each max up to the adjacent float32 so the downcast box still
            contains the original float64 extent.

    Returns:
        A new table with the ``bbox`` column replaced by a struct column.
    """
    bbox_col_idx = table.schema.get_field_index("bbox")
    bbox_col = table.column(bbox_col_idx)
    bbox_3d = is_bbox_3d(bbox_col)

    # Flat bbox layout is all mins followed by all maxes.
    if bbox_3d:
        names = ["xmin", "ymin", "zmin", "xmax", "ymax", "zmax"]
    else:
        names = ["xmin", "ymin", "xmax", "ymax"]
    ndim = len(names) // 2

    new_chunks = []
    for chunk in bbox_col.chunks:
        assert (
            pa.types.is_list(chunk.type)
            or pa.types.is_large_list(chunk.type)
            or pa.types.is_fixed_size_list(chunk.type)
        )
        coords = chunk.flatten().to_numpy().reshape(-1, 2 * ndim)

        if downcast:
            coords = coords.astype(np.float32)
            # Round min values down and max values up to the next float32 value
            # so the downcast bounds never shrink the original box.
            # (np.inf, not np.Infinity: the latter was removed in NumPy 2.0.)
            mins = np.nextafter(coords[:, :ndim], -np.inf)
            maxs = np.nextafter(coords[:, ndim:], np.inf)
        else:
            mins = coords[:, :ndim]
            maxs = coords[:, ndim:]

        # One array per struct field, in the same min-then-max order as `names`.
        arrays = [mins[:, i] for i in range(ndim)] + [maxs[:, i] for i in range(ndim)]
        new_chunks.append(pa.StructArray.from_arrays(arrays, names=names))

    return table.set_column(bbox_col_idx, "bbox", new_chunks)
0 commit comments