1
1
"""
2
2
Generate geoparquet from a sequence of STAC items.
3
3
"""
4
+
4
5
from __future__ import annotations
6
+ import collections
5
7
6
- from typing import Sequence , Any
8
+ from typing import Sequence , Any , Literal
9
+ import warnings
7
10
8
11
import pystac
9
12
import geopandas
10
13
import pandas as pd
14
+ import pyarrow as pa
11
15
import numpy as np
12
16
import shapely .geometry
13
17
16
20
from stac_geoparquet .utils import fix_empty_multipolygon
17
21
18
22
STAC_ITEM_TYPES = ["application/json" , "application/geo+json" ]
19
-
23
+ DTYPE_BACKEND = Literal [ "numpy_nullable" , "pyarrow" ]
20
24
SELF_LINK_COLUMN = "self_link"
21
25
22
26
@@ -31,7 +35,10 @@ def _fix_array(v):
31
35
32
36
33
37
def to_geodataframe (
34
- items : Sequence [dict [str , Any ]], add_self_link : bool = False
38
+ items : Sequence [dict [str , Any ]],
39
+ add_self_link : bool = False ,
40
+ dtype_backend : DTYPE_BACKEND | None = None ,
41
+ datetime_precision : str = "ns" ,
35
42
) -> geopandas .GeoDataFrame :
36
43
"""
37
44
Convert a sequence of STAC items to a :class:`geopandas.GeoDataFrame`.
@@ -42,19 +49,72 @@ def to_geodataframe(
42
49
Parameters
43
50
----------
44
51
items: A sequence of STAC items.
45
- add_self_link: Add the absolute link (if available) to the source STAC Item as a separate column named "self_link"
52
+ add_self_link: bool, default False
53
+ Add the absolute link (if available) to the source STAC Item
54
+ as a separate column named "self_link"
55
+ dtype_backend: {'pyarrow', 'numpy_nullable'}, optional
56
+ The dtype backend to use for storing arrays.
57
+
58
+ By default, this will use 'numpy_nullable' and emit a
59
+ FutureWarning that the default will change to 'pyarrow' in
60
+ the next release.
61
+
62
+ Set to 'numpy_nullable' to silence the warning and accept the
63
+ old behavior.
64
+
65
+ Set to 'pyarrow' to silence the warning and accept the new behavior.
66
+
67
+ There are some differences in the output as well: with
68
+ ``dtype_backend="pyarrow"``, struct-like fields will explicitly
69
+ contain null values for fields that appear in only some of the
70
+ records. For example, given an ``assets`` like::
71
+
72
+ {
73
+ "a": {
74
+ "href": "a.tif",
75
+ },
76
+ "b": {
77
+ "href": "b.tif",
78
+ "title": "B",
79
+ }
80
+ }
81
+
82
+ The ``assets`` field of the output for the first row with
83
+ ``dtype_backend="numpy_nullable"`` will be a Python dictionary with
84
+ just ``{"href": "a.tif"}``.
85
+
86
+ With ``dtype_backend="pyarrow"``, this will be a pyarrow struct
87
+ with fields ``{"href": "a.tif", "title": None}``. pyarrow will
88
+ infer that the struct field ``asset.title`` is nullable.
89
+
90
+ datetime_precision: str, default "ns"
91
+ The precision to use for the datetime columns. For example,
92
+ "us" is microsecond and "ns" is nanosecond.
46
93
47
94
Returns
48
95
-------
49
96
The converted GeoDataFrame.
50
97
"""
51
- items2 = []
98
+ items2 = collections .defaultdict (list )
99
+
52
100
for item in items :
53
- item2 = {k : v for k , v in item .items () if k != "properties" }
101
+ keys = set (item ) - {"properties" , "geometry" }
102
+
103
+ for k in keys :
104
+ items2 [k ].append (item [k ])
105
+
106
+ item_geometry = item ["geometry" ]
107
+ if item_geometry :
108
+ item_geometry = fix_empty_multipolygon (item_geometry )
109
+
110
+ items2 ["geometry" ].append (item_geometry )
111
+
54
112
for k , v in item ["properties" ].items ():
55
- if k in item2 :
56
- raise ValueError ("k" , k )
57
- item2 [k ] = v
113
+ if k in item :
114
+ msg = f"Key '{ k } ' appears in both 'properties' and the top level."
115
+ raise ValueError (msg )
116
+ items2 [k ].append (v )
117
+
58
118
if add_self_link :
59
119
self_href = None
60
120
for link in item ["links" ]:
@@ -65,23 +125,11 @@ def to_geodataframe(
65
125
):
66
126
self_href = link ["href" ]
67
127
break
68
- item2 [SELF_LINK_COLUMN ] = self_href
69
- items2 .append (item2 )
70
-
71
- # Filter out missing geoms in MultiPolygons
72
- # https://github.com/shapely/shapely/issues/1407
73
- # geometry = [shapely.geometry.shape(x["geometry"]) for x in items2]
74
-
75
- geometry = []
76
- for item2 in items2 :
77
- item_geometry = item2 ["geometry" ]
78
- if item_geometry :
79
- item_geometry = fix_empty_multipolygon (item_geometry ) # type: ignore
80
- geometry .append (item_geometry )
81
-
82
- gdf = geopandas .GeoDataFrame (items2 , geometry = geometry , crs = "WGS84" )
128
+ items2 [SELF_LINK_COLUMN ].append (self_href )
83
129
84
- for column in [
130
+ # TODO: Ideally we wouldn't have to hard-code this list.
131
+ # Could we get it from the JSON schema?
132
+ DATETIME_COLUMNS = {
85
133
"datetime" , # common metadata
86
134
"start_datetime" ,
87
135
"end_datetime" ,
@@ -90,9 +138,43 @@ def to_geodataframe(
90
138
"expires" , # timestamps extension
91
139
"published" ,
92
140
"unpublished" ,
93
- ]:
94
- if column in gdf .columns :
95
- gdf [column ] = pd .to_datetime (gdf [column ], format = "ISO8601" )
141
+ }
142
+
143
+ items2 ["geometry" ] = geopandas .array .from_shapely (items2 ["geometry" ])
144
+
145
+ if dtype_backend is None :
146
+ msg = (
147
+ "The default argument for 'dtype_backend' will change from "
148
+ "'numpy_nullable' to 'pyarrow'. To keep the previous default "
149
+ "specify ``dtype_backend='numpy_nullable'``. To accept the future "
150
+ "behavior specify ``dtype_backend='pyarrow'."
151
+ )
152
+ warnings .warn (FutureWarning (msg ))
153
+ dtype_backend = "numpy_nullable"
154
+
155
+ if dtype_backend == "pyarrow" :
156
+ for k , v in items2 .items ():
157
+ if k in DATETIME_COLUMNS :
158
+ dt = pd .to_datetime (v , format = "ISO8601" ).as_unit (datetime_precision )
159
+ items2 [k ] = pd .arrays .ArrowExtensionArray (pa .array (dt ))
160
+
161
+ elif k != "geometry" :
162
+ items2 [k ] = pd .arrays .ArrowExtensionArray (pa .array (v ))
163
+
164
+ elif dtype_backend == "numpy_nullable" :
165
+ for k , v in items2 .items ():
166
+ if k in DATETIME_COLUMNS :
167
+ items2 [k ] = pd .to_datetime (v , format = "ISO8601" ).as_unit (
168
+ datetime_precision
169
+ )
170
+
171
+ if k in {"type" , "stac_version" , "id" , "collection" , SELF_LINK_COLUMN }:
172
+ items2 [k ] = pd .array (v , dtype = "string" )
173
+ else :
174
+ msg = f"Invalid 'dtype_backend={ dtype_backend } '."
175
+ raise TypeError (msg )
176
+
177
+ gdf = geopandas .GeoDataFrame (items2 , geometry = "geometry" , crs = "WGS84" )
96
178
97
179
columns = [
98
180
"type" ,
@@ -111,10 +193,6 @@ def to_geodataframe(
111
193
columns .remove (col )
112
194
113
195
gdf = pd .concat ([gdf [columns ], gdf .drop (columns = columns )], axis = "columns" )
114
- for k in ["type" , "stac_version" , "id" , "collection" , SELF_LINK_COLUMN ]:
115
- if k in gdf :
116
- gdf [k ] = gdf [k ].astype ("string" )
117
-
118
196
return gdf
119
197
120
198
@@ -144,12 +222,16 @@ def to_dict(record: dict) -> dict:
144
222
145
223
if k == SELF_LINK_COLUMN :
146
224
continue
225
+ elif k == "assets" :
226
+ item [k ] = {k2 : v2 for k2 , v2 in v .items () if v2 is not None }
147
227
elif k in top_level_keys :
148
228
item [k ] = v
149
229
else :
150
230
properties [k ] = v
151
231
152
- item ["geometry" ] = shapely .geometry .mapping (item ["geometry" ])
232
+ if item ["geometry" ]:
233
+ item ["geometry" ] = shapely .geometry .mapping (item ["geometry" ])
234
+
153
235
item ["properties" ] = properties
154
236
155
237
return item
@@ -175,6 +257,11 @@ def to_item_collection(df: geopandas.GeoDataFrame) -> pystac.ItemCollection:
175
257
include = ["datetime64[ns, UTC]" , "datetime64[ns]" ]
176
258
).columns
177
259
for k in datelike :
260
+ # %f isn't implemented in pyarrow
261
+ # https://github.com/apache/arrow/issues/20146
262
+ if isinstance (df2 [k ].dtype , pd .ArrowDtype ):
263
+ df2 [k ] = df2 [k ].astype ("datetime64[ns, utc]" )
264
+
178
265
df2 [k ] = (
179
266
df2 [k ].dt .strftime ("%Y-%m-%dT%H:%M:%S.%fZ" ).fillna ("" ).replace ({"" : None })
180
267
)
0 commit comments