Commit 3b2464a

Update product search script
1 parent 96badf2 commit 3b2464a

File tree

1 file changed: +48 -29 lines changed

scripts/planetary_computer_sentinel2_product_search.py

Lines changed: 48 additions & 29 deletions
@@ -1,19 +1,26 @@
+import warnings
+
 import geopandas as gpd
+from pyogrio.errors import DataSourceError
 
 from geospatial_tools import DATA_DIR
 from geospatial_tools.planetary_computer.sentinel_2 import BestProductsForFeatures
 from geospatial_tools.vector import (
     create_vector_grid_parallel,
+    dask_spatial_join,
     select_polygons_by_location,
     to_geopackage,
 )
 
+warnings.filterwarnings("ignore", category=FutureWarning)
+
 ###
 # Base arguments
 ###
 
 # Base files
 USA_POLYGON_FILE = DATA_DIR / "usa_polygon_5070.gpkg"
+# S2_USA_GRID_FILE = DATA_DIR / "s2_grid_usa_polygon_5070_subset.gpkg"
 S2_USA_GRID_FILE = DATA_DIR / "s2_grid_usa_polygon_5070.gpkg"
 
 # Grid size for vector polygons
@@ -35,6 +42,10 @@
 # Max cloud cover to use as search parameter
 MAX_CLOUD_COVER = 15
 
+# Processing arguments
+NUM_OF_WORKERS_VECTOR_GRID = 16
+NUM_OF_WORKERS_SPATIAL_SELECT = 8
+
 ###
 # Base data
 ###
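Note on the two worker counts added in this hunk: the script simply fixes them at 16 and 8. If tying them to the host machine were preferred, a minimal sketch could derive them from the CPU count instead (an alternative, not what the script does):

import os

# Hypothetical alternative: size the worker pools from the machine
# rather than hard-coding 16 and 8.
NUM_OF_WORKERS_VECTOR_GRID = os.cpu_count() or 1
NUM_OF_WORKERS_SPATIAL_SELECT = max(1, (os.cpu_count() or 2) // 2)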
@@ -72,12 +83,20 @@
 # query the [Planetary Computer](https://planetarycomputer.microsoft.com/dataset/sentinel-2-l2a)
 # Sentinel 2 dataset and clip the selected Sentinel 2 images.
 
-
+grid_800m_filename = DATA_DIR / "polygon_grid_800m_2.gpkg"
 bbox = usa_polygon.total_bounds
-grid_800m_filename = DATA_DIR / "polygon_grid_800m.gpkg"
-print("Starting processing for [create_vector_grid_parallel]")
-grid_800m = create_vector_grid_parallel(bounding_box=bbox, grid_size=GRID_SIZE, crs="EPSG:5070")
-print(f"Printing len(grid_parallel) to check if grid contains same amount of polygons : {len(grid_800m)}")
+try:
+    print(f"Trying to load {grid_800m_filename}, if it exists")
+    grid_800m = gpd.read_file(grid_800m_filename)
+    print("Succeeded!")
+except DataSourceError:
+    print(f"Failed to load {grid_800m_filename}")
+    print("Starting processing for [create_vector_grid_parallel]")
+    grid_800m = create_vector_grid_parallel(
+        bounding_box=bbox, num_of_workers=NUM_OF_WORKERS_VECTOR_GRID, grid_size=GRID_SIZE, crs="EPSG:5070"
+    )
+    to_geopackage(grid_800m, grid_800m_filename)
+print(f"Printing len(grid_parallel) to check if grid contains same amount of polygons : {len(grid_800m)}")
 
 # Selecting the useful polygons
 #
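Note: the try/except introduced in this hunk is a load-or-generate cache: reuse the grid written by an earlier run if the GeoPackage exists, otherwise compute it and save it. pyogrio raises DataSourceError when the file cannot be opened, which is what the except branch relies on. A minimal sketch of the same pattern factored into a helper (the load_or_build name and its to_file call are assumptions for illustration, not code from this repository):

import geopandas as gpd
from pyogrio.errors import DataSourceError

def load_or_build(path, build):
    """Return the GeoDataFrame cached at `path`, rebuilding it on a miss."""
    try:
        return gpd.read_file(path)  # fast path: reuse an earlier run's output
    except DataSourceError:  # raised by pyogrio when `path` does not exist
        gdf = build()  # slow path: recompute the layer
        gdf.to_file(path, driver="GPKG")  # cache it for the next run
        return gdf

# Usage mirroring this hunk:
# grid_800m = load_or_build(
#     grid_800m_filename,
#     lambda: create_vector_grid_parallel(
#         bounding_box=bbox, num_of_workers=NUM_OF_WORKERS_VECTOR_GRID, grid_size=GRID_SIZE, crs="EPSG:5070"
#     ),
# )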
@@ -91,50 +110,50 @@
 # big or too complex, it would be better off going through QGIS, PyQGIS, GDAL or
 # some other more efficient way to do this operation.
 
-usa_polygon_grid_800m_filename = DATA_DIR / "usa_polygon_grid_800m.gpkg"
-print("Starting intersect selection")
-usa_polygon_grid_800m = select_polygons_by_location(grid_800m, usa_polygon)
-to_geopackage(usa_polygon_grid_800m, usa_polygon_grid_800m_filename)
-
-# ## Data processing pipeline prototype
-# ### Finding the best image for each S2 tiling grid
-
-# This is the full list of S2 grids
-s2_tile_grid_list = s2_grid["name"].to_list()
-
-###
-# Finding the best products for our subset use case
-###
-
-# `s2_feature_name_columns` is the name of the column in `s2_grid` where the id of
-# the different tiles is found.
-#
-# `vector_column_name` is the name of the column in which the best results will be stored
+usa_polygon_grid_800m_filename = DATA_DIR / "usa_polygon_grid_800m_dask_3.gpkg"
+try:
+    print(f"Trying to load {usa_polygon_grid_800m_filename}, if it exists")
+    usa_polygon_grid_800m = gpd.read_file(usa_polygon_grid_800m_filename)
+    print("Succeeded!")
+except DataSourceError:
+    print(f"Failed to load {usa_polygon_grid_800m_filename}")
+    print("Starting intersect selection")
+    usa_polygon_grid_800m = select_polygons_by_location(
+        select_features_from=grid_800m,
+        intersected_with=usa_polygon,
+        num_of_workers=NUM_OF_WORKERS_SPATIAL_SELECT,
+        join_function=dask_spatial_join,
+    )
+    to_geopackage(usa_polygon_grid_800m, usa_polygon_grid_800m_filename)
+
+# Finding the best image for each S2 tiling grid
 
 # Initiating our client
+s2_tile_grid_list = s2_grid["name"].to_list()
 best_products_client = BestProductsForFeatures(
     sentinel2_tiling_grid=s2_grid,
     sentinel2_tiling_grid_column=S2_FEATURE_NAME_COLUMN,
-    vector_features=usa_polygon_grid_800m_filename,
+    vector_features=usa_polygon_grid_800m,
     vector_features_column=VECTOR_COLUMN_NAME,
     max_cloud_cover=MAX_CLOUD_COVER,
 )
 
 # Executing the search
 #
-# This search look only for complete products, meaning products with less than
+# This search looks only for complete products, meaning products with less than
 # 5 percent of nodata.
 
 best_products_client.create_date_ranges(START_YEAR, END_YEAR, START_MONTH, END_MONTH)
 products = best_products_client.find_best_complete_products()
+best_products_client.to_file()
 
 # Selecting the best products for each vector tile
+#
 # This step is necessary as some of our vector polygons can be within multiple S2 tiles.
 # The best available S2 tile is therefore selected for each vector polygon.
 
-best_results_path = DATA_DIR / "vector_tiles_with_s2tiles_subset.gpkg"
+best_results_path = DATA_DIR / "vector_tiles_800m_with_s2tiles_dask_3.gpkg"
 best_results = best_products_client.select_best_products_per_feature()
-to_geopackage(best_results, DATA_DIR / "vector_tiles_with_s2tiles_subset.gpkg")
 
 # Writing the results to file
-best_products_client.to_file()
+to_geopackage(best_results, best_results_path)
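
Note: dask_spatial_join, passed as join_function in this hunk, comes from geospatial_tools.vector and its implementation is not part of this diff. As a rough illustration of what a dask-backed spatial join can look like, assuming the dask-geopandas package (the signature and partitioning choices below are guesses, not this repository's code):

import dask_geopandas

def dask_spatial_join_sketch(left, right, num_of_workers=8, predicate="intersects"):
    # Partition the larger frame so the join fans out across worker tasks.
    dleft = dask_geopandas.from_geopandas(left, npartitions=num_of_workers)
    dright = dask_geopandas.from_geopandas(right, npartitions=1)
    # sjoin builds a lazy task graph; compute() materializes a GeoDataFrame.
    return dask_geopandas.sjoin(dleft, dright, predicate=predicate).compute()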
