Skip to content

Commit 010c777

Browse files
authored
GTC-3375 Add "copy_solo_tiles" option to optimize creation of int-dist alerts raster (#710)
* GTC-3375 Add "copy_solo_tiles" option to optimize creation of int-dist alerts. When we merge integrated alerts and dist alerts rasters, much of the globe has only dist alerts. So, we want to optimize the process by copying the dist alerts tiles directly to the final raster if there is no corresponding integrated alerts tile. We provide the copy_solo_tiles option to do this, which runs the "copy_solo_tiles.sh" script after the main raster is created with "union_bands = False". The script also correctly updates extent.geojson and tiles.geojson. * Validate that auxiliary_assets is not provided if source_uri is provided.
1 parent 6563a73 commit 010c777

File tree

6 files changed

+145
-14
lines changed

6 files changed

+145
-14
lines changed

app/crud/assets.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,9 @@ async def get_default_asset(dataset: str, version: str) -> ORMAsset:
187187

188188

189189
async def create_asset(dataset, version, **data) -> ORMAsset:
190+
'''Write the asset information into the database. put_asset runs the actual
191+
asset pipeline to create the asset data.'''
192+
190193
v: ORMVersion = await versions.get_version(dataset, version)
191194

192195
# default to version.is_downloadable if not set

app/models/pydantic/creation_options.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,14 @@ class RasterTileSetAssetCreationOptions(StrictBaseModel):
122122
"when input files are in different projections from each other."
123123
)
124124
)
125+
# Opt-in flag (default False). Each description fragment ends with a space so
# the implicitly concatenated literals read as one sentence.
copy_solo_tiles: bool = Field(
    False,
    description=(
        "For raster calculations with multiple inputs, copy last source tile "
        "directly to the destination if all other source tiles are non-existent, "
        "even though union_bands is false."
    )
)
125133
pixel_meaning: str = Field(
126134
..., description="Description of what the pixel value in the "
127135
"raster represents. This is used to clarify the meaning of the raster "
@@ -212,7 +220,7 @@ class RasterTileSetAssetCreationOptions(StrictBaseModel):
212220
auxiliary_assets: Optional[List[UUID]] = Field(
213221
None,
214222
description="Asset IDs of additional rasters you might want to include "
215-
"in your calc expression."
223+
"in your calc expression. Ignored if source_uri is set."
216224
)
217225
photometric: Optional[PhotometricType] = None
218226
num_processes: Optional[StrictInt] = None
@@ -240,13 +248,15 @@ class PixETLCreationOptions(RasterTileSetAssetCreationOptions):
240248
# List of input source URIs. The description is built from adjacent string
# literals, so each fragment must end with its own separator; otherwise the
# rendered text runs "/vsigs/" straight into "auxiliary_assets".
source_uri: Optional[List[str]] = Field(
    description="List of input sources. Sources must be the URI of either a "
    "tiles.geojson file on S3 or a folder (prefix) on S3 or GCS. "
    "Features in tiles.geojson must have path starting with either /vsis3/ or /vsigs/. "
    "auxiliary_assets is ignored if source_uri is set (for creating new versions)",
)
245254

246255
@validator("source_uri")
def validate_source_uri(cls, v, values, **kwargs):
    """Check source_uri against the already-validated source_type.

    Non-raster source types must not supply source_uri. Raster source types
    must supply it, and must not also supply auxiliary_assets.
    """
    is_raster_source = values.get("source_type") == SourceType.raster

    # Guard clause: anything other than a raster source may not carry URIs.
    if not is_raster_source:
        assert not v, "Only raster source type require source_uri"
        return v

    assert v, "Raster source types require source_uri"
    has_auxiliary_assets = values.get("auxiliary_assets")
    assert not has_auxiliary_assets, "auxiliary_assets should not be specified with source_uri"
    return v

app/tasks/assets.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ async def put_asset(
4646
input_data: Dict[str, Any],
4747
constructor: FrozenSet[Union[AssetType, SourceType]] = ASSET_PIPELINES,
4848
) -> None:
49-
"""Call Asset Pipeline.
49+
"""Call Asset Pipeline to actually create the data of the asset.
5050
5151
Default assets use source_type for identification. All other assets
5252
use asset_type directly.

app/tasks/raster_tile_set_assets/raster_tile_set_assets.py

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
from app.settings.globals import DATA_LAKE_BUCKET
2222
from app.tasks import Callback, callback_constructor
2323
from app.tasks.batch import execute
24-
from app.tasks.raster_tile_set_assets.utils import create_pixetl_job, create_unify_projection_job
24+
from app.tasks.raster_tile_set_assets.utils import create_pixetl_job, create_unify_projection_job, create_copy_solo_tiles_job
2525
from app.utils.aws import get_s3_client
2626
from app.utils.path import (
2727
get_asset_uri,
@@ -43,7 +43,7 @@ async def raster_tile_set_asset(
4343
# If being created as a source (default) asset, creation_options["source_uri"]
4444
# will be a list. When being created as an auxiliary asset, it will be None.
4545
# In the latter case we will generate one for pixETL based on the default asset,
46-
# below.
46+
# below. NOTE: auxiliary_assets is ignored if source_uri option is set.
4747

4848
co = deepcopy(input_data["creation_options"])
4949

@@ -92,16 +92,24 @@ async def raster_tile_set_asset(
9292
jobs.append(unify_job)
9393
creation_options.source_uri = new_src_uris
9494

95-
jobs.append(
96-
await create_pixetl_job(
97-
dataset,
98-
version,
99-
creation_options,
100-
"create_raster_tile_set",
101-
callback,
102-
[unify_job] if unify_job is not None else None,
103-
)
95+
pixetl_job = await create_pixetl_job(
96+
dataset,
97+
version,
98+
creation_options,
99+
"create_raster_tile_set",
100+
callback,
101+
[unify_job] if unify_job is not None else None,
104102
)
103+
jobs.append(pixetl_job)
104+
105+
if creation_options.copy_solo_tiles:
106+
# Copy the solo tiles from the last source URI after the main raster job
107+
# (which should have union_bands = False) has finished.
108+
copy_solo_job = await create_copy_solo_tiles_job(
109+
dataset, creation_options.source_uri[-1],
110+
default_asset.asset_uri, "copy_solo_tiles",
111+
callback, [pixetl_job])
112+
jobs.append(copy_solo_job)
105113

106114
log: ChangeLog = await execute(jobs)
107115

app/tasks/raster_tile_set_assets/utils.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,3 +260,29 @@ async def create_unify_projection_job(
260260
environment=JOB_ENV,
261261
callback=callback,
262262
)
263+
264+
265+
async def create_copy_solo_tiles_job(
    dataset: str,
    source_uri: str,
    target_uri: str,
    job_name: str,
    callback: Callback,
    parents: Optional[List[Job]] = None,
) -> GDAL2TilesJob:
    """Build the batch job that runs ``copy_solo_tiles.sh``.

    The script is invoked as
    ``copy_solo_tiles.sh --source <source_uri> --target <target_uri>``.

    Args:
        dataset: Dataset name recorded on the job.
        source_uri: URI passed to the script's ``--source`` option.
        target_uri: URI passed to the script's ``--target`` option.
        job_name: Name given to the job.
        callback: Callback handed to the job.
        parents: Jobs this job depends on; only their ``job_name`` values
            are forwarded. ``None`` or an empty list means no parents.

    Returns:
        The configured (not yet submitted) job object.
    """
    # The job takes parent *names*; an empty or missing list becomes None.
    parent_names = None
    if parents:
        parent_names = [parent_job.job_name for parent_job in parents]

    script_command = ["copy_solo_tiles.sh"]
    script_command += ["--source", source_uri]
    script_command += ["--target", target_uri]

    return GDAL2TilesJob(
        dataset=dataset,
        job_name=job_name,
        command=script_command,
        environment=JOB_ENV,
        callback=callback,
        parents=parent_names,
    )

batch/scripts/copy_solo_tiles.sh

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
#!/bin/bash

set -e

# requires arguments
# -s | --source
# -T | --target

# copy_solo_tiles.sh --source source_uri --target target_uri
# where target/source are URIs like
# "s3://gfw-data-lake/umd_glad_dist_alerts/v20251018/raster/epsg-4326/10/100000/resample10m/geotiff/{tile_id}.tif"

# Copy all tiles in the source raster that don't exist in the target raster, and
# replace the target's extent.geojson and tiles.geojson with the source's
# (tiles.geojson paths are rewritten to the target's version and pixel meaning).
# Requires that the source raster has every tile that is in the target raster,
# and fails if this is not true.

ME=$(basename "$0")
. get_arguments.sh "$@"

# Remove s3:// start of the uris.
spath="${SOURCE#s3://}"
tpath="${TARGET#s3://}"

# Separate into path components
IFS='/' read -r -a scomponents <<< "$spath"
IFS='/' read -r -a tcomponents <<< "$tpath"

# The uris should end in either geotiff/tiles.geojson or geotiff/{tile_id}.tif
# Use != for the folder name: inside [[ ]], -ne evaluates both operands as
# arithmetic expressions, so two non-numeric strings both become 0 and the
# check would never fire.
if [[ ${#scomponents[@]} -ne 10 || "${scomponents[8]}" != "geotiff" ]]; then
    echo "Error: bad format for source $SOURCE"
    exit 1
fi
if [[ ${#tcomponents[@]} -ne 10 || "${tcomponents[8]}" != "geotiff" ]]; then
    echo "Error: bad format for target $TARGET"
    exit 1
fi

# dataset/version and pixel_meaning for both source and target
sversion="${scomponents[1]}/${scomponents[2]}"
smeaning="${scomponents[7]}"
tversion="${tcomponents[1]}/${tcomponents[2]}"
tmeaning="${tcomponents[7]}"

# Same as SOURCE and TARGET, but with geotiff/{...} removed
source="s3://${scomponents[0]}/$sversion/${scomponents[3]}/${scomponents[4]}/${scomponents[5]}/${scomponents[6]}/${scomponents[7]}"
target="s3://${tcomponents[0]}/$tversion/${tcomponents[3]}/${tcomponents[4]}/${tcomponents[5]}/${tcomponents[6]}/${tcomponents[7]}"

# Get the set of tiles in each raster. The file name starts at column 32 of
# the "aws s3 ls" output. Sort in the C locale because comm requires its
# inputs to be sorted in the same collation it uses itself.
stiles=$(aws s3 ls "$source/geotiff/" | grep '\.tif$' | cut -c32- | LC_ALL=C sort)
ttiles=$(aws s3 ls "$target/geotiff/" | grep '\.tif$' | cut -c32- | LC_ALL=C sort)

# comm -13: lines only in the second input (source); -23: only in the first (target)
uniqSource=$(LC_ALL=C comm -13 <(echo "$ttiles") <(echo "$stiles"))
uniqTarget=$(LC_ALL=C comm -23 <(echo "$ttiles") <(echo "$stiles"))

if [ -n "$uniqTarget" ]; then
    echo "There are tiles in the target that are not in the source, so failing:"
    echo $uniqTarget
    exit 1
fi

if [ -z "$uniqSource" ]; then
    echo "There are no unique tiles in source to copy"
    exit 1
fi

# One array element per tile name
mapfile -t tiles <<< "$uniqSource"
len=${#tiles[@]}
echo Copying $len tiles

j=1
for f in "${tiles[@]}"; do
    echo "Copying $j/$len tile"
    # Plain assignment instead of ((j++)), whose exit status depends on the
    # value of the expression and can trip "set -e".
    j=$((j + 1))
    aws s3 cp "$source/geotiff/$f" "$target/geotiff/$f"
    aws s3 cp "$source/gdal-geotiff/$f" "$target/gdal-geotiff/$f"
done

echo Copying extent.geojson
aws s3 cp "$source/geotiff/extent.geojson" "$target/geotiff/extent.geojson"
aws s3 cp "$source/gdal-geotiff/extent.geojson" "$target/gdal-geotiff/extent.geojson"

echo Copying tiles.geojson
# From here on, fail if any stage of a pipeline fails, so that a failed
# download cannot silently upload an empty tiles.geojson to the target.
set -o pipefail
aws s3 cp "$source/geotiff/tiles.geojson" - | sed "s#/$sversion/#/$tversion/#g; s#/$smeaning/#/$tmeaning/#g" | aws s3 cp - "$target/geotiff/tiles.geojson"
aws s3 cp "$source/gdal-geotiff/tiles.geojson" - | sed "s#/$sversion/#/$tversion/#g; s#/$smeaning/#/$tmeaning/#g" | aws s3 cp - "$target/gdal-geotiff/tiles.geojson"

0 commit comments

Comments
 (0)