From d2ab2fcaaebd80cd28f4d84e4af41d7c34add4c5 Mon Sep 17 00:00:00 2001 From: Ariana Barzinpour Date: Tue, 8 Apr 2025 05:10:59 +0000 Subject: [PATCH 01/37] update to 1.9 and use odc.geo; most tests passing --- README.md | 2 +- docker/requirements.txt | 7 +-- odc/stats/_cli_publish_tasks.py | 2 +- odc/stats/_gjson.py | 13 +++--- odc/stats/io.py | 2 +- odc/stats/model.py | 9 ++-- odc/stats/plugins/_base.py | 2 +- odc/stats/plugins/lc_level34.py | 2 +- odc/stats/plugins/lc_ml_treelite.py | 2 +- odc/stats/plugins/lc_tf_urban.py | 2 +- odc/stats/plugins/mangroves.py | 2 +- odc/stats/tasks.py | 11 +++-- odc/stats/utils.py | 12 ++--- setup.cfg | 8 ++-- tests/__init__.py | 3 +- tests/requirements.txt | 4 +- .../{test-env-py38.yml => test-env-py310.yml} | 38 +++++++++------ tests/test_lc_level34.py | 13 +++--- tests/test_mangroves.py | 8 ++-- tests/test_save_tasks.py | 2 +- tests/test_sqs.py | 2 +- tests/test_utils.py | 46 +++++++++---------- 22 files changed, 103 insertions(+), 89 deletions(-) rename tests/{test-env-py38.yml => test-env-py310.yml} (61%) diff --git a/README.md b/README.md index f5594c11..136ec03e 100644 --- a/README.md +++ b/README.md @@ -334,6 +334,6 @@ The dockerfile for the docker image used in the ochastration, which can be used ## Integration test -The integration test for the summary products is located under [tests](./tests). Currently the test is performed on all the official summary products published by DEA. The "golden files" to test against are stored on the public accessible [S3 bucket](s3://dea-public-data-dev/stats-golden-files). The "golder files" should be achived but not deleted in the case that we decide to amend or upgrade any product. It will help with tracking the changes that we intend and alerting those that we do not. +The integration test for the summary products is located under [tests](./tests). Currently the test is performed on all the official summary products published by DEA. The "golden files" to test against are stored on the public accessible [S3 bucket](s3://dea-public-data-dev/stats-golden-files). The "golden files" should be archived but not deleted in the case that we decide to amend or upgrade any product. It will help with tracking the changes that we intend and alerting those that we do not. The test is meant to be regressive, i.e., the new version of `odc-stats` need to pass the test on the last version of docker image. The new version of docker image needs to pass the test on the current released version of `odc-stats`. diff --git a/docker/requirements.txt b/docker/requirements.txt index dd581939..05b75d79 100644 --- a/docker/requirements.txt +++ b/docker/requirements.txt @@ -1,8 +1,9 @@ --extra-index-url https://packages.dea.ga.gov.au/ -datacube-ows<1.9 -datacube[performance,s3]<1.9 -eodatasets3<1.9 +# most of these don't need to pulled from here... 
+datacube-ows>=1.9 +datacube[performance,s3]>=1.9 +eodatasets3>=1.9 hdstats==0.1.8.post1 numexpr @ git+https://github.com/pydata/numexpr@a99412e odc-algo @ git+https://github.com/opendatacube/odc-algo@adb1856 diff --git a/odc/stats/_cli_publish_tasks.py b/odc/stats/_cli_publish_tasks.py index 71e20843..aed2ced7 100644 --- a/odc/stats/_cli_publish_tasks.py +++ b/odc/stats/_cli_publish_tasks.py @@ -5,7 +5,7 @@ import click import fsspec import toolz -from datacube.utils.geometry import Geometry +from odc.geo import Geometry from odc.aws.queue import get_queue, publish_messages from odc.dscache.tools.tiling import GRIDS from odc.stats.model import TileIdx_txy diff --git a/odc/stats/_gjson.py b/odc/stats/_gjson.py index 33246897..15db85f5 100644 --- a/odc/stats/_gjson.py +++ b/odc/stats/_gjson.py @@ -4,8 +4,9 @@ from typing import Tuple, Dict, Any from datetime import timedelta -from datacube.model import GridSpec -from datacube.utils.geometry import polygon_from_transform, Geometry +from odc.geo.gridspec import GridSpec +from odc.geo import Geometry, wh_ +from odc.geo.geom import polygon_from_transform from odc.dscache.tools import solar_offset from .model import TileIdx_xy, TileIdx_txy @@ -20,14 +21,14 @@ def gs_bounds(gs: GridSpec, tiles: Tuple[Tuple[int, int], Tuple[int, int]]) -> G X,Y ranges are inclusive on the left and exclusive on the right, same as numpy slicing. """ ((x0, x1), (y0, y1)) = tiles - if gs.resolution[0] < 0: + if gs.resolution.y < 0: gb = gs.tile_geobox((x0, y1 - 1)) else: gb = gs.tile_geobox((x0, y0)) - nx = (x1 - x0) * gb.shape[1] - ny = (y1 - y0) * gb.shape[0] - return polygon_from_transform(nx, ny, gb.affine, gb.crs) + nx = (x1 - x0) * gb.shape.x + ny = (y1 - y0) * gb.shape.y + return polygon_from_transform(wh_(nx, ny), gb.affine, gb.crs) def timedelta_to_hours(td: timedelta) -> float: diff --git a/odc/stats/io.py b/odc/stats/io.py index 9fae2215..071481b0 100644 --- a/odc/stats/io.py +++ b/odc/stats/io.py @@ -17,7 +17,7 @@ from odc.aws.s3_client import S3Client from dask.distributed import get_worker from datacube.utils.dask import save_blob_to_file -from datacube.utils.cog import to_cog +from odc.geo.cog import to_cog from datacube.model import Dataset from .model import Task, EXT_TIFF from .plugins import StatsPluginInterface diff --git a/odc/stats/model.py b/odc/stats/model.py index 0ad7b408..19fa7118 100644 --- a/odc/stats/model.py +++ b/odc/stats/model.py @@ -12,7 +12,7 @@ import xarray as xr from datacube.model import Dataset from datacube.utils.dates import normalise_dt -from datacube.utils.geometry import GeoBox +from odc.geo.geobox import GeoBox from ._text import split_and_check from pystac.extensions.projection import ProjectionExtension from toolz import dicttoolz @@ -318,13 +318,12 @@ def _lineage(self) -> Tuple[UUID, ...]: # TODO: replace this and test # if 'fused' in ds.metadata._doc['properties'].keys(): - if "fused" in ds.type.name: + if "fused" in ds.product.name: lineage = tuple( set( x for ds in self.datasets - for y in ds.metadata.sources.values() - for x in y.values() + for x in ds.metadata.sources ) ) else: @@ -544,7 +543,7 @@ def render_metadata( ) ProjectionExtension.add_to(item) proj_ext = ProjectionExtension.ext(item) - proj_ext.apply(geobox.crs.epsg, transform=geobox.transform, shape=geobox.shape) + proj_ext.apply(epsg=geobox.crs.epsg, transform=geobox.transform, shape=list(geobox.shape)) # Lineage last item.properties["odc:lineage"] = {"inputs": inputs} diff --git a/odc/stats/plugins/_base.py b/odc/stats/plugins/_base.py index 
f6df4ac6..7d3cec67 100644 --- a/odc/stats/plugins/_base.py +++ b/odc/stats/plugins/_base.py @@ -4,7 +4,7 @@ import xarray as xr import numpy as np from datacube.model import Dataset -from datacube.utils.geometry import GeoBox +from odc.geo.geobox import GeoBox from odc.algo import to_rgba from odc.algo.io import load_with_native_transform from odc.algo._masking import _nodata_fuser diff --git a/odc/stats/plugins/lc_level34.py b/odc/stats/plugins/lc_level34.py index 905f21a1..1ba4c9ce 100644 --- a/odc/stats/plugins/lc_level34.py +++ b/odc/stats/plugins/lc_level34.py @@ -129,7 +129,7 @@ def reduce(self, xx: xr.Dataset) -> xr.Dataset: # 215 -> 216 if urban_mask == 0 urban_mask = rasterize_vector_mask( self.urban_mask, - xx.geobox.transform, + xx.odc.geobox.transform, xx.artificial_surface.shape, filter_expression=self.filter_expression, threshold=self.mask_threshold, diff --git a/odc/stats/plugins/lc_ml_treelite.py b/odc/stats/plugins/lc_ml_treelite.py index 7f2b6a06..b898c81f 100644 --- a/odc/stats/plugins/lc_ml_treelite.py +++ b/odc/stats/plugins/lc_ml_treelite.py @@ -14,7 +14,7 @@ from dask.distributed import get_worker from datacube.model import Dataset -from datacube.utils.geometry import GeoBox +from odc.geo.geobox import GeoBox from odc.algo._memsink import yxbt_sink, yxt_sink from odc.algo.io import load_with_native_transform diff --git a/odc/stats/plugins/lc_tf_urban.py b/odc/stats/plugins/lc_tf_urban.py index d899ee40..6c63a959 100644 --- a/odc/stats/plugins/lc_tf_urban.py +++ b/odc/stats/plugins/lc_tf_urban.py @@ -12,7 +12,7 @@ from dask.distributed import get_worker from datacube.model import Dataset -from datacube.utils.geometry import GeoBox +from odc.geo.geobox import GeoBox from odc.algo._memsink import yxbt_sink from odc.algo.io import load_with_native_transform diff --git a/odc/stats/plugins/mangroves.py b/odc/stats/plugins/mangroves.py index c509c27d..e8ede56c 100644 --- a/odc/stats/plugins/mangroves.py +++ b/odc/stats/plugins/mangroves.py @@ -57,7 +57,7 @@ def reduce(self, xx: xr.Dataset) -> xr.Dataset: it is not a 'reduce' though """ extent_mask = rasterize_vector_mask( - self.mangroves_extent, xx.geobox.transform, xx.pv_pc_10.shape + self.mangroves_extent, xx.odc.geobox.transform, xx.pv_pc_10.shape ) good_data = extent_mask == 1 good_data &= xx.wet_pc_10 > self.tcw_threshold diff --git a/odc/stats/tasks.py b/odc/stats/tasks.py index 534ad54c..79fd68b2 100644 --- a/odc/stats/tasks.py +++ b/odc/stats/tasks.py @@ -18,8 +18,9 @@ from odc.dscache import DatasetCache from datacube import Datacube -from datacube.model import Dataset, GridSpec -from datacube.utils.geometry import Geometry +from datacube.model import Dataset +from odc.geo import Geometry +from odc.geo.gridspec import GridSpec from datacube.utils.documents import transform_object_tree from datacube.utils.dates import normalise_dt @@ -326,7 +327,7 @@ def _find_dss( query.update({"product": indexed_products, **dataset_filter}) dss = ordered_dss( dc, - freq="y", + freq="Y", key=lambda ds: ( (ds.center_time, ds.metadata.region_code) if hasattr(ds.metadata, "region_code") @@ -340,7 +341,7 @@ def _find_dss( query.update({"product": list(ignore_time), "time": ("1970", "2038")}) dss_extra = ordered_dss( dc, - freq="y", + freq="Y", key=lambda ds: ( (ds.center_time, ds.metadata.region_code) if hasattr(ds.metadata, "region_code") @@ -660,7 +661,7 @@ def _write_info(self, tasks, msg, cells, debug): msg("Dumping GeoJSON(s)") grid_info = compute_grid_info( - cells, resolution=max(self._gridspec.tile_size) / 4 + cells, 
resolution=max(self._gridspec.tile_size.xy) / 4 ) tasks_geo = gjson_from_tasks(tasks, grid_info) for temporal_range, gjson in tasks_geo.items(): diff --git a/odc/stats/utils.py b/odc/stats/utils.py index 104ccf70..010e738a 100644 --- a/odc/stats/utils.py +++ b/odc/stats/utils.py @@ -5,7 +5,7 @@ from dateutil.relativedelta import relativedelta from .model import DateTimeRange, odc_uuid from datacube.storage import measurement_paths -from datacube.model import Dataset, DatasetType +from datacube.model import Dataset, Product from datacube.index.eo3 import prep_eo3 @@ -299,7 +299,7 @@ def dedup_s2_datasets(dss): return out, skipped -def fuse_products(*ds_types) -> DatasetType: +def fuse_products(*ds_types) -> Product: """ Fuses two products. This function requires access to a Datacube to access the metadata type. @@ -352,12 +352,12 @@ def fuse_products(*ds_types) -> DatasetType: for d in def_s: fused_def["measurements"] += d["measurements"] - return DatasetType(ds_types[0].metadata_type, fused_def) + return Product(ds_types[0].metadata_type, fused_def) def fuse_ds( *dss, - product: Optional[DatasetType] = None, + product: Optional[Product] = None, ) -> Dataset: """ This function fuses two datasets. It requires that: @@ -375,7 +375,7 @@ def fuse_ds( doc_s = [ds.metadata_doc for ds in dss] if product is None: - product = fuse_products(*[ds.type for ds in dss]) + product = fuse_products(*[ds.product for ds in dss]) fused_doc = { "id": str(odc_uuid(product.name, "0.0.0", sources=[d["id"] for d in doc_s])), @@ -454,6 +454,6 @@ def fuse_ds( for key, path in {**measurement_paths(ds)}.items(): fused_doc["measurements"][key]["path"] = path - fused_ds = Dataset(product, prep_eo3(fused_doc), uris=[""]) + fused_ds = Dataset(product, prep_eo3(fused_doc), uri="") fused_doc["properties"]["fused"] = "True" return fused_ds diff --git a/setup.cfg b/setup.cfg index 7f7cd778..0b880771 100644 --- a/setup.cfg +++ b/setup.cfg @@ -16,13 +16,13 @@ url = https://github.com/opendatacube/odc-stats/ include_package_data = true zip_safe = false packages = find_namespace: -python_requires = >=3.8 +python_requires = >=3.10 tests_require = pytest install_requires = botocore click>=8.0.0 dask - datacube<1.9 + datacube>=1.9 distributed numpy odc-cloud[ASYNC]>=0.2.5 @@ -32,7 +32,7 @@ install_requires = odc_stac pandas pystac>=1.1.0 - eodatasets3>=0.22.0 + eodatasets3>=1.9 toolz tqdm xarray>=2023.1.0 @@ -56,7 +56,7 @@ include = # datacube_ows<1.8.21 has issue on function config internal over-writing [options.extras_require] ows = - datacube_ows>=1.8.21 + datacube_ows>=1.9 sentry-sdk blinker diff --git a/tests/__init__.py b/tests/__init__.py index ed793e49..85f6c52f 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -8,6 +8,7 @@ import xarray as xr import dask.array as da import numpy as np +from odc.geo.xr import xr_coords from odc.stats.utils import CompressedDataset from odc.stats.plugins import StatsPluginInterface from odc.stats.model import DateTimeRange @@ -119,7 +120,7 @@ def mk_dask_xx( elif data.dtype != dtype: data = data.astype(dtype) - coords = geobox.xr_coords(with_crs=True) + coords = xr_coords(geobox, crs_coord_name="spatial_ref") coords["time"] = mk_time_coords(timestamps) return xr.DataArray(data=data, dims=("time", "y", "x"), coords=coords, attrs=attrs) diff --git a/tests/requirements.txt b/tests/requirements.txt index cf18641b..039bd3d7 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -1,12 +1,12 @@ --extra-index-url https://packages.dea.ga.gov.au/ -datacube<1.9 +datacube>=1.9 # for 
pytest-depends deepdiff future_fstrings mock moto networkx -numpy<2.0 +numpy>=2.0 odc-algo @ git+https://github.com/opendatacube/odc-algo@adb1856 odc-cloud>=0.2.5 odc-stac @ git+https://github.com/opendatacube/odc-stac@69bdf64 diff --git a/tests/test-env-py38.yml b/tests/test-env-py310.yml similarity index 61% rename from tests/test-env-py38.yml rename to tests/test-env-py310.yml index e7c3c892..4a00ad47 100644 --- a/tests/test-env-py38.yml +++ b/tests/test-env-py310.yml @@ -1,26 +1,26 @@ # Conda environment for running tests in odc-tools -# conda env create -f test-env-py38.yml -# conda activate odc-tests-py38 +# conda env create -f test-env-py310.yml +# conda activate odc-tests-py310 -name: odc-tests-py38 +name: odc-tests-py310 channels: - conda-forge dependencies: - - python=3.8 + - python=3.10 # Datacube - - datacube>=1.8.7 + - datacube>=1.9.0 # odc.{aws,aio}: aiobotocore/boto3 # pin aiobotocore for easier resolution of dependencies - - aiobotocore==1.3.3 + - aiobotocore #==1.3.3 - boto3 # eodatasets3 (for odc-stats) - boltons - ciso8601 - python-rapidjson - - requests-cache==0.7.4 # 0.8.0 broke eodatasets3 + - requests-cache # 0.8.0 broke eodatasets3 - ruamel.yaml - structlog - url-normalize @@ -31,7 +31,7 @@ dependencies: - toolz - tqdm # pin xarray version since groupby bug on multi-indexed dataset in 2022.6.0 - - xarray==2022.3.0 + - xarray #==2023.12.0 - gdal - fsspec>=2022.1.0 - fiona @@ -44,6 +44,10 @@ dependencies: - moto - mock - deepdiff + # these may need to be in pip + - s3fs + # - tflite-runtime + - tl2cgen # for pytest-depends - future_fstrings @@ -56,21 +60,29 @@ dependencies: - sphinx-autodoc-typehints - nbsphinx - - pip=20 + - pip=24 - pip: - - odc-algo - - odc-dscache>=0.2.2 + # - odc-algo + # - odc-dscache>=0.2.2 - odc-cloud[ASYNC] + - odc-geo>=0.4.10 + - odc-stac + - odc-loader>=0.5.1 - thredds-crawler # odc.stats - - eodatasets3>=0.22.0 + - eodatasets3>=1.9 # odc.algo optional dependency - hdstats + - scipy<1.15.0 # cwt import removed in 1.15 # tests - pytest-depends + - tflite-runtime + + # init db + - odc-apps-dc-tools>=1.9 # for odc-stats - - datacube_ows>=1.8.21 + - datacube_ows>=1.9 diff --git a/tests/test_lc_level34.py b/tests/test_lc_level34.py index c19536b1..06f77143 100644 --- a/tests/test_lc_level34.py +++ b/tests/test_lc_level34.py @@ -7,7 +7,9 @@ import pandas as pd import xarray as xr import dask.array as da -from datacube.utils.geometry import GeoBox +from odc.geo import wh_ +from odc.geo.geobox import GeoBox +from odc.geo.xr import xr_coords from affine import Affine from unittest.mock import patch @@ -125,9 +127,9 @@ def image_groups(): (20 - 10) / l34.shape[2], (5 - 0) / l34.shape[1] ) geobox = GeoBox( - crs="epsg:3577", affine=affine, width=l34.shape[2], height=l34.shape[1] + crs="epsg:3577", affine=affine, shape=wh_(l34.shape[2], l34.shape[1]) ) - coords = geobox.xr_coords() + coords = xr_coords(geobox) data_vars = { "level_3_4": xr.DataArray( @@ -401,10 +403,9 @@ def test_level4(urban_shape): geobox = GeoBox( crs="epsg:3577", affine=affine, - width=level_3_4.shape[2], - height=level_3_4.shape[1], + shape=wh_(level_3_4.shape[2], level_3_4.shape[1]), ) - coords = geobox.xr_coords() + coords = xr_coords(geobox) data_vars = { "level_3_4": xr.DataArray( diff --git a/tests/test_mangroves.py b/tests/test_mangroves.py index 87a99e5b..b18b9cae 100644 --- a/tests/test_mangroves.py +++ b/tests/test_mangroves.py @@ -7,7 +7,9 @@ import json import fiona from fiona.crs import CRS -from datacube.utils.geometry import GeoBox +from odc.geo import wh_ +from 
odc.geo.geobox import GeoBox +from odc.geo.xr import xr_coords from affine import Affine import pytest @@ -117,9 +119,9 @@ def dataset(): (20 - 10) / band_1.shape[2], (5 - 0) / band_1.shape[1] ) geobox = GeoBox( - crs="epsg:3577", affine=affine, width=band_1.shape[2], height=band_1.shape[1] + crs="epsg:3577", affine=affine, shape=wh_(band_1.shape[2], band_1.shape[1]) ) - coords = geobox.xr_coords() + coords = xr_coords(geobox) coords.update({"time": index}) data_vars = { diff --git a/tests/test_save_tasks.py b/tests/test_save_tasks.py index 1e116565..66887805 100644 --- a/tests/test_save_tasks.py +++ b/tests/test_save_tasks.py @@ -148,7 +148,7 @@ def test_create_dss_by_stac(s3_path): for d in dss: with_uris = False for key in s3_path: - with_uris |= "/".join(key.split("/")[-2:]) in d.uris[0] + with_uris |= "/".join(key.split("/")[-2:]) in d.uri assert with_uris diff --git a/tests/test_sqs.py b/tests/test_sqs.py index 2113ad2f..524ba3db 100644 --- a/tests/test_sqs.py +++ b/tests/test_sqs.py @@ -3,7 +3,7 @@ import boto3 import moto -from datacube.utils.geometry import Geometry +from odc.geo import Geometry from odc.aws.queue import get_queue, publish_message from odc.stats._cli_publish_tasks import filter_tasks, get_geometry, publish_tasks from odc.stats._sqs import SQSWorkToken diff --git a/tests/test_utils.py b/tests/test_utils.py index 8b8034b2..a3d54a36 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -5,7 +5,7 @@ import pystac import pytest from datacube import Datacube -from datacube.model import Dataset, DatasetType, metadata_from_doc +from datacube.model import Dataset, Product, metadata_from_doc from datacube.index.eo3 import prep_eo3 from datacube.index.abstract import default_metadata_type_docs from odc.stats.model import DateTimeRange @@ -292,19 +292,19 @@ def test_fuse_products(wo_definition, fc_definition): } eo3 = standard_metadata_types["eo3"] - wo_product = DatasetType(eo3, wo_definition) - fc_product = DatasetType(eo3, fc_definition) + wo_product = Product(eo3, wo_definition) + fc_product = Product(eo3, fc_definition) fuse_products(wo_product, fc_product) bad_definition = deepcopy(wo_definition) bad_definition["metadata"]["properties"]["odc:file_format"] = "bad" - bad_product = DatasetType(eo3, bad_definition) + bad_product = Product(eo3, bad_definition) with pytest.raises(ValueError): fuse_products(bad_product, fc_product) bad_definition = deepcopy(wo_definition) bad_definition["measurements"].append(fc_definition["measurements"][1]) - bad_product = DatasetType(eo3, bad_definition) + bad_product = Product(eo3, bad_definition) with pytest.raises(ValueError): fuse_products(bad_product, fc_product) @@ -314,8 +314,8 @@ def test_fuse_products(wo_definition, fc_definition): fc_no_ff = deepcopy(fc_definition) del fc_no_ff["metadata"]["properties"]["odc:file_format"] - wo_product = DatasetType(eo3, wo_no_ff) - fc_product = DatasetType(eo3, fc_no_ff) + wo_product = Product(eo3, wo_no_ff) + fc_product = Product(eo3, fc_no_ff) fuse_products(wo_product, fc_product) @@ -329,8 +329,8 @@ def test_fuse_dss(wo_definition, fc_definition): } eo3 = standard_metadata_types["eo3"] - wo_product = DatasetType(eo3, wo_definition) - fc_product = DatasetType(eo3, fc_definition) + wo_product = Product(eo3, wo_definition) + fc_product = Product(eo3, fc_definition) fused_product = fuse_products(wo_product, fc_product) wo_metadata = { @@ -477,16 +477,12 @@ def test_fuse_dss(wo_definition, fc_definition): # paths get made absolute here # TODO: force paths to stay relative - wo_uris = [ - 
"s3://dea-public-data/derivative/ga_ls_wo_3/1-6-0/091/086/2020/04/04/\ - ga_ls_wo_3_091086_2020-04-04_final.stac-item.json" - ] - wo_ds = Dataset(wo_product, prep_eo3(wo_metadata), uris=wo_uris) - fc_uris = [ - "s3://dea-public-data/derivative/ga_ls_fc_3/2-5-0/091/086/2020/04/04/\ - ga_ls_fc_3_091086_2020-04-04_final.stac-item.json" - ] - fc_ds = Dataset(fc_product, prep_eo3(fc_metadata), uris=fc_uris) + wo_uri = "s3://dea-public-data/derivative/ga_ls_wo_3/1-6-0/091/086/2020/04/04/\ + ga_ls_wo_3_091086_2020-04-04_final.stac-item.json" + wo_ds = Dataset(wo_product, prep_eo3(wo_metadata), uri=wo_uri) + fc_uri = "s3://dea-public-data/derivative/ga_ls_fc_3/2-5-0/091/086/2020/04/04/\ + ga_ls_fc_3_091086_2020-04-04_final.stac-item.json" + fc_ds = Dataset(fc_product, prep_eo3(fc_metadata), uri=fc_uri) fused_ds = fuse_ds(wo_ds, fc_ds, product=fused_product) assert _get_msr_paths(fused_ds) == _get_msr_paths(fc_ds).union( @@ -499,25 +495,25 @@ def test_fuse_dss(wo_definition, fc_definition): bad_metadata = deepcopy(fc_metadata) bad_metadata["properties"]["datetime"] = "2020-04-03T23:33:10.644420Z" - bad_ds = Dataset(fc_product, prep_eo3(bad_metadata), uris=fc_uris) + bad_ds = Dataset(fc_product, prep_eo3(bad_metadata), uri=fc_uri) with pytest.raises(ValueError): fused_ds = fuse_ds(wo_ds, bad_ds, product=fused_product) bad_metadata = deepcopy(fc_metadata) bad_metadata["crs"] = "epsg:32656" - bad_ds = Dataset(fc_product, prep_eo3(bad_metadata), uris=fc_uris) + bad_ds = Dataset(fc_product, prep_eo3(bad_metadata), uri=fc_uri) with pytest.raises(ValueError): fused_ds = fuse_ds(wo_ds, bad_ds, product=fused_product) bad_metadata = deepcopy(fc_metadata) bad_metadata["grids"]["default"]["shape"] = [7212, 8311] - bad_ds = Dataset(fc_product, prep_eo3(bad_metadata), uris=fc_uris) + bad_ds = Dataset(fc_product, prep_eo3(bad_metadata), uri=fc_uri) with pytest.raises(ValueError): fused_ds = fuse_ds(wo_ds, bad_ds, product=fused_product) bad_metadata = deepcopy(fc_metadata) bad_metadata["label"] += "a" - bad_ds = Dataset(fc_product, prep_eo3(bad_metadata), uris=fc_uris) + bad_ds = Dataset(fc_product, prep_eo3(bad_metadata), uri=fc_uri) with pytest.raises(ValueError): fused_ds = fuse_ds(wo_ds, bad_ds, product=fused_product) @@ -526,6 +522,6 @@ def test_fuse_dss(wo_definition, fc_definition): fc_no_ff = deepcopy(fc_metadata) del wo_no_ff["properties"]["odc:file_format"] del fc_no_ff["properties"]["odc:file_format"] - wo_ds = Dataset(wo_product, prep_eo3(wo_no_ff), uris=wo_uris) - fc_ds = Dataset(fc_product, prep_eo3(fc_no_ff), uris=fc_uris) + wo_ds = Dataset(wo_product, prep_eo3(wo_no_ff), uri=wo_uri) + fc_ds = Dataset(fc_product, prep_eo3(fc_no_ff), uri=fc_uri) fuse_ds(wo_ds, fc_ds, product=fused_product) From 7209aeb46ef52e3ff2ccf1eb18f1e4d717472c32 Mon Sep 17 00:00:00 2001 From: Ariana Barzinpour Date: Fri, 11 Apr 2025 00:43:49 +0000 Subject: [PATCH 02/37] expect dataset sources as list --- odc/stats/model.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/odc/stats/model.py b/odc/stats/model.py index 19fa7118..2cf2621e 100644 --- a/odc/stats/model.py +++ b/odc/stats/model.py @@ -408,8 +408,7 @@ def render_assembler_metadata( platforms, instruments = ([], []) for dataset in self.datasets: - if "fused" in dataset.type.name: - sources = [e["id"] for e in dataset.metadata.sources.values()] + if "fused" in dataset.product.name: if dataset.metadata_doc["properties"].get("eo:platform") is not None: platforms.append(dataset.metadata_doc["properties"]["eo:platform"]) if 
dataset.metadata_doc["properties"].get("eo:instrument") is not None: @@ -424,7 +423,7 @@ def render_assembler_metadata( dataset.metadata_doc["properties"]["eo:instrument"] ] dataset_assembler.note_source_datasets( - self.product.classifier, *sources + self.product.classifier, *dataset.metadata.sources ) else: dataset.metadata_doc.setdefault("$schema", "") @@ -491,7 +490,7 @@ def render_assembler_metadata( path, expand_valid_data=False, grid=GridSpec( - shape=self.geobox.shape, + shape=self.geobox.shape.yx, transform=self.geobox.transform, crs=CRS.from_epsg(self.geobox.crs.to_epsg()), ), From 50904be409a863911c979f464fbc997bcf9fd776 Mon Sep 17 00:00:00 2001 From: Emma Ai Date: Fri, 11 Apr 2025 01:06:30 +0000 Subject: [PATCH 03/37] update test --- tests/requirements.txt | 10 ++++++---- tests/test_gm_ls.py | 5 +++++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/tests/requirements.txt b/tests/requirements.txt index 039bd3d7..714490a2 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -1,15 +1,17 @@ --extra-index-url https://packages.dea.ga.gov.au/ -datacube>=1.9 +datacube @ git+https://github.com/opendatacube/datacube-core@dc7f5e9 +datacube-ows>=1.9 # for pytest-depends deepdiff +eodatasets3>=1.9 future_fstrings mock moto networkx -numpy>=2.0 -odc-algo @ git+https://github.com/opendatacube/odc-algo@adb1856 +numpy<2.0 +odc-algo @ git+https://github.com/opendatacube/odc-algo@integrate_1.9 odc-cloud>=0.2.5 -odc-stac @ git+https://github.com/opendatacube/odc-stac@69bdf64 +odc-dscache @ git+https://github.com/opendatacube/odc-dscache@integrate_1.9 # For tests pytest diff --git a/tests/test_gm_ls.py b/tests/test_gm_ls.py index cc45504a..a4c4c210 100644 --- a/tests/test_gm_ls.py +++ b/tests/test_gm_ls.py @@ -7,6 +7,10 @@ from odc.stats.model import product_for_plugin from odc.stats.plugins.gm import StatsGMLS from odc.stats.tasks import TaskReader +from dask.distributed import Client, LocalCluster + +cluster = LocalCluster(n_workers=1, threads_per_worker=2) +client = Client(cluster) @pytest.fixture @@ -204,6 +208,7 @@ def test_no_data_value(monkeypatch): task.datasets = task.datasets[2:3] xx_0_0 = gm_ls.input_data(task.datasets, task.geobox) + print(xx_0_0.compute()) xx_0_0 = xx_0_0.sel( indexers={"x": slice(None, None, 100), "y": slice(None, None, 100)} ) From b5de1d6b083f3ea4c05a04f321468b21f8a62b81 Mon Sep 17 00:00:00 2001 From: Emma Ai Date: Wed, 16 Apr 2025 04:19:50 +0000 Subject: [PATCH 04/37] set XYSCALE=None explicitly to use odc-geo warp --- odc/stats/io.py | 17 +++++++++-------- odc/stats/proc.py | 1 + odc/stats/utils.py | 2 +- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/odc/stats/io.py b/odc/stats/io.py index 071481b0..575839e9 100644 --- a/odc/stats/io.py +++ b/odc/stats/io.py @@ -35,11 +35,11 @@ WriteResult = namedtuple("WriteResult", ["path", "sha1", "error"]) _log = logging.getLogger(__name__) -DEFAULT_COG_OPTS = dict( - compress="deflate", - zlevel=6, - blocksize=512, -) +DEFAULT_COG_OPTS = { + "compress": "deflate", + "zlevel": 6, + "blocksize": 512, +} def dump_json(meta: Dict[str, Any]) -> str: @@ -123,9 +123,9 @@ def __init__( """ if cog_opts is None: - cog_opts = dict(**DEFAULT_COG_OPTS) + cog_opts = {**DEFAULT_COG_OPTS} else: - tmp = dict(**DEFAULT_COG_OPTS) + tmp = {**DEFAULT_COG_OPTS} tmp.update(cog_opts) cog_opts = tmp @@ -381,7 +381,7 @@ def dump_with_eodatasets3( ) dataset_assembler.extend_user_metadata( - "input-products", sorted({e.type.name for e in task.datasets}) + "input-products", sorted({e.product.name for e in 
task.datasets}) ) dataset_assembler.extend_user_metadata("odc-stats-config", vars(task.product)) @@ -504,6 +504,7 @@ def dump_with_eodatasets3( with_deps=odc_meta_done, ) + # pylint: disable=too-many-positional-arguments def dump( self, task: Task, diff --git a/odc/stats/proc.py b/odc/stats/proc.py index deceb182..40ce4557 100644 --- a/odc/stats/proc.py +++ b/odc/stats/proc.py @@ -241,6 +241,7 @@ def _run(self, tasks: Iterable[Task], apply_eodatasets3) -> Iterator[TaskResult] task.geobox, transform_code=proc.transform_code, area_of_interest=proc.area_of_interest, + **{"XSCALE": None, "YSCALE": None}, ) ) diff --git a/odc/stats/utils.py b/odc/stats/utils.py index 010e738a..0ec31264 100644 --- a/odc/stats/utils.py +++ b/odc/stats/utils.py @@ -454,6 +454,6 @@ def fuse_ds( for key, path in {**measurement_paths(ds)}.items(): fused_doc["measurements"][key]["path"] = path - fused_ds = Dataset(product, prep_eo3(fused_doc), uri="") + fused_ds = Dataset(product, prep_eo3(fused_doc), uri="fake") fused_doc["properties"]["fused"] = "True" return fused_ds From cf1cf79a7f5d3ac4790160e78e0af643a3a3c693 Mon Sep 17 00:00:00 2001 From: Ariana Barzinpour Date: Thu, 17 Apr 2025 07:05:48 +0000 Subject: [PATCH 05/37] update reqs --- docker/requirements.txt | 2 +- tests/test-env-py310.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/requirements.txt b/docker/requirements.txt index 05b75d79..b3bef6da 100644 --- a/docker/requirements.txt +++ b/docker/requirements.txt @@ -9,7 +9,7 @@ numexpr @ git+https://github.com/pydata/numexpr@a99412e odc-algo @ git+https://github.com/opendatacube/odc-algo@adb1856 odc-apps-cloud>=0.2.2 # For testing -odc-apps-dc-tools>=0.2.12 +odc-apps-dc-tools>=1.9 odc-cloud>=0.2.5 odc-dscache>=0.2.3 odc-stac @ git+https://github.com/opendatacube/odc-stac@69bdf64 diff --git a/tests/test-env-py310.yml b/tests/test-env-py310.yml index 4a00ad47..b3906d05 100644 --- a/tests/test-env-py310.yml +++ b/tests/test-env-py310.yml @@ -9,7 +9,7 @@ dependencies: - python=3.10 # Datacube - - datacube>=1.9.0 + - datacube==1.9.0 # odc.{aws,aio}: aiobotocore/boto3 # pin aiobotocore for easier resolution of dependencies From 06ac7105d98e87bac6885af998213beee0cba716 Mon Sep 17 00:00:00 2001 From: Emma Ai Date: Thu, 17 Apr 2025 03:32:03 +0000 Subject: [PATCH 06/37] tests requirement with odc>1.9 --- tests/integration_test.sh | 8 ++++---- tests/requirements.txt | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/integration_test.sh b/tests/integration_test.sh index 40bbe32f..3ef15517 100755 --- a/tests/integration_test.sh +++ b/tests/integration_test.sh @@ -12,27 +12,27 @@ odc-stats --version echo "Test LS GeoMAD" odc-stats save-tasks --config https://raw.githubusercontent.com/GeoscienceAustralia/dea-config/709daaee176c04e33de4cc9600462717cca5b34d/dev/services/odc-stats/geomedian/ga_ls8c_nbart_gm_cyear_3.yaml --year=2015 --tiles 49:50,24:25 --overwrite ls-geomad-cyear.db -odc-stats run --threads=1 --config https://raw.githubusercontent.com/GeoscienceAustralia/dea-config/709daaee176c04e33de4cc9600462717cca5b34d/dev/services/odc-stats/geomedian/ga_ls8c_nbart_gm_cyear_3.yaml --location file:///tmp --overwrite ls-geomad-cyear.db +odc-stats run --threads=8 --config https://raw.githubusercontent.com/GeoscienceAustralia/dea-config/709daaee176c04e33de4cc9600462717cca5b34d/dev/services/odc-stats/geomedian/ga_ls8c_nbart_gm_cyear_3.yaml --location file:///tmp --overwrite ls-geomad-cyear.db ./tests/compare_data.sh /tmp/x49/y24/ 
ga_ls8c_nbart_gm_cyear_3_x49y24_2015--P1Y_final*.tif echo "Test LS WO summary" odc-stats save-tasks --config https://raw.githubusercontent.com/GeoscienceAustralia/dea-config/709daaee176c04e33de4cc9600462717cca5b34d/dev/services/odc-stats/wofs_summary/ga_ls_wo_fq_cyear_3.yaml --year=2015 --tiles 49:50,24:25 --overwrite ls-wofs-cyear.db -odc-stats run --threads=1 --config https://raw.githubusercontent.com/GeoscienceAustralia/dea-config/709daaee176c04e33de4cc9600462717cca5b34d/dev/services/odc-stats/wofs_summary/ga_ls_wo_fq_cyear_3.yaml --location file:///tmp --overwrite ls-wofs-cyear.db +odc-stats run --threads=8 --config https://raw.githubusercontent.com/GeoscienceAustralia/dea-config/709daaee176c04e33de4cc9600462717cca5b34d/dev/services/odc-stats/wofs_summary/ga_ls_wo_fq_cyear_3.yaml --location file:///tmp --overwrite ls-wofs-cyear.db ./tests/compare_data.sh /tmp/x49/y24/ ga_ls_wo_fq_cyear_3_x49y24_2015--P1Y_fina*.tif echo "Test LS FC percentile" odc-stats save-tasks --config https://raw.githubusercontent.com/GeoscienceAustralia/dea-config/709daaee176c04e33de4cc9600462717cca5b34d/dev/services/odc-stats/fc_percentile/ga_ls_fc_pc_cyear_3.yaml --year=2015 --tiles 49:50,24:25 --overwrite ls-fcp-cyear.db -odc-stats run --threads=1 --config https://raw.githubusercontent.com/GeoscienceAustralia/dea-config/709daaee176c04e33de4cc9600462717cca5b34d/dev/services/odc-stats/fc_percentile/ga_ls_fc_pc_cyear_3.yaml --location file:///tmp --overwrite ls-fcp-cyear.db +odc-stats run --threads=8 --config https://raw.githubusercontent.com/GeoscienceAustralia/dea-config/709daaee176c04e33de4cc9600462717cca5b34d/dev/services/odc-stats/fc_percentile/ga_ls_fc_pc_cyear_3.yaml --location file:///tmp --overwrite ls-fcp-cyear.db ./tests/compare_data.sh /tmp/x49/y24/ ga_ls_fc_pc_cyear_3_x49y24_2015--P1Y_final*.tif echo "Test LS TC percentile" odc-stats save-tasks --config https://raw.githubusercontent.com/GeoscienceAustralia/dea-config/709daaee176c04e33de4cc9600462717cca5b34d/dev/services/odc-stats/tc_percentile/ga_ls_tc_pc_cyear_3.yaml --year=2015 --tiles 49:50,24:25 --overwrite ls-tcp-cyear.db -odc-stats run --threads=1 --config https://raw.githubusercontent.com/GeoscienceAustralia/dea-config/709daaee176c04e33de4cc9600462717cca5b34d/dev/services/odc-stats/tc_percentile/ga_ls_tc_pc_cyear_3.yaml --location file:///tmp --overwrite ls-tcp-cyear.db +odc-stats run --threads=8 --config https://raw.githubusercontent.com/GeoscienceAustralia/dea-config/709daaee176c04e33de4cc9600462717cca5b34d/dev/services/odc-stats/tc_percentile/ga_ls_tc_pc_cyear_3.yaml --location file:///tmp --overwrite ls-tcp-cyear.db ./tests/compare_data.sh /tmp/x49/y24/ ga_ls_tc_pc_cyear_3_x49y24_2015--P1Y_final*.tif diff --git a/tests/requirements.txt b/tests/requirements.txt index 714490a2..2d44bc34 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -9,9 +9,9 @@ mock moto networkx numpy<2.0 -odc-algo @ git+https://github.com/opendatacube/odc-algo@integrate_1.9 +# odc-algo @ git+https://github.com/opendatacube/odc-algo@integrate_1.9 odc-cloud>=0.2.5 -odc-dscache @ git+https://github.com/opendatacube/odc-dscache@integrate_1.9 +# odc-dscache @ git+https://github.com/opendatacube/odc-dscache@integrate_1.9 # For tests pytest From 95d4c2decc3d48efb8222b7df372a248eea7a483 Mon Sep 17 00:00:00 2001 From: Emma Ai Date: Thu, 29 May 2025 08:23:28 +0000 Subject: [PATCH 07/37] change time to timezone aware --- odc/stats/proc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/odc/stats/proc.py b/odc/stats/proc.py index 
40ce4557..3b326856 100644 --- a/odc/stats/proc.py +++ b/odc/stats/proc.py @@ -9,7 +9,7 @@ Union, ) from dask.distributed import Client, WorkerPlugin -from datetime import datetime +from datetime import datetime, timezone import xarray as xr import math import psutil @@ -221,7 +221,7 @@ def _run(self, tasks: Iterable[Task], apply_eodatasets3) -> Iterator[TaskResult] if tk is not None: t0 = tk.start_time else: - t0 = datetime.utcnow() + t0 = datetime.now(timezone.utc) if not cfg.overwrite: path = sink.uri(task) _log.debug("Checking if can skip %s", path) From fec3e814c4f2da00078b48f68f8f7c424a884e9f Mon Sep 17 00:00:00 2001 From: Emma Ai Date: Fri, 30 May 2025 06:38:51 +0000 Subject: [PATCH 08/37] relocate loading from odc-algo --- odc/stats/_grouper.py | 102 ++++++++ odc/stats/io.py | 471 ++++++++++++++++++++++++++++++++++++- odc/stats/plugins/_base.py | 4 +- tests/test_grouper.py | 70 ++++++ tests/test_io.py | 37 +++ 5 files changed, 670 insertions(+), 14 deletions(-) create mode 100644 odc/stats/_grouper.py create mode 100644 tests/test_grouper.py create mode 100644 tests/test_io.py diff --git a/odc/stats/_grouper.py b/odc/stats/_grouper.py new file mode 100644 index 00000000..958b9c82 --- /dev/null +++ b/odc/stats/_grouper.py @@ -0,0 +1,102 @@ +# This file is part of the Open Data Cube, see https://opendatacube.org for more information +# +# Copyright (c) 2015-2025 ODC Contributors +# SPDX-License-Identifier: Apache-2.0 +"""Methods for grouping Datasets spatialy and otherwise.""" + +from __future__ import annotations + +from datetime import timedelta +from typing import TYPE_CHECKING, Any + +import numpy as np +import pandas as pd +import xarray as xr +from datacube.model import Dataset +from datacube.utils.dates import normalise_dt + +if TYPE_CHECKING: + from collections.abc import Hashable, Iterable, Iterator + + from odc.geo import Geometry + + +def mid_longitude(geom: Geometry) -> float: + """Return longitude of the middle point of the geomtry.""" + ((lon,), _) = geom.centroid.to_crs("epsg:4326").xy + return lon + + +def solar_offset(geom: Geometry, precision: str = "h") -> timedelta: + """Given a geometry compute offset to add to UTC timestamp to get solar day right. + + This only work when geometry is "local enough". + :param precision: one of ``'h'`` or ``'s'``, defaults to hour precision + """ + lon = mid_longitude(geom) + + if precision == "h": + return timedelta(hours=int(lon * 24 / 360 + 0.5)) + + # 240 == (24*60*60)/360 (seconds of a day per degree of longitude) + return timedelta(seconds=int(lon * 240)) + + +def key2num( + objs: Iterable[Hashable], reverse_map: dict[int, Any] | None = None +) -> Iterator[int]: + """Given a sequence of hashable objects return sequence of numeric ids starting from 0. + + For example ``'A' 'B' 'A' 'A' 'C' -> 0 1 0 0 2`` + """ + o2id: dict[Any, int] = {} + c = 0 + for obj in objs: + _c = o2id.setdefault(obj, c) + if _c == c: + c = c + 1 + if reverse_map is not None: + reverse_map[_c] = obj + yield _c + + +def group_by_nothing( + dss: list[Dataset], solar_day_offset: timedelta | None = None +) -> xr.DataArray: + """No op grouping of datasets. + + Construct "sources" just like ``.group_dataset`` but with every slice + containing just one Dataset object wrapped in a tuple. 
+ + Time -> (Dataset,) + """ + dss = sorted(dss, key=lambda ds: (normalise_dt(ds.center_time), ds.id)) # type: ignore + time = [normalise_dt(ds.center_time) for ds in dss] # type: ignore + solar_day = None + + if solar_day_offset is not None: + solar_day = np.asarray( + [(dt + solar_day_offset).date() for dt in time], dtype="datetime64[D]" + ) + + idx = np.arange(0, len(dss), dtype="uint32") + uuids = np.empty(len(dss), dtype="O") + data = np.empty(len(dss), dtype="O") + grid2crs: dict[int, Any] = {} + grid = list(key2num((ds.crs for ds in dss), grid2crs)) + + for i, ds in enumerate(dss): + data[i] = (ds,) + uuids[i] = ds.id + + coords = [np.asarray(time, dtype="datetime64[ms]"), idx, uuids, grid] + names = ["time", "idx", "uuid", "grid"] + if solar_day is not None: + coords.append(solar_day) + names.append("solar_day") + + coord = pd.MultiIndex.from_arrays(coords, names=names) + + return xr.DataArray( + data=data, coords={"spec": coord}, attrs={"grid2crs": grid2crs}, dims=("spec",) + ) diff --git a/odc/stats/io.py b/odc/stats/io.py index 575839e9..7e6aafa5 100644 --- a/odc/stats/io.py +++ b/odc/stats/io.py @@ -2,7 +2,13 @@ Various I/O adaptors """ -from typing import Any, Dict, List, Optional, Union, cast +from __future__ import annotations + +from typing import Any, Dict, List, Optional, Union, cast, TYPE_CHECKING +from hashlib import sha1 +from collections import namedtuple +from collections.abc import Callable, Iterable, Sequence + import json from urllib.parse import urlparse import logging @@ -14,16 +20,6 @@ from rasterio.crs import CRS from numpy import datetime64 -from odc.aws.s3_client import S3Client -from dask.distributed import get_worker -from datacube.utils.dask import save_blob_to_file -from odc.geo.cog import to_cog -from datacube.model import Dataset -from .model import Task, EXT_TIFF -from .plugins import StatsPluginInterface -from hashlib import sha1 -from collections import namedtuple - from eodatasets3.assemble import serialise from eodatasets3.scripts.tostac import json_fallback from eodatasets3.model import DatasetDoc @@ -31,6 +27,32 @@ import eodatasets3.stac as eo3stac from importlib.metadata import version +from datacube import Datacube +from datacube.testutils.io import native_geobox +from pyproj import aoi, transformer + +from odc.geo.geobox import GeoBox +from odc.geo.geobox import pad as gbox_pad +from odc.geo.xr import xr_reproject + +from ._grouper import group_by_nothing, solar_offset +from odc.algo._masking import ( + _max_fuser, + _nodata_fuser, + _or_fuser, + enum_to_bool, + mask_cleanup, +) + +from odc.aws.s3_client import S3Client +from dask.distributed import get_worker +from datacube.utils.dask import save_blob_to_file +from odc.geo.cog import to_cog + +if TYPE_CHECKING: + from .plugins import StatsPluginInterface + from .model import Task + from datacube.model import Dataset WriteResult = namedtuple("WriteResult", ["path", "sha1", "error"]) @@ -97,6 +119,7 @@ def __init__( cog_opts: Optional[Dict[str, Any]] = None, acl: Optional[str] = None, public: bool = False, + band_ext: str = "tif", ): """ :param creds: S3 write credentials @@ -150,7 +173,7 @@ def __init__( self._stac_meta_contentype = "application/json" self._odc_meta_contentype = "text/yaml" self._prod_info_meta_contentype = "text/yaml" - self._band_ext = EXT_TIFF + self._band_ext = band_ext self._acl = acl def uri(self, task: Task) -> str: @@ -517,3 +540,427 @@ def dump( return self.dump_with_eodatasets3(task, ds, aux, proc) else: return self.dump_with_pystac(task, ds, aux) + + +def 
compute_native_load_geobox( + dst_geobox: GeoBox, ds: Dataset, band: str, buffer: float | None = None +) -> GeoBox: + """Compute area of interest for a given Dataset given query. + + Take native projection and resolution from ``ds, band`` pair and compute + region in that projection that fully encloses footprint of the + ``dst_geobox`` with some padding. Construct GeoBox that encloses that + region fully with resolution/pixel alignment copied from supplied band. + + :param dst_geobox: + :param ds: Sample dataset (only resolution and projection is used, not footprint) + :param band: Reference band to use + (resolution of output GeoBox will match resolution of this band) + :param buffer: Buffer in units of CRS of ``ds`` (meters usually), + default is 10 pixels worth + """ + native: GeoBox = native_geobox(ds, basis=band) + if buffer is None: + buffer = 10 * cast( + float, max(map(abs, (native.resolution.y, native.resolution.x))) + ) # type: ignore + + assert native.crs is not None + return GeoBox.from_geopolygon( + dst_geobox.extent.to_crs(native.crs).buffer(buffer), + crs=native.crs, + resolution=native.resolution, + align=native.alignment, + ) + + +def choose_transform_path( + src_crs: str, + dst_crs: str, + transform_code: str | None = None, + area_of_interest: Sequence[float] | None = None, +) -> str: + # leave gdal to choose the best option if nothing is specified + if transform_code is None and area_of_interest is None: + return {} + + if area_of_interest is not None: + assert len(area_of_interest) == 4 + area_of_interest = aoi.AreaOfInterest(*area_of_interest) + + transformer_group = transformer.TransformerGroup( + src_crs, dst_crs, area_of_interest=area_of_interest + ) + if transform_code is None: + return {"COORDINATE_OPERATION": transformer_group.transformers[0].to_proj4()} + for t in transformer_group.transformers: + for step in json.loads(t.to_json()).get("steps", []): + if step.get("type", "") == "Transformation": + authority_code = step.get("id", {}) + if transform_code.split(":")[0].upper() in authority_code.get( + "authority", "" + ) and transform_code.split(":")[1] == str( + authority_code.get("code", "") + ): + return {"COORDINATE_OPERATION": t.to_proj4()} + # raise error if nothing is available + raise ValueError(f"Not able to find transform path by {transform_code}") + + +def _split_by_grid(xx: xr.DataArray) -> list[xr.DataArray]: + def extract(grid_id, ii): + yy = xx[ii] + crs = xx.grid2crs[grid_id] + yy.attrs.update(crs=crs) + yy.attrs.pop("grid2crs", None) + return yy + + return [extract(grid_id, ii) for grid_id, ii in xx.groupby(xx.grid).groups.items()] + + +def _native_load_1( + sources: xr.DataArray, + bands: tuple[str, ...], + geobox: GeoBox, + *, + optional_bands: tuple[str, ...] | None = None, + basis: str | None = None, + load_chunks: dict[str, int] | None = None, + pad: int | None = None, +) -> xr.Dataset: + if basis is None: + basis = bands[0] + (ds,) = sources.data[0] + load_geobox = compute_native_load_geobox(geobox, ds, basis) + if pad is not None: + load_geobox = gbox_pad(load_geobox, pad) + + mm = ds.product.lookup_measurements(bands) + if optional_bands is not None: + for ob in optional_bands: + try: + om = ds.product.lookup_measurements(ob) + except KeyError: + continue + else: + mm.update(om) + + xx = Datacube.load_data(sources, load_geobox, mm, dask_chunks=load_chunks) + return xx + + +def native_load( + dss: Sequence[Dataset], + bands: Sequence[str], + geobox: GeoBox, + *, + optional_bands: tuple[str, ...] 
| None = None, + basis: str | None = None, + load_chunks: dict[str, int] | None = None, + pad: int | None = None, +): + sources = group_by_nothing(list(dss), solar_offset(geobox.extent)) + for srcs in _split_by_grid(sources): + _xx = _native_load_1( + srcs, + tuple(bands), + geobox, + optional_bands=optional_bands, + basis=basis, + load_chunks=load_chunks, + pad=pad, + ) + yield _xx + + +def _apply_native_transform_1( + xx: xr.Dataset, + native_transform: Callable[[xr.Dataset], xr.Dataset], + groupby: str | None = None, + fuser: Callable[[xr.Dataset], xr.Dataset] | None = None, +) -> xr.Dataset: + xx = native_transform(xx) + + if groupby is not None: + if fuser is None: + fuser = _nodata_fuser # type: ignore + xx = xx.groupby(groupby).map(fuser) + + return xx + + +# pylint:disable=too-many-arguments,too-many-locals,too-many-branches +def load_with_native_transform( + dss: Sequence[Dataset], + bands: Sequence[str], + geobox: GeoBox, + native_transform: Callable[[xr.Dataset], xr.Dataset], + *, + optional_bands: tuple[str, ...] | None = None, + basis: str | None = None, + groupby: str | None = None, + fuser: Callable[[xr.Dataset], xr.Dataset] | None = None, + resampling: str = "nearest", + chunks: dict[str, int] | None = None, + load_chunks: dict[str, int] | None = None, + pad: int | None = None, + **kw, +) -> xr.Dataset: + """Load a bunch of datasets with native pixel transform. + + :param dss: A list of datasets to load + :param bands: Which measurements to load + :param geobox: GeoBox of the final output + :param native_transform: ``xr.Dataset -> xr.Dataset`` transform, + should support Dask inputs/outputs + :param basis: Name of the band to use as a reference for what is "native projection" + :param groupby: One of 'solar_day'|'time'|'idx'|None + :param fuser: Optional ``xr.Dataset -> xr.Dataset`` transform + :param resampling: Any resampling mode supported by GDAL as a string: + nearest, bilinear, average, mode, cubic, etc... + :param chunks: If set use Dask, must be in dictionary form + ``{'x': 4000, 'y': 4000}`` + + :param load_chunks: Defaults to ``chunks`` but can be different if supplied + (different chunking for native read vs reproject) + + :param pad: Optional padding in native pixels, if set will load extra + pixels beyond of what is needed to reproject to final + destination. This is useful when you plan to apply convolution + filter or morphological operators on input data. + + :param kw: Used to support old names ``dask_chunks`` and ``group_by`` + also kwargs for reproject ``tranform_code`` in the form of + "authority:code", e.g., "epsg:9688", and ``area_of_interest``, + e.g., [-180, -90, 180, 90] + + 1. Partition datasets by native Projection + 2. For every group do + - Load data + - Apply native_transform + - [Optional] fuse rasters that happened on the same day/time + - Reproject to final geobox + 3. Stack output of (2) + 4. 
[Optional] fuse rasters that happened on the same day/time + """ + if fuser is None: + fuser = _nodata_fuser + + if groupby is None: + groupby = kw.pop("group_by", "idx") + + if chunks is None: + chunks = kw.pop("dask_chunks", None) + + if load_chunks is None: + load_chunks = chunks + + _chunks = None + if chunks is not None: + _chunks = tuple( + getattr(geobox.shape, ax) if chunks.get(ax, -1) == -1 else chunks.get(ax) + for ax in ("y", "x") + ) + + _xx = [] + # fail if the intended transform not available + # to avoid any unexpected results + for xx in native_load( + dss, + bands, + geobox, + optional_bands=optional_bands, + basis=basis, + load_chunks=load_chunks, + pad=pad, + ): + extra_args = choose_transform_path( + xx.crs, + geobox.crs, + kw.pop("transform_code", None), + kw.pop("area_of_interest", None), + ) + extra_args.update(kw) + + yy = _apply_native_transform_1( + xx, + native_transform, + groupby=groupby, + fuser=fuser, + ) + + vars_to_scale = False + if isinstance(yy, xr.DataArray): + vars_to_scale = True + if yy.dtype == "bool": + yy = yy.astype("uint8") << 7 + else: + vars_to_scale = [var for var in yy.data_vars if yy[var].dtype == "bool"] + yy = yy.assign( + **{var: yy[var].astype("uint8") << 7 for var in vars_to_scale} + ) + + _yy = xr_reproject( + yy, + geobox, + resampling=resampling, + chunks=_chunks, + **extra_args, + ) + + if isinstance(_yy, xr.DataArray) and vars_to_scale: + _yy = _yy > 64 + elif vars_to_scale: + _yy = _yy.assign(**{var: _yy[var] > 64 for var in vars_to_scale}) + + _xx += [_yy] + + if len(_xx) == 1: + xx = _xx[0] + else: + xx = xr.concat(_xx, _xx[0].dims[0]) # type: ignore + if groupby != "idx": + xx = xx.groupby(groupby).map(fuser) + # TODO: probably want to replace spec MultiIndex with just `time` component + return xx + + +def load_enum_mask( + dss: list[Dataset], + band: str, + geobox: GeoBox, + *, + categories: Iterable[str | int], + invert: bool = False, + resampling: str = "nearest", + groupby: str | None = None, + chunks: dict[str, int] | None = None, + **kw, +) -> xr.DataArray: + """Load enumerated mask (like fmask). + + 1. Load each mask time slice separately in native projection of the file + 2. Convert enum to Boolean (F:0, T:255) + 3. Optionally (groupby='solar_day') group observations on the same day + using OR for pixel fusing: T,F->T + 4. Reproject to destination GeoBox (any resampling mode is ok) + 5. Optionally group observations on the same day using OR for pixel fusing T,F->T + 6. Finally convert to real Bool + """ + + def native_op(ds): + return ds.map( + enum_to_bool, + categories=categories, + invert=invert, + dtype="uint8", + value_true=255, + ) + + xx = load_with_native_transform( + dss, + (band,), + geobox, + native_op, + basis=band, + resampling=resampling, + groupby=groupby, + chunks=chunks, + fuser=_max_fuser, + **kw, + ) + return xx[band] > 127 + + +def load_enum_filtered( + dss: Sequence[Dataset], + band: str, + geobox: GeoBox, + *, + categories: Iterable[str | int], + filters: Iterable[tuple[str, int]] | None = None, + groupby: str | None = None, + resampling: str = "nearest", + chunks: dict[str, int] | None = None, + **kw, +) -> xr.DataArray: + """Load enumerated mask (like fmask/SCL) with native pixel filtering. + + The idea is to load "cloud" classes while adding some padding, then erase + pixels that were classified as cloud in any of the observations on a given + day. 
+ + This method converts enum-mask to a boolean image in the native projection + of the data and then reprojects boolean image to the final + projections/resolution. This allows one to use any resampling strategy, + like ``average`` or ``cubic`` and not be limited to a few resampling + strategies that support operations on categorical data. + + :param dss: A list of datasets to load + :param band: Which measurement band to load + :param geobox: GeoBox of the final output + :param categories: Enum values or names + + :param filters: iterable tuples of morphological operations in the order + you want them to perform, e.g., [("opening", 2), ("dilation", 5)] + :param groupby: One of 'solar_day'|'time'|'idx'|None + :param resampling: Any resampling mode supported by GDAL as a string: + nearest, bilinear, average, mode, cubic, etc... + :param chunks: If set use Dask, must be in dictionary form + ``{'x': 4000, 'y': 4000}`` + :param kw: Passed on to ``load_with_native_transform`` + + + 1. Load each mask time slice separately in native projection of the file + 2. Convert enum to Boolean + 3. Optionally (groupby='solar_day') group observations on the same day + using OR for pixel fusing: T,F->T + 4. Optionally apply ``mask_cleanup`` in native projection (after fusing) + 4. Reproject to destination GeoBox (any resampling mode is ok) + 5. Optionally group observations on the same day using OR for pixel fusing T,F->T + """ + + def native_op(xx: xr.Dataset) -> xr.Dataset: + _xx = enum_to_bool(xx[band], categories) + return xr.Dataset( + {band: _xx}, + attrs={"native": True}, # <- native flag needed for fuser + ) + + def fuser(xx: xr.Dataset) -> xr.Dataset: + """Fuse with OR. + + Fuse with OR, and when fusing in native pixel domain apply mask_cleanup if + requested + """ + is_native = xx.attrs.get("native", False) + xx = xx.map(_or_fuser) + xx.attrs.pop("native", None) + + if is_native and filters is not None: + _xx = xx[band] + assert isinstance(_xx, xr.DataArray) + xx[band] = mask_cleanup(_xx, mask_filters=filters) + + return xx + + # unless set by user to some value use largest filter radius for pad value + pad: int | None = kw.pop("pad", None) + if pad is None: + if filters is not None: + pad = max(list(zip(*filters, strict=False))[1]) # type: ignore + + xx = load_with_native_transform( + dss, + (band,), + geobox, + native_op, + fuser=fuser, + groupby=groupby, + resampling=resampling, + chunks=chunks, + pad=pad, + **kw, + )[band] + assert isinstance(xx, xr.DataArray) + return xx diff --git a/odc/stats/plugins/_base.py b/odc/stats/plugins/_base.py index 7d3cec67..f48bb6c6 100644 --- a/odc/stats/plugins/_base.py +++ b/odc/stats/plugins/_base.py @@ -6,7 +6,7 @@ from datacube.model import Dataset from odc.geo.geobox import GeoBox from odc.algo import to_rgba -from odc.algo.io import load_with_native_transform +from odc.stats.io import load_with_native_transform from odc.algo._masking import _nodata_fuser @@ -16,7 +16,7 @@ class StatsPluginInterface(ABC): VERSION = "0.0.0" PRODUCT_FAMILY = "statistics" - # pylint:disable=too-many-arguments + # pylint:disable=too-many-arguments,too-many-positional-arguments def __init__( self, resampling: str = "bilinear", diff --git a/tests/test_grouper.py b/tests/test_grouper.py new file mode 100644 index 00000000..644a8755 --- /dev/null +++ b/tests/test_grouper.py @@ -0,0 +1,70 @@ +# This file is part of the Open Data Cube, see https://opendatacube.org for more information +# +# Copyright (c) 2015-2025 ODC Contributors +# SPDX-License-Identifier: Apache-2.0 +import 
pytest +from datacube.testutils import mk_sample_dataset + +from odc.stats._grouper import group_by_nothing, key2num, mid_longitude, solar_offset +from odc.geo.geobox import GeoBox +from odc.geo.geom import box as geom_box + + +@pytest.mark.parametrize("lon,lat", [(0, 10), (100, -10), (-120, 30)]) +def test_mid_lon(lon, lat): + r = 0.1 + rect = geom_box(lon - r, lat - r, lon + r, lat + r, "epsg:4326") + assert rect.centroid.coords[0] == pytest.approx((lon, lat)) + + assert mid_longitude(rect) == pytest.approx(lon) + assert mid_longitude(rect.to_crs("epsg:3857")) == pytest.approx(lon) + + offset = solar_offset(rect, "h") + assert offset.seconds % (60 * 60) == 0 + + offset_sec = solar_offset(rect, "s") + assert abs((offset - offset_sec).seconds) <= 60 * 60 + + +@pytest.mark.parametrize( + "input_,expect", + [ + ("ABAAC", [0, 1, 0, 0, 2]), + ("B", [0]), + ([1, 1, 1], [0, 0, 0]), + ("ABCC", [0, 1, 2, 2]), + ], +) +def test_key2num(input_, expect): + rr = list(key2num(input_)) + assert rr == expect + + reverse = {} + rr = list(key2num(input_, reverse)) + assert rr == expect + assert set(reverse.keys()) == set(range(len(set(input_)))) + assert set(reverse.values()) == set(input_) + # first entry always gets an index of 0 + assert reverse[0] == input_[0] + + +@pytest.fixture +def sample_geobox(): + yield GeoBox.from_geopolygon(geom_box(-10, -20, 11, 22, "epsg:4326"), resolution=1) + + +@pytest.fixture +def sample_ds(sample_geobox): + yield mk_sample_dataset([{"name": "red"}], geobox=sample_geobox) + + +def test_grouper(sample_ds): + xx = group_by_nothing([sample_ds]) + assert xx.values[0] == (sample_ds,) + assert xx.uuid.values[0] == sample_ds.id + + xx = group_by_nothing([sample_ds, sample_ds], solar_offset(sample_ds.extent)) + assert xx.values[0] == (sample_ds,) + assert xx.values[0] == (sample_ds,) + assert xx.uuid.values[1] == sample_ds.id + assert xx.uuid.values[1] == sample_ds.id diff --git a/tests/test_io.py b/tests/test_io.py new file mode 100644 index 00000000..65cac9ef --- /dev/null +++ b/tests/test_io.py @@ -0,0 +1,37 @@ +import pytest + +from odc.stats.io import choose_transform_path + + +@pytest.mark.parametrize("transform_code", ["EPSG:9688", "EPSG:1150"]) +@pytest.mark.parametrize("area_of_interest", [None, [-180, -90, 180, 90]]) +def test_choose_transform_path(transform_code, area_of_interest): + src_crs = "EPSG:32649" + dst_crs = "EPSG:3577" + proj_str = { + "9688": "+proj=pipeline +step +inv +proj=utm +zone=49 +ellps=WGS84 " + "+step +proj=push +v_3 " + "+step +proj=cart +ellps=WGS84 " + "+step +inv +proj=helmert +x=0.06155 +y=-0.01087 +z=-0.04019 " + "+rx=-0.0394924 +ry=-0.0327221 +rz=-0.0328979 +s=-0.009994 " + "+convention=coordinate_frame " + "+step +inv +proj=cart +ellps=GRS80 " + "+step +proj=pop +v_3 " + "+step +proj=aea +lat_0=0 +lon_0=132 +lat_1=-18 +lat_2=-36 " + "+x_0=0 +y_0=0 +ellps=GRS80", + "1150": "+proj=pipeline +step +inv +proj=utm +zone=49 +ellps=WGS84 " + "+step +proj=aea +lat_0=0 +lon_0=132 +lat_1=-18 +lat_2=-36 " + "+x_0=0 +y_0=0 +ellps=GRS80", + } + if transform_code is None and area_of_interest is None: + assert ( + choose_transform_path(src_crs, dst_crs, transform_code, area_of_interest) + == {} + ) + elif area_of_interest is None: + with pytest.raises(ValueError): + choose_transform_path(src_crs, dst_crs, transform_code, area_of_interest) + else: + assert choose_transform_path( + src_crs, dst_crs, transform_code, area_of_interest + ) == {"COORDINATE_OPERATION": proj_str.get(transform_code.split(":")[1], "")} From 149444c0d1c5c90b859875c213041f2070fbd3d4 
Mon Sep 17 00:00:00 2001 From: Emma Ai Date: Fri, 21 Feb 2025 05:39:20 +0000 Subject: [PATCH 09/37] delete cached stage periodically --- .github/workflows/delete-old-stage.yml | 47 ++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 .github/workflows/delete-old-stage.yml diff --git a/.github/workflows/delete-old-stage.yml b/.github/workflows/delete-old-stage.yml new file mode 100644 index 00000000..0ed31705 --- /dev/null +++ b/.github/workflows/delete-old-stage.yml @@ -0,0 +1,47 @@ +name: Delete Old GHCR Stage + +on: + schedule: + - cron: "0 0 * * 0" # Runs every Sunday at midnight (UTC) + workflow_dispatch: # Allows manual trigger + +permissions: + packages: write + contents: read + +jobs: + delete-ghcr-image: + runs-on: ubuntu-latest + steps: + - name: Authenticate with GitHub API + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: gh auth status + + - name: Get Image Versions + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IMAGE_NAME: "odc-stats-stages" + run: | + gh api --paginate /user/packages/container/$IMAGE_NAME/versions > versions.json + cat versions.json | jq '.[] | {id, created_at}' + + - name: Delete Old Image Versions + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IMAGE_NAME: "odc-stats-stages" + DAYS_THRESHOLD: 7 + run: | + NOW=$(date -u +%s) + THRESHOLD=$((NOW - DAYS_THRESHOLD * 86400)) + + for row in $(cat versions.json | jq -c '.[]'); do + ID=$(echo $row | jq -r '.id') + CREATED_AT=$(echo $row | jq -r '.created_at') + CREATED_TIME=$(date -d "$CREATED_AT" +%s) + + if [[ $CREATED_TIME -lt $THRESHOLD ]]; then + echo "Deleting image version $ID (created at $CREATED_AT)" + gh api --method DELETE /user/packages/container/$IMAGE_NAME/versions/$ID + fi + done From f8136ebf97b0fa7bbc3d0ab54e723bf837ee951c Mon Sep 17 00:00:00 2001 From: Emma Ai Date: Fri, 21 Feb 2025 05:42:06 +0000 Subject: [PATCH 10/37] rename docker image workflow yaml --- .github/workflows/delete-old-stage.yml | 47 -------------------------- 1 file changed, 47 deletions(-) delete mode 100644 .github/workflows/delete-old-stage.yml diff --git a/.github/workflows/delete-old-stage.yml b/.github/workflows/delete-old-stage.yml deleted file mode 100644 index 0ed31705..00000000 --- a/.github/workflows/delete-old-stage.yml +++ /dev/null @@ -1,47 +0,0 @@ -name: Delete Old GHCR Stage - -on: - schedule: - - cron: "0 0 * * 0" # Runs every Sunday at midnight (UTC) - workflow_dispatch: # Allows manual trigger - -permissions: - packages: write - contents: read - -jobs: - delete-ghcr-image: - runs-on: ubuntu-latest - steps: - - name: Authenticate with GitHub API - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: gh auth status - - - name: Get Image Versions - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - IMAGE_NAME: "odc-stats-stages" - run: | - gh api --paginate /user/packages/container/$IMAGE_NAME/versions > versions.json - cat versions.json | jq '.[] | {id, created_at}' - - - name: Delete Old Image Versions - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - IMAGE_NAME: "odc-stats-stages" - DAYS_THRESHOLD: 7 - run: | - NOW=$(date -u +%s) - THRESHOLD=$((NOW - DAYS_THRESHOLD * 86400)) - - for row in $(cat versions.json | jq -c '.[]'); do - ID=$(echo $row | jq -r '.id') - CREATED_AT=$(echo $row | jq -r '.created_at') - CREATED_TIME=$(date -d "$CREATED_AT" +%s) - - if [[ $CREATED_TIME -lt $THRESHOLD ]]; then - echo "Deleting image version $ID (created at $CREATED_AT)" - gh api --method DELETE /user/packages/container/$IMAGE_NAME/versions/$ID - fi - done From 
9aed4f629e3bb015eca819e8b72aa9c29c2e89bc Mon Sep 17 00:00:00 2001 From: Emma Ai Date: Fri, 28 Feb 2025 05:25:15 +0000 Subject: [PATCH 11/37] update basics with pass --- docker/Dockerfile | 10 +++++----- docker/env.yaml | 14 +++++++------- docker/requirements.txt | 24 +++++++++++------------- 3 files changed, 23 insertions(+), 25 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index c30fcf89..7d8ecce0 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -12,13 +12,13 @@ ARG UPDATE_VERSION=1 COPY requirements.txt /conf/ # required to build numexpr # or any --no-binary -ENV CC=/env/bin/x86_64-conda_cos6-linux-gnu-gcc \ - CXX=/env/bin/x86_64-conda_cos6-linux-gnu-g++ \ - LDSHARED="/env/bin/x86_64-conda_cos6-linux-gnu-gcc -pthread -shared -B /env/compiler_compat -L/env/lib -Wl,-rpath=/env/lib -Wl,--no-as-needed" +ENV CC=/env/bin/x86_64-conda-linux-gnu-gcc \ + CXX=/env/bin/x86_64-conda-linux-gnu-g++ \ + LDSHARED="/env/bin/x86_64-conda-linux-gnu-gcc -pthread -shared -B /env/compiler_compat -L/env/lib -Wl,-rpath=/env/lib -Wl,--no-as-needed" RUN micromamba run -p /env pip install --no-cache-dir \ --no-build-isolation -r /conf/requirements.txt -FROM ubuntu:jammy-20240212 +FROM ubuntu:noble-20250127 COPY --from=stats-conda /env /env COPY distributed.yaml /etc/dask/ @@ -29,4 +29,4 @@ ENV GDAL_DRIVER_PATH=/env/lib/gdalplugins \ WORKDIR /tmp -RUN odc-stats --version +# RUN odc-stats --version diff --git a/docker/env.yaml b/docker/env.yaml index 758d2764..9efc8115 100644 --- a/docker/env.yaml +++ b/docker/env.yaml @@ -2,7 +2,7 @@ name: env channels: - conda-forge dependencies: - - python=3.10 + - python=3.12 - libgdal - gdal>=3.8 - proj @@ -48,7 +48,7 @@ dependencies: - dask-image - deepdiff - defusedxml - - distributed<2024.11.0 + - distributed - docutils - fiona - Flask @@ -80,7 +80,7 @@ dependencies: - netCDF4 - networkx # for the sake of geomedian address this later - - numpy<2.0 + - numpy - ordered-set - packaging - pandas @@ -93,7 +93,7 @@ dependencies: - pyasn1 - pydantic - pyerfa - - pyparsing=2.4.7 + - pyparsing - pyproj - pyrsistent - pystac<1.12 @@ -107,12 +107,12 @@ dependencies: - regex - requests - rsa - - ruamel.yaml + - ruamel.yaml<0.18 - ruamel.yaml.clib - s3fs - s3transfer - scikit-image - - scipy<1.15 + - scipy - sentry-sdk - setuptools-scm - Shapely>=2.0 @@ -120,7 +120,7 @@ dependencies: - slicerator - snuggs - sortedcontainers - - SQLAlchemy<2.0 + - SQLAlchemy - structlog - tblib - text-unidecode diff --git a/docker/requirements.txt b/docker/requirements.txt index b3bef6da..7e7c9ef9 100644 --- a/docker/requirements.txt +++ b/docker/requirements.txt @@ -1,22 +1,20 @@ --extra-index-url https://packages.dea.ga.gov.au/ -# most of these don't need to pulled from here... 
-datacube-ows>=1.9 -datacube[performance,s3]>=1.9 -eodatasets3>=1.9 -hdstats==0.1.8.post1 +# odc-stac is in PyPI +# odc-stats[ows] + +# For ML +ai-edge-litert +datacube-ows +datacube[performance,s3] +# For testing +eodatasets3 @ git+https://github.com/opendatacube/eo-datasets +# hdstats==0.1.8.post1 numexpr @ git+https://github.com/pydata/numexpr@a99412e odc-algo @ git+https://github.com/opendatacube/odc-algo@adb1856 odc-apps-cloud>=0.2.2 -# For testing -odc-apps-dc-tools>=1.9 +odc-apps-dc-tools>=0.2.12 odc-cloud>=0.2.5 odc-dscache>=0.2.3 odc-stac @ git+https://github.com/opendatacube/odc-stac@69bdf64 - -# odc-stac is in PyPI -odc-stats[ows] - -# For ML -tflite-runtime tl2cgen From 27e344f5136915dc100b3371865eff2dbf135e71 Mon Sep 17 00:00:00 2001 From: Emma Ai Date: Fri, 28 Feb 2025 05:35:34 +0000 Subject: [PATCH 12/37] update tflite runtime name --- docker/requirements.txt | 3 +++ setup.cfg | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/docker/requirements.txt b/docker/requirements.txt index 7e7c9ef9..b6e7e1b0 100644 --- a/docker/requirements.txt +++ b/docker/requirements.txt @@ -17,4 +17,7 @@ odc-apps-dc-tools>=0.2.12 odc-cloud>=0.2.5 odc-dscache>=0.2.3 odc-stac @ git+https://github.com/opendatacube/odc-stac@69bdf64 + +# odc-stac is in PyPI +odc-stats @ git+https://github.com/opendatacube/odc-stats@major_upgrade_in_progress tl2cgen diff --git a/setup.cfg b/setup.cfg index 0b880771..4506cff2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -42,7 +42,7 @@ install_requires = fiona rasterio>=1.3.2 s3fs - tflite-runtime + ai-edge-litert tl2cgen [options.entry_points] From 1f973d63da7a5ce2fd014b475d41013611f30144 Mon Sep 17 00:00:00 2001 From: Emma Ai Date: Thu, 27 Mar 2025 04:35:11 +0000 Subject: [PATCH 13/37] add back stats and install newest numexpr --- docker/Dockerfile | 2 +- docker/requirements.txt | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 7d8ecce0..f11d7b24 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -29,4 +29,4 @@ ENV GDAL_DRIVER_PATH=/env/lib/gdalplugins \ WORKDIR /tmp -# RUN odc-stats --version +RUN odc-stats --version diff --git a/docker/requirements.txt b/docker/requirements.txt index b6e7e1b0..50a3657e 100644 --- a/docker/requirements.txt +++ b/docker/requirements.txt @@ -10,7 +10,6 @@ datacube[performance,s3] # For testing eodatasets3 @ git+https://github.com/opendatacube/eo-datasets # hdstats==0.1.8.post1 -numexpr @ git+https://github.com/pydata/numexpr@a99412e odc-algo @ git+https://github.com/opendatacube/odc-algo@adb1856 odc-apps-cloud>=0.2.2 odc-apps-dc-tools>=0.2.12 From 0828840e80078a8e9a1a4432b402d2a453ea6243 Mon Sep 17 00:00:00 2001 From: Emma Ai Date: Thu, 17 Apr 2025 03:33:38 +0000 Subject: [PATCH 14/37] switch from ubuntu to debian --- docker/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index f11d7b24..da9eb7d2 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,4 +1,4 @@ -FROM mambaorg/micromamba:git-df79b72-jammy AS stats-conda +FROM mambaorg/micromamba:git-fddee42-debian12-slim AS stats-conda USER root COPY env.yaml /conf/ @@ -18,7 +18,7 @@ ENV CC=/env/bin/x86_64-conda-linux-gnu-gcc \ RUN micromamba run -p /env pip install --no-cache-dir \ --no-build-isolation -r /conf/requirements.txt -FROM ubuntu:noble-20250127 +FROM debian:12.10-slim COPY --from=stats-conda /env /env COPY distributed.yaml /etc/dask/ From f368d448b0fa11ff887fa7416774b3d812f42693 Mon Sep 17 00:00:00 2001 From: Emma Ai 
Date: Thu, 17 Apr 2025 03:38:39 +0000 Subject: [PATCH 15/37] remove odc-stats --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index da9eb7d2..59845b7e 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -29,4 +29,4 @@ ENV GDAL_DRIVER_PATH=/env/lib/gdalplugins \ WORKDIR /tmp -RUN odc-stats --version +# RUN odc-stats --version From 5b784739c2517c4c54139393e587c06d256cc860 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 30 May 2025 06:45:16 +0000 Subject: [PATCH 16/37] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- odc/stats/model.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/odc/stats/model.py b/odc/stats/model.py index 2cf2621e..785009c5 100644 --- a/odc/stats/model.py +++ b/odc/stats/model.py @@ -319,13 +319,7 @@ def _lineage(self) -> Tuple[UUID, ...]: # TODO: replace this and test # if 'fused' in ds.metadata._doc['properties'].keys(): if "fused" in ds.product.name: - lineage = tuple( - set( - x - for ds in self.datasets - for x in ds.metadata.sources - ) - ) + lineage = tuple(set(x for ds in self.datasets for x in ds.metadata.sources)) else: lineage = tuple(ds.id for ds in self.datasets) @@ -542,7 +536,9 @@ def render_metadata( ) ProjectionExtension.add_to(item) proj_ext = ProjectionExtension.ext(item) - proj_ext.apply(epsg=geobox.crs.epsg, transform=geobox.transform, shape=list(geobox.shape)) + proj_ext.apply( + epsg=geobox.crs.epsg, transform=geobox.transform, shape=list(geobox.shape) + ) # Lineage last item.properties["odc:lineage"] = {"inputs": inputs} From 5807a1966d5feeb955c7ca95667637a55b888bb1 Mon Sep 17 00:00:00 2001 From: Emma Ai Date: Fri, 30 May 2025 06:52:43 +0000 Subject: [PATCH 17/37] fix plugin import --- odc/stats/plugins/lc_ml_treelite.py | 2 +- odc/stats/plugins/lc_tf_urban.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/odc/stats/plugins/lc_ml_treelite.py b/odc/stats/plugins/lc_ml_treelite.py index b898c81f..f102dc7f 100644 --- a/odc/stats/plugins/lc_ml_treelite.py +++ b/odc/stats/plugins/lc_ml_treelite.py @@ -16,7 +16,7 @@ from datacube.model import Dataset from odc.geo.geobox import GeoBox from odc.algo._memsink import yxbt_sink, yxt_sink -from odc.algo.io import load_with_native_transform +from odc.stats.io import load_with_native_transform from odc.stats._algebra import expr_eval from odc.stats.model import DateTimeRange diff --git a/odc/stats/plugins/lc_tf_urban.py b/odc/stats/plugins/lc_tf_urban.py index 6c63a959..dc195786 100644 --- a/odc/stats/plugins/lc_tf_urban.py +++ b/odc/stats/plugins/lc_tf_urban.py @@ -14,7 +14,7 @@ from datacube.model import Dataset from odc.geo.geobox import GeoBox from odc.algo._memsink import yxbt_sink -from odc.algo.io import load_with_native_transform +from odc.stats.io import load_with_native_transform from odc.stats._algebra import expr_eval from ._registry import StatsPluginInterface, register From 695b7cb80fe29492fc2f88a1e44465b29d783be6 Mon Sep 17 00:00:00 2001 From: Emma Ai Date: Thu, 12 Jun 2025 08:26:22 +0000 Subject: [PATCH 18/37] upgrade to odc>1.9 --- docker/Dockerfile | 12 ++++++------ docker/env.yaml | 14 +++++++------- docker/requirements.txt | 27 +++++++++++++-------------- setup.cfg | 11 ++++++----- tests/requirements.txt | 7 ++++--- 5 files changed, 36 insertions(+), 35 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 
59845b7e..c30fcf89 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,4 +1,4 @@ -FROM mambaorg/micromamba:git-fddee42-debian12-slim AS stats-conda +FROM mambaorg/micromamba:git-df79b72-jammy AS stats-conda USER root COPY env.yaml /conf/ @@ -12,13 +12,13 @@ ARG UPDATE_VERSION=1 COPY requirements.txt /conf/ # required to build numexpr # or any --no-binary -ENV CC=/env/bin/x86_64-conda-linux-gnu-gcc \ - CXX=/env/bin/x86_64-conda-linux-gnu-g++ \ - LDSHARED="/env/bin/x86_64-conda-linux-gnu-gcc -pthread -shared -B /env/compiler_compat -L/env/lib -Wl,-rpath=/env/lib -Wl,--no-as-needed" +ENV CC=/env/bin/x86_64-conda_cos6-linux-gnu-gcc \ + CXX=/env/bin/x86_64-conda_cos6-linux-gnu-g++ \ + LDSHARED="/env/bin/x86_64-conda_cos6-linux-gnu-gcc -pthread -shared -B /env/compiler_compat -L/env/lib -Wl,-rpath=/env/lib -Wl,--no-as-needed" RUN micromamba run -p /env pip install --no-cache-dir \ --no-build-isolation -r /conf/requirements.txt -FROM debian:12.10-slim +FROM ubuntu:jammy-20240212 COPY --from=stats-conda /env /env COPY distributed.yaml /etc/dask/ @@ -29,4 +29,4 @@ ENV GDAL_DRIVER_PATH=/env/lib/gdalplugins \ WORKDIR /tmp -# RUN odc-stats --version +RUN odc-stats --version diff --git a/docker/env.yaml b/docker/env.yaml index 9efc8115..758d2764 100644 --- a/docker/env.yaml +++ b/docker/env.yaml @@ -2,7 +2,7 @@ name: env channels: - conda-forge dependencies: - - python=3.12 + - python=3.10 - libgdal - gdal>=3.8 - proj @@ -48,7 +48,7 @@ dependencies: - dask-image - deepdiff - defusedxml - - distributed + - distributed<2024.11.0 - docutils - fiona - Flask @@ -80,7 +80,7 @@ dependencies: - netCDF4 - networkx # for the sake of geomedian address this later - - numpy + - numpy<2.0 - ordered-set - packaging - pandas @@ -93,7 +93,7 @@ dependencies: - pyasn1 - pydantic - pyerfa - - pyparsing + - pyparsing=2.4.7 - pyproj - pyrsistent - pystac<1.12 @@ -107,12 +107,12 @@ dependencies: - regex - requests - rsa - - ruamel.yaml<0.18 + - ruamel.yaml - ruamel.yaml.clib - s3fs - s3transfer - scikit-image - - scipy + - scipy<1.15 - sentry-sdk - setuptools-scm - Shapely>=2.0 @@ -120,7 +120,7 @@ dependencies: - slicerator - snuggs - sortedcontainers - - SQLAlchemy + - SQLAlchemy<2.0 - structlog - tblib - text-unidecode diff --git a/docker/requirements.txt b/docker/requirements.txt index 50a3657e..114c45c4 100644 --- a/docker/requirements.txt +++ b/docker/requirements.txt @@ -1,22 +1,21 @@ --extra-index-url https://packages.dea.ga.gov.au/ -# odc-stac is in PyPI -# odc-stats[ows] - -# For ML -ai-edge-litert -datacube-ows -datacube[performance,s3] -# For testing -eodatasets3 @ git+https://github.com/opendatacube/eo-datasets -# hdstats==0.1.8.post1 -odc-algo @ git+https://github.com/opendatacube/odc-algo@adb1856 +datacube-ows>=1.9 +datacube[performance,s3]>=1.9 +eodatasets3>1.9 +hdstats==0.1.8.post1 +odc-algo @ git+https://github.com/opendatacube/odc-algo@baacbb odc-apps-cloud>=0.2.2 +# For testing odc-apps-dc-tools>=0.2.12 odc-cloud>=0.2.5 -odc-dscache>=0.2.3 -odc-stac @ git+https://github.com/opendatacube/odc-stac@69bdf64 +odc-dscache>=1.9 +odc-geo>=0.5.0rc1 +odc-stac>=0.4.0 # odc-stac is in PyPI -odc-stats @ git+https://github.com/opendatacube/odc-stats@major_upgrade_in_progress +odc-stats[ows] + +# For ML +tflite-runtime tl2cgen diff --git a/setup.cfg b/setup.cfg index 4506cff2..5274d286 100644 --- a/setup.cfg +++ b/setup.cfg @@ -16,7 +16,7 @@ url = https://github.com/opendatacube/odc-stats/ include_package_data = true zip_safe = false packages = find_namespace: -python_requires = >=3.10 +python_requires = >=3.8 
tests_require = pytest install_requires = botocore @@ -27,12 +27,13 @@ install_requires = numpy odc-cloud[ASYNC]>=0.2.5 odc_algo - odc_dscache>=0.2.3 + odc_dscache>=1.9 odc_io odc_stac + odc-geo pandas pystac>=1.1.0 - eodatasets3>=1.9 + eodatasets3>=0.22.0 toolz tqdm xarray>=2023.1.0 @@ -42,7 +43,7 @@ install_requires = fiona rasterio>=1.3.2 s3fs - ai-edge-litert + tflite-runtime tl2cgen [options.entry_points] @@ -56,7 +57,7 @@ include = # datacube_ows<1.8.21 has issue on function config internal over-writing [options.extras_require] ows = - datacube_ows>=1.9 + datacube_ows>=1.8.21 sentry-sdk blinker diff --git a/tests/requirements.txt b/tests/requirements.txt index 2d44bc34..2d5af0d8 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://packages.dea.ga.gov.au/ -datacube @ git+https://github.com/opendatacube/datacube-core@dc7f5e9 +datacube>=1.9 datacube-ows>=1.9 # for pytest-depends deepdiff @@ -9,9 +9,10 @@ mock moto networkx numpy<2.0 -# odc-algo @ git+https://github.com/opendatacube/odc-algo@integrate_1.9 +odc-algo @ git+https://github.com/opendatacube/odc-algo@baacbb odc-cloud>=0.2.5 -# odc-dscache @ git+https://github.com/opendatacube/odc-dscache@integrate_1.9 +odc-dscache>=1.9 +odc-geo>=0.5.0rc1 # For tests pytest From f7fca140ad0eb2d137fc0503130ea7c94e707f10 Mon Sep 17 00:00:00 2001 From: Emma Ai Date: Thu, 12 Jun 2025 08:30:22 +0000 Subject: [PATCH 19/37] update integration test --- tests/integration_test.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/integration_test.sh b/tests/integration_test.sh index 3ef15517..40bbe32f 100755 --- a/tests/integration_test.sh +++ b/tests/integration_test.sh @@ -12,27 +12,27 @@ odc-stats --version echo "Test LS GeoMAD" odc-stats save-tasks --config https://raw.githubusercontent.com/GeoscienceAustralia/dea-config/709daaee176c04e33de4cc9600462717cca5b34d/dev/services/odc-stats/geomedian/ga_ls8c_nbart_gm_cyear_3.yaml --year=2015 --tiles 49:50,24:25 --overwrite ls-geomad-cyear.db -odc-stats run --threads=8 --config https://raw.githubusercontent.com/GeoscienceAustralia/dea-config/709daaee176c04e33de4cc9600462717cca5b34d/dev/services/odc-stats/geomedian/ga_ls8c_nbart_gm_cyear_3.yaml --location file:///tmp --overwrite ls-geomad-cyear.db +odc-stats run --threads=1 --config https://raw.githubusercontent.com/GeoscienceAustralia/dea-config/709daaee176c04e33de4cc9600462717cca5b34d/dev/services/odc-stats/geomedian/ga_ls8c_nbart_gm_cyear_3.yaml --location file:///tmp --overwrite ls-geomad-cyear.db ./tests/compare_data.sh /tmp/x49/y24/ ga_ls8c_nbart_gm_cyear_3_x49y24_2015--P1Y_final*.tif echo "Test LS WO summary" odc-stats save-tasks --config https://raw.githubusercontent.com/GeoscienceAustralia/dea-config/709daaee176c04e33de4cc9600462717cca5b34d/dev/services/odc-stats/wofs_summary/ga_ls_wo_fq_cyear_3.yaml --year=2015 --tiles 49:50,24:25 --overwrite ls-wofs-cyear.db -odc-stats run --threads=8 --config https://raw.githubusercontent.com/GeoscienceAustralia/dea-config/709daaee176c04e33de4cc9600462717cca5b34d/dev/services/odc-stats/wofs_summary/ga_ls_wo_fq_cyear_3.yaml --location file:///tmp --overwrite ls-wofs-cyear.db +odc-stats run --threads=1 --config https://raw.githubusercontent.com/GeoscienceAustralia/dea-config/709daaee176c04e33de4cc9600462717cca5b34d/dev/services/odc-stats/wofs_summary/ga_ls_wo_fq_cyear_3.yaml --location file:///tmp --overwrite ls-wofs-cyear.db ./tests/compare_data.sh /tmp/x49/y24/ ga_ls_wo_fq_cyear_3_x49y24_2015--P1Y_fina*.tif echo "Test LS FC 
percentile" odc-stats save-tasks --config https://raw.githubusercontent.com/GeoscienceAustralia/dea-config/709daaee176c04e33de4cc9600462717cca5b34d/dev/services/odc-stats/fc_percentile/ga_ls_fc_pc_cyear_3.yaml --year=2015 --tiles 49:50,24:25 --overwrite ls-fcp-cyear.db -odc-stats run --threads=8 --config https://raw.githubusercontent.com/GeoscienceAustralia/dea-config/709daaee176c04e33de4cc9600462717cca5b34d/dev/services/odc-stats/fc_percentile/ga_ls_fc_pc_cyear_3.yaml --location file:///tmp --overwrite ls-fcp-cyear.db +odc-stats run --threads=1 --config https://raw.githubusercontent.com/GeoscienceAustralia/dea-config/709daaee176c04e33de4cc9600462717cca5b34d/dev/services/odc-stats/fc_percentile/ga_ls_fc_pc_cyear_3.yaml --location file:///tmp --overwrite ls-fcp-cyear.db ./tests/compare_data.sh /tmp/x49/y24/ ga_ls_fc_pc_cyear_3_x49y24_2015--P1Y_final*.tif echo "Test LS TC percentile" odc-stats save-tasks --config https://raw.githubusercontent.com/GeoscienceAustralia/dea-config/709daaee176c04e33de4cc9600462717cca5b34d/dev/services/odc-stats/tc_percentile/ga_ls_tc_pc_cyear_3.yaml --year=2015 --tiles 49:50,24:25 --overwrite ls-tcp-cyear.db -odc-stats run --threads=8 --config https://raw.githubusercontent.com/GeoscienceAustralia/dea-config/709daaee176c04e33de4cc9600462717cca5b34d/dev/services/odc-stats/tc_percentile/ga_ls_tc_pc_cyear_3.yaml --location file:///tmp --overwrite ls-tcp-cyear.db +odc-stats run --threads=1 --config https://raw.githubusercontent.com/GeoscienceAustralia/dea-config/709daaee176c04e33de4cc9600462717cca5b34d/dev/services/odc-stats/tc_percentile/ga_ls_tc_pc_cyear_3.yaml --location file:///tmp --overwrite ls-tcp-cyear.db ./tests/compare_data.sh /tmp/x49/y24/ ga_ls_tc_pc_cyear_3_x49y24_2015--P1Y_final*.tif From 9c25437012c20abd8de223e28f780c920943a203 Mon Sep 17 00:00:00 2001 From: Emma Ai Date: Fri, 28 Mar 2025 17:31:37 +1030 Subject: [PATCH 20/37] add indexing test (#191) Co-authored-by: Emma Ai --- tests/integration_test.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/integration_test.sh b/tests/integration_test.sh index 40bbe32f..3fcd5ef6 100755 --- a/tests/integration_test.sh +++ b/tests/integration_test.sh @@ -36,6 +36,15 @@ odc-stats run --threads=1 --config https://raw.githubusercontent.com/Geoscience ./tests/compare_data.sh /tmp/x49/y24/ ga_ls_tc_pc_cyear_3_x49y24_2015--P1Y_final*.tif +echo "Test Indexing" + +datacube product add https://raw.githubusercontent.com/GeoscienceAustralia/dea-config/5681c4bdced2fa5262554feac66e732d8bb824f9/products/baseline_satellite_data/geomedian-au/ga_ls8c_nbart_gm_cyear_3.odc-product.yaml +datacube product add https://raw.githubusercontent.com/GeoscienceAustralia/dea-config/5681c4bdced2fa5262554feac66e732d8bb824f9/products/inland_water/wofs/ga_ls_wo_fq_cyear_3.odc-product.yaml +datacube product add https://raw.githubusercontent.com/GeoscienceAustralia/dea-config/5681c4bdced2fa5262554feac66e732d8bb824f9/products/land_and_vegetation/fc/ga_ls_fc_pc_cyear_3.odc-product.yaml +datacube product add https://raw.githubusercontent.com/GeoscienceAustralia/dea-config/5681c4bdced2fa5262554feac66e732d8bb824f9/products/inland_water/c3_tc/ga_ls_tc_pc_cyear_3.odc-product.yaml + +fs-to-dc --stac /tmp/x49/y24/ + # echo "Test S2 GeoMAD" # # use au-30 to save cost # odc-stats save-tasks --config https://raw.githubusercontent.com/GeoscienceAustralia/dea-config/feature/add-S2ab-GM-processing-cfg/dev/services/odc-stats/geomedian/ga_s2ab_gm_4fyear_3.yaml --input-products ga_s2am_ard_3 --grid au-30 --year=2020 --tiles 43:44,15:16 --overwrite 
s2-geomad-cyear.db From e86f22c988f83a894db5824793a451b587a9418d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 12 May 2025 14:16:42 +0930 Subject: [PATCH 21/37] [pre-commit.ci] pre-commit autoupdate (#193) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/adrienverge/yamllint.git: v1.37.0 → v1.37.1](https://github.com/adrienverge/yamllint.git/compare/v1.37.0...v1.37.1) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 90b30f5f..bc1125a4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,7 +3,7 @@ ci: repos: - repo: https://github.com/adrienverge/yamllint.git - rev: v1.37.0 + rev: v1.37.1 hooks: - id: yamllint args: ['-c', '.yamllint'] From b0201723104a895960ab52c127c0ac256a44dec9 Mon Sep 17 00:00:00 2001 From: Emma Ai Date: Fri, 13 Jun 2025 01:55:49 +0000 Subject: [PATCH 22/37] update dependencies in test --- docker/env.yaml | 2 +- tests/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/env.yaml b/docker/env.yaml index 758d2764..f1d7d567 100644 --- a/docker/env.yaml +++ b/docker/env.yaml @@ -48,7 +48,7 @@ dependencies: - dask-image - deepdiff - defusedxml - - distributed<2024.11.0 + - distributed - docutils - fiona - Flask diff --git a/tests/requirements.txt b/tests/requirements.txt index 2d5af0d8..0328cac4 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://packages.dea.ga.gov.au/ -datacube>=1.9 +datacube @ git+https://github.com/opendatacube/datacube-core@565de91 datacube-ows>=1.9 # for pytest-depends deepdiff From 02b01066cb159c6fe255427d494ecb72efe98e2c Mon Sep 17 00:00:00 2001 From: Emma Ai Date: Fri, 13 Jun 2025 02:03:35 +0000 Subject: [PATCH 23/37] remove dask client from unit test --- tests/test_gm_ls.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/test_gm_ls.py b/tests/test_gm_ls.py index a4c4c210..cc45504a 100644 --- a/tests/test_gm_ls.py +++ b/tests/test_gm_ls.py @@ -7,10 +7,6 @@ from odc.stats.model import product_for_plugin from odc.stats.plugins.gm import StatsGMLS from odc.stats.tasks import TaskReader -from dask.distributed import Client, LocalCluster - -cluster = LocalCluster(n_workers=1, threads_per_worker=2) -client = Client(cluster) @pytest.fixture @@ -208,7 +204,6 @@ def test_no_data_value(monkeypatch): task.datasets = task.datasets[2:3] xx_0_0 = gm_ls.input_data(task.datasets, task.geobox) - print(xx_0_0.compute()) xx_0_0 = xx_0_0.sel( indexers={"x": slice(None, None, 100), "y": slice(None, None, 100)} ) From 9451e14be1c922a5a71ca9c51cef814ec41bf5c2 Mon Sep 17 00:00:00 2001 From: Emma Ai Date: Mon, 16 Jun 2025 04:43:16 +0000 Subject: [PATCH 24/37] update odc-algo hash --- docker/requirements.txt | 4 ++-- tests/requirements.txt | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/requirements.txt b/docker/requirements.txt index 114c45c4..496db443 100644 --- a/docker/requirements.txt +++ b/docker/requirements.txt @@ -1,10 +1,10 @@ --extra-index-url https://packages.dea.ga.gov.au/ datacube-ows>=1.9 -datacube[performance,s3]>=1.9 +datacube[performance,s3] @ git+https://github.com/opendatacube/datacube-core@565de91 eodatasets3>1.9 hdstats==0.1.8.post1 -odc-algo @ 
git+https://github.com/opendatacube/odc-algo@baacbb +odc-algo @ git+https://github.com/opendatacube/odc-algo@8cfaf89 odc-apps-cloud>=0.2.2 # For testing odc-apps-dc-tools>=0.2.12 diff --git a/tests/requirements.txt b/tests/requirements.txt index 0328cac4..6c46d271 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -9,7 +9,7 @@ mock moto networkx numpy<2.0 -odc-algo @ git+https://github.com/opendatacube/odc-algo@baacbb +odc-algo @ git+https://github.com/opendatacube/odc-algo@8cfaf89 odc-cloud>=0.2.5 odc-dscache>=1.9 odc-geo>=0.5.0rc1 From 9a33abec97d505bb0797b9fd12c654cefe2d8176 Mon Sep 17 00:00:00 2001 From: Emma Ai Date: Mon, 16 Jun 2025 04:51:24 +0000 Subject: [PATCH 25/37] update env variable with datacube>1.9 --- .github/workflows/main.yml | 8 ++++---- docker/docker-compose.yml | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 16085aba..5e6f6f53 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -135,10 +135,10 @@ jobs: ./tests/integration_test.sh env: - DB_HOSTNAME: postgres - DB_USERNAME: opendatacube - DB_PASSWORD: opendatacubepassword - DB_DATABASE: opendatacube + ODC_DEFAULT_DB_HOSTNAME: postgres + ODC_DEFAULT_DB_USERNAME: opendatacube + ODC_DEFAULT_DB_PASSWORD: opendatacubepassword + ODC_DEFAULT_DB_DATABASE: opendatacube AWS_NO_SIGN_REQUEST: true AWS_DEFAULT_REGION: ap-southeast-2 AWS_REGION: ap-southeast-2 diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 4ba9a258..0aad6e28 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -12,10 +12,10 @@ services: build: context: . environment: - - DB_HOSTNAME=postgres - - DB_USERNAME=opendatacube - - DB_PASSWORD=opendatacubepassword - - DB_DATABASE=opendatacube + - ODC_DEFAULT_DB_HOSTNAME=postgres + - ODC_DEFAULT_DB_USERNAME=opendatacube + - ODC_DEFAULT_DB_PASSWORD=opendatacubepassword + - ODC_DEFAULT_DB_DATABASE=opendatacube - AWS_NO_SIGN_REQUEST=true - STAC_API_URL=https://earth-search.aws.element84.com/v0/ - GDAL_HTTP_MAX_RETRY=5 From 332146b10646f9c8edcec18025f0ae2113959ef2 Mon Sep 17 00:00:00 2001 From: Emma Ai Date: Mon, 16 Jun 2025 04:53:32 +0000 Subject: [PATCH 26/37] remove pin on sqlalchemy --- docker/env.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/env.yaml b/docker/env.yaml index f1d7d567..f6df1057 100644 --- a/docker/env.yaml +++ b/docker/env.yaml @@ -120,7 +120,7 @@ dependencies: - slicerator - snuggs - sortedcontainers - - SQLAlchemy<2.0 + - SQLAlchemy - structlog - tblib - text-unidecode From 81941684faf05bcceb22cae615ffccd9975683c3 Mon Sep 17 00:00:00 2001 From: Emma Ai Date: Mon, 16 Jun 2025 05:00:42 +0000 Subject: [PATCH 27/37] lower pin numexpr --- docker/env.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/env.yaml b/docker/env.yaml index f6df1057..72ac025b 100644 --- a/docker/env.yaml +++ b/docker/env.yaml @@ -81,6 +81,7 @@ dependencies: - networkx # for the sake of geomedian address this later - numpy<2.0 + - numexpr>=2.11 - ordered-set - packaging - pandas From 92c45e63ecb6fb11353a38ff1b9057024a8d3067 Mon Sep 17 00:00:00 2001 From: Emma Ai Date: Tue, 17 Jun 2025 04:56:34 +0000 Subject: [PATCH 28/37] update dependencies version --- docker/env.yaml | 2 +- docker/requirements.txt | 4 +- setup.cfg | 14 +++---- tests/requirements.txt | 4 +- tests/test-env-py310.yml | 88 ---------------------------------------- 5 files changed, 12 insertions(+), 100 deletions(-) delete mode 100644 tests/test-env-py310.yml 
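
A quick aside on the renamed connection settings: the ODC_DEFAULT_DB_* variables above follow datacube 1.9's new configuration convention. A minimal sketch of how they might be exported before opening a connection (placeholder values mirroring docker-compose.yml; not part of this patch, and assuming the 1.9 config layer picks up its default environment from these variables):

import os
import datacube  # assumes datacube>=1.9 is installed and a database is reachable

# Placeholder credentials taken from docker-compose.yml; adjust for a real deployment.
os.environ.setdefault("ODC_DEFAULT_DB_HOSTNAME", "postgres")
os.environ.setdefault("ODC_DEFAULT_DB_USERNAME", "opendatacube")
os.environ.setdefault("ODC_DEFAULT_DB_PASSWORD", "opendatacubepassword")
os.environ.setdefault("ODC_DEFAULT_DB_DATABASE", "opendatacube")

dc = datacube.Datacube(app="odc-stats-integration-test")
print(dc.list_products())  # empty on a freshly initialised database
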
diff --git a/docker/env.yaml b/docker/env.yaml index 72ac025b..5c88b2a3 100644 --- a/docker/env.yaml +++ b/docker/env.yaml @@ -48,7 +48,7 @@ dependencies: - dask-image - deepdiff - defusedxml - - distributed + - distributed>=2025.4 - docutils - fiona - Flask diff --git a/docker/requirements.txt b/docker/requirements.txt index 496db443..01347fd9 100644 --- a/docker/requirements.txt +++ b/docker/requirements.txt @@ -1,10 +1,10 @@ --extra-index-url https://packages.dea.ga.gov.au/ datacube-ows>=1.9 -datacube[performance,s3] @ git+https://github.com/opendatacube/datacube-core@565de91 +datacube[performance,s3]>=1.9.5 eodatasets3>1.9 hdstats==0.1.8.post1 -odc-algo @ git+https://github.com/opendatacube/odc-algo@8cfaf89 +odc-algo>=1.0.1 odc-apps-cloud>=0.2.2 # For testing odc-apps-dc-tools>=0.2.12 diff --git a/setup.cfg b/setup.cfg index 5274d286..b07247bc 100644 --- a/setup.cfg +++ b/setup.cfg @@ -22,18 +22,18 @@ install_requires = botocore click>=8.0.0 dask - datacube>=1.9 - distributed + datacube>=1.9.5 + distributed>=2025.4 numpy odc-cloud[ASYNC]>=0.2.5 - odc_algo + odc_algo>=1.0.1 odc_dscache>=1.9 odc_io - odc_stac - odc-geo + odc_stac>=0.4.0 + odc-geo>=0.5.0rc1 pandas pystac>=1.1.0 - eodatasets3>=0.22.0 + eodatasets3>=1.9 toolz tqdm xarray>=2023.1.0 @@ -57,7 +57,7 @@ include = # datacube_ows<1.8.21 has issue on function config internal over-writing [options.extras_require] ows = - datacube_ows>=1.8.21 + datacube_ows>=1.9 sentry-sdk blinker diff --git a/tests/requirements.txt b/tests/requirements.txt index 6c46d271..2b7abd4d 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://packages.dea.ga.gov.au/ -datacube @ git+https://github.com/opendatacube/datacube-core@565de91 +datacube>=1.9.5 datacube-ows>=1.9 # for pytest-depends deepdiff @@ -9,7 +9,7 @@ mock moto networkx numpy<2.0 -odc-algo @ git+https://github.com/opendatacube/odc-algo@8cfaf89 +odc-algo>=1.0.1 odc-cloud>=0.2.5 odc-dscache>=1.9 odc-geo>=0.5.0rc1 diff --git a/tests/test-env-py310.yml b/tests/test-env-py310.yml deleted file mode 100644 index b3906d05..00000000 --- a/tests/test-env-py310.yml +++ /dev/null @@ -1,88 +0,0 @@ -# Conda environment for running tests in odc-tools -# conda env create -f test-env-py310.yml -# conda activate odc-tests-py310 - -name: odc-tests-py310 -channels: - - conda-forge -dependencies: - - python=3.10 - - # Datacube - - datacube==1.9.0 - - # odc.{aws,aio}: aiobotocore/boto3 - # pin aiobotocore for easier resolution of dependencies - - aiobotocore #==1.3.3 - - boto3 - - # eodatasets3 (for odc-stats) - - boltons - - ciso8601 - - python-rapidjson - - requests-cache # 0.8.0 broke eodatasets3 - - ruamel.yaml - - structlog - - url-normalize - - # odc-stats - - pandas - - pystac>=1.1.0 - - toolz - - tqdm - # pin xarray version since groupby bug on multi-indexed dataset in 2022.6.0 - - xarray #==2023.12.0 - - gdal - - fsspec>=2022.1.0 - - fiona - - # For tests - - pytest - - pytest-httpserver - - pytest-cov - - pytest-timeout - - moto - - mock - - deepdiff - # these may need to be in pip - - s3fs - # - tflite-runtime - - tl2cgen - - # for pytest-depends - - future_fstrings - - networkx - - colorama - - # for docs - - sphinx - - sphinx_rtd_theme - - sphinx-autodoc-typehints - - nbsphinx - - - pip=24 - - pip: - # - odc-algo - # - odc-dscache>=0.2.2 - - odc-cloud[ASYNC] - - odc-geo>=0.4.10 - - odc-stac - - odc-loader>=0.5.1 - - thredds-crawler - - # odc.stats - - eodatasets3>=1.9 - - # odc.algo optional dependency - - hdstats - - scipy<1.15.0 # cwt import removed 
in 1.15 - - # tests - - pytest-depends - - tflite-runtime - - # init db - - odc-apps-dc-tools>=1.9 - - # for odc-stats - - datacube_ows>=1.9 From c3ba4109ec1d396dcbe1041a81eb24e1e84186f7 Mon Sep 17 00:00:00 2001 From: Emma Ai Date: Tue, 17 Jun 2025 05:20:14 +0000 Subject: [PATCH 29/37] remove expr_eval --- odc/stats/_algebra.py | 44 --------------------- odc/stats/plugins/lc_fc_wo_a0.py | 3 +- odc/stats/plugins/lc_level34.py | 3 +- odc/stats/plugins/lc_ml_treelite.py | 2 +- odc/stats/plugins/lc_tf_urban.py | 2 +- odc/stats/plugins/lc_treelite_cultivated.py | 3 +- odc/stats/plugins/lc_treelite_woody.py | 2 +- odc/stats/plugins/lc_veg_class_a1.py | 3 +- 8 files changed, 11 insertions(+), 51 deletions(-) diff --git a/odc/stats/_algebra.py b/odc/stats/_algebra.py index ef478cc1..f45189f8 100644 --- a/odc/stats/_algebra.py +++ b/odc/stats/_algebra.py @@ -5,51 +5,7 @@ import dask.array as da import xarray as xr import numpy as np -import numexpr as ne -import functools from dask.base import tokenize -from typing import Any, Dict, Optional -from odc.algo._dask import flatten_kv, unflatten_kv - - -def apply_numexpr_np( - expr: str, - data: Optional[Dict[str, Any]] = None, - dtype=None, - casting="safe", - order="K", - **params, -) -> np.ndarray: - """ - Apply numexpr to numpy arrays - """ - - if data is None: - data = params - else: - data.update(params) - - out = ne.evaluate(expr, local_dict=data, casting=casting, order=order) - if dtype is None: - return out - else: - return out.astype(dtype) - - -def expr_eval(expr, data, dtype="float32", name="expr_eval", **kwargs): - tk = tokenize(apply_numexpr_np, *flatten_kv(data)) - op = functools.partial( - apply_numexpr_np, expr, dtype=dtype, casting="unsafe", order="K", **kwargs - ) - - return da.map_blocks( - lambda op, *data: op(unflatten_kv(data)), - op, - *flatten_kv(data), - name=f"{name}_{tk}", - dtype=dtype, - meta=np.array((), dtype=dtype), - ) def _median_by_ind(a): diff --git a/odc/stats/plugins/lc_fc_wo_a0.py b/odc/stats/plugins/lc_fc_wo_a0.py index 230d064d..81e9bb30 100644 --- a/odc/stats/plugins/lc_fc_wo_a0.py +++ b/odc/stats/plugins/lc_fc_wo_a0.py @@ -16,7 +16,8 @@ to_float, _nodata_fuser, ) -from odc.stats._algebra import expr_eval, median_ds +from odc.stats._algebra import median_ds +from odc.algo import expr_eval from ._registry import StatsPluginInterface, register diff --git a/odc/stats/plugins/lc_level34.py b/odc/stats/plugins/lc_level34.py index 1ba4c9ce..6eb47745 100644 --- a/odc/stats/plugins/lc_level34.py +++ b/odc/stats/plugins/lc_level34.py @@ -13,7 +13,7 @@ from ._registry import StatsPluginInterface, register from ._utils import rasterize_vector_mask, generate_numexpr_expressions -from odc.stats._algebra import expr_eval +from odc.algo import expr_eval from osgeo import gdal NODATA = 255 @@ -28,6 +28,7 @@ class StatsLccsLevel4(StatsPluginInterface): def __init__( self, + *, class_def_path: str = None, class_condition: Dict[str, List] = None, urban_mask: str = None, diff --git a/odc/stats/plugins/lc_ml_treelite.py b/odc/stats/plugins/lc_ml_treelite.py index f102dc7f..b17746fd 100644 --- a/odc/stats/plugins/lc_ml_treelite.py +++ b/odc/stats/plugins/lc_ml_treelite.py @@ -18,7 +18,7 @@ from odc.algo._memsink import yxbt_sink, yxt_sink from odc.stats.io import load_with_native_transform -from odc.stats._algebra import expr_eval +from odc.algo import expr_eval from odc.stats.model import DateTimeRange from ._registry import StatsPluginInterface from ._worker import TreeliteModelPlugin diff --git a/odc/stats/plugins/lc_tf_urban.py 
b/odc/stats/plugins/lc_tf_urban.py index dc195786..50a7c0f9 100644 --- a/odc/stats/plugins/lc_tf_urban.py +++ b/odc/stats/plugins/lc_tf_urban.py @@ -16,7 +16,7 @@ from odc.algo._memsink import yxbt_sink from odc.stats.io import load_with_native_transform -from odc.stats._algebra import expr_eval +from odc.algo import expr_eval from ._registry import StatsPluginInterface, register from ._worker import TensorFlowLiteModelPlugin diff --git a/odc/stats/plugins/lc_treelite_cultivated.py b/odc/stats/plugins/lc_treelite_cultivated.py index 096b9e24..909a3af8 100644 --- a/odc/stats/plugins/lc_treelite_cultivated.py +++ b/odc/stats/plugins/lc_treelite_cultivated.py @@ -7,7 +7,7 @@ import dask.array as da import numexpr as ne -from odc.stats._algebra import expr_eval +from odc.io import expr_eval from ._registry import register from .lc_ml_treelite import StatsMLTree, mask_and_predict @@ -53,6 +53,7 @@ def feature_BSI(input_block, nbart_swir_1, nbart_red, nbart_nir, nbart_blue): ).astype("float32") +# pylint: disable=too-many-positional-arguments def feature_TCW( input_block, nbart_blue, diff --git a/odc/stats/plugins/lc_treelite_woody.py b/odc/stats/plugins/lc_treelite_woody.py index 949e7743..f6f0964e 100644 --- a/odc/stats/plugins/lc_treelite_woody.py +++ b/odc/stats/plugins/lc_treelite_woody.py @@ -5,7 +5,7 @@ import xarray as xr import dask.array as da -from odc.stats._algebra import expr_eval +from odc.io import expr_eval from ._registry import register from .lc_ml_treelite import StatsMLTree, mask_and_predict diff --git a/odc/stats/plugins/lc_veg_class_a1.py b/odc/stats/plugins/lc_veg_class_a1.py index a0928b43..697756e1 100644 --- a/odc/stats/plugins/lc_veg_class_a1.py +++ b/odc/stats/plugins/lc_veg_class_a1.py @@ -6,7 +6,7 @@ import numpy as np import xarray as xr -from odc.stats._algebra import expr_eval +from odc.algo import expr_eval from ._registry import StatsPluginInterface, register from ._utils import replace_nodata_with_mode @@ -52,6 +52,7 @@ class StatsVegClassL1(StatsPluginInterface): def __init__( self, + *, output_classes: Dict, dem_threshold: Optional[int] = None, mudflat_threshold: Optional[int] = None, From 35ff5958a32b1bfa59ef820a3ee8727f3708ccc1 Mon Sep 17 00:00:00 2001 From: Emma Ai Date: Tue, 17 Jun 2025 05:24:35 +0000 Subject: [PATCH 30/37] install numexpr from pypi --- docker/env.yaml | 1 - docker/requirements.txt | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/env.yaml b/docker/env.yaml index 5c88b2a3..6d6476a1 100644 --- a/docker/env.yaml +++ b/docker/env.yaml @@ -81,7 +81,6 @@ dependencies: - networkx # for the sake of geomedian address this later - numpy<2.0 - - numexpr>=2.11 - ordered-set - packaging - pandas diff --git a/docker/requirements.txt b/docker/requirements.txt index 01347fd9..867bd4ab 100644 --- a/docker/requirements.txt +++ b/docker/requirements.txt @@ -4,6 +4,7 @@ datacube-ows>=1.9 datacube[performance,s3]>=1.9.5 eodatasets3>1.9 hdstats==0.1.8.post1 +numexpr>=2.11 odc-algo>=1.0.1 odc-apps-cloud>=0.2.2 # For testing From 385153e15a45599bdebfc6d86767c562a6671994 Mon Sep 17 00:00:00 2001 From: Emma Ai Date: Tue, 17 Jun 2025 05:26:17 +0000 Subject: [PATCH 31/37] fix typos --- odc/stats/plugins/lc_treelite_cultivated.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/odc/stats/plugins/lc_treelite_cultivated.py b/odc/stats/plugins/lc_treelite_cultivated.py index 909a3af8..ff91b23d 100644 --- a/odc/stats/plugins/lc_treelite_cultivated.py +++ b/odc/stats/plugins/lc_treelite_cultivated.py @@ -7,7 +7,7 @@ import 
dask.array as da import numexpr as ne -from odc.io import expr_eval +from odc.algo import expr_eval from ._registry import register from .lc_ml_treelite import StatsMLTree, mask_and_predict From 9bd0f4882e65ae249c907c693be795b089c424de Mon Sep 17 00:00:00 2001 From: Emma Ai Date: Tue, 17 Jun 2025 05:30:47 +0000 Subject: [PATCH 32/37] update dependencies in test --- tests/requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/requirements.txt b/tests/requirements.txt index 2b7abd4d..5e64e91e 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -3,6 +3,7 @@ datacube>=1.9.5 datacube-ows>=1.9 # for pytest-depends deepdiff +distributed>=2025.4 eodatasets3>=1.9 future_fstrings mock @@ -13,6 +14,7 @@ odc-algo>=1.0.1 odc-cloud>=0.2.5 odc-dscache>=1.9 odc-geo>=0.5.0rc1 +odc-stac>=0.4.0 # For tests pytest From e8d2f5242c7836b2c5218d29e15b322461c53282 Mon Sep 17 00:00:00 2001 From: Emma Ai Date: Tue, 17 Jun 2025 05:32:04 +0000 Subject: [PATCH 33/37] more typos --- odc/stats/plugins/lc_treelite_woody.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/odc/stats/plugins/lc_treelite_woody.py b/odc/stats/plugins/lc_treelite_woody.py index f6f0964e..78d5e3e0 100644 --- a/odc/stats/plugins/lc_treelite_woody.py +++ b/odc/stats/plugins/lc_treelite_woody.py @@ -5,7 +5,7 @@ import xarray as xr import dask.array as da -from odc.io import expr_eval +from odc.algo import expr_eval from ._registry import register from .lc_ml_treelite import StatsMLTree, mask_and_predict From ae83fffb2b65e5836265186ba88c45199805189a Mon Sep 17 00:00:00 2001 From: Emma Ai Date: Wed, 18 Jun 2025 04:21:38 +0000 Subject: [PATCH 34/37] save and retrieve info on fused product from properties explicitly --- odc/stats/model.py | 7 +++---- odc/stats/utils.py | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/odc/stats/model.py b/odc/stats/model.py index 785009c5..441dc311 100644 --- a/odc/stats/model.py +++ b/odc/stats/model.py @@ -316,9 +316,7 @@ def location(self) -> str: def _lineage(self) -> Tuple[UUID, ...]: ds, *_ = self.datasets - # TODO: replace this and test - # if 'fused' in ds.metadata._doc['properties'].keys(): - if "fused" in ds.product.name: + if ds.metadata_doc["properties"].get("fused", False): lineage = tuple(set(x for ds in self.datasets for x in ds.metadata.sources)) else: lineage = tuple(ds.id for ds in self.datasets) @@ -402,7 +400,7 @@ def render_assembler_metadata( platforms, instruments = ([], []) for dataset in self.datasets: - if "fused" in dataset.product.name: + if dataset.metadata_doc["properties"].get("fused", False): if dataset.metadata_doc["properties"].get("eo:platform") is not None: platforms.append(dataset.metadata_doc["properties"]["eo:platform"]) if dataset.metadata_doc["properties"].get("eo:instrument") is not None: @@ -574,6 +572,7 @@ def render_metadata( def product_for_plugin( # pylint:disable=too-many-arguments,too-many-locals plugin: StatsPluginInterface, + *, location: str, name: Optional[str] = None, short_name: Optional[str] = None, diff --git a/odc/stats/utils.py b/odc/stats/utils.py index 0ec31264..c3f19858 100644 --- a/odc/stats/utils.py +++ b/odc/stats/utils.py @@ -454,6 +454,6 @@ def fuse_ds( for key, path in {**measurement_paths(ds)}.items(): fused_doc["measurements"][key]["path"] = path - fused_ds = Dataset(product, prep_eo3(fused_doc), uri="fake") fused_doc["properties"]["fused"] = "True" + fused_ds = Dataset(product, prep_eo3(fused_doc), uri="fake") return fused_ds From b51a8a36b24c46ed03cb6fa43b3f8531051a75e2 Mon Sep 
17 00:00:00 2001 From: Emma Ai Date: Wed, 18 Jun 2025 04:29:55 +0000 Subject: [PATCH 35/37] always some typos --- odc/stats/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/odc/stats/model.py b/odc/stats/model.py index 441dc311..dcea8456 100644 --- a/odc/stats/model.py +++ b/odc/stats/model.py @@ -572,8 +572,8 @@ def render_metadata( def product_for_plugin( # pylint:disable=too-many-arguments,too-many-locals plugin: StatsPluginInterface, - *, location: str, + *, name: Optional[str] = None, short_name: Optional[str] = None, version: Optional[str] = None, From 3c91b5b4c94db3360c5f0fbdb5c452b492182f8d Mon Sep 17 00:00:00 2001 From: Emma Ai Date: Wed, 18 Jun 2025 06:38:12 +0000 Subject: [PATCH 36/37] update typing to 3.10 --- odc/stats/_algebra.py | 4 +- odc/stats/_cli_common.py | 11 ++- odc/stats/_cli_publish_tasks.py | 7 +- odc/stats/_gjson.py | 16 ++-- odc/stats/_grouper.py | 2 +- odc/stats/_sqs.py | 3 +- odc/stats/_text.py | 28 +++---- odc/stats/io.py | 32 ++++---- odc/stats/model.py | 75 +++++++++--------- odc/stats/plugins/_base.py | 24 +++--- odc/stats/plugins/_registry.py | 6 +- odc/stats/plugins/fc_percentiles.py | 17 ++-- odc/stats/plugins/gm.py | 43 +++++------ odc/stats/plugins/gm_ls_bitmask.py | 56 +++++++------- odc/stats/plugins/lc_fc_wo_a0.py | 12 ++- odc/stats/plugins/lc_level34.py | 10 +-- odc/stats/plugins/lc_ml_treelite.py | 10 +-- odc/stats/plugins/lc_tf_urban.py | 6 +- odc/stats/plugins/lc_veg_class_a1.py | 16 ++-- odc/stats/plugins/mangroves.py | 4 +- odc/stats/plugins/pq.py | 13 ++-- odc/stats/plugins/pq_bitmask.py | 18 ++--- odc/stats/plugins/tcw_percentiles.py | 8 +- odc/stats/plugins/wofs.py | 8 +- odc/stats/proc.py | 41 ++++------ odc/stats/tasks.py | 111 +++++++++++++++------------ odc/stats/utils.py | 53 ++++++------- scripts/patch_version.py | 4 +- tests/conftest.py | 2 +- tests/test_fc_percentiles.py | 10 ++- tests/test_gm_ls.py | 52 ++++++------- tests/test_gm_ls_bitmask.py | 12 +-- tests/test_pq_bitmask.py | 29 +++---- tests/test_save_tasks.py | 2 +- tests/test_utils.py | 4 +- 35 files changed, 369 insertions(+), 380 deletions(-) diff --git a/odc/stats/_algebra.py b/odc/stats/_algebra.py index f45189f8..f5345261 100644 --- a/odc/stats/_algebra.py +++ b/odc/stats/_algebra.py @@ -32,7 +32,7 @@ def median_by_ind(xr_da, dim, dtype="float32", name="median_by_ind"): meta=np.array((), dtype=dtype), drop_axis=0, ) - coords = dict((dim, xr_da.coords[dim]) for dim in xr_da.dims[1:]) + coords = {dim: xr_da.coords[dim] for dim in xr_da.dims[1:]} return xr.DataArray( res, dims=xr_da.dims[1:], coords=coords, attrs=xr_da.attrs.copy() @@ -44,5 +44,5 @@ def median_ds(xr_ds, dim, dtype="float32", name="median_ds"): for var, data in xr_ds.data_vars.items(): res[var] = median_by_ind(data, dim, dtype, name) # pylint: disable=undefined-loop-variable - coords = dict((dim, xr_ds.coords[dim]) for dim in data.dims[1:]) + coords = {dim: xr_ds.coords[dim] for dim in data.dims[1:]} return xr.Dataset(res, coords=coords, attrs=xr_ds.attrs.copy()) diff --git a/odc/stats/_cli_common.py b/odc/stats/_cli_common.py index 80fac714..14bb39da 100644 --- a/odc/stats/_cli_common.py +++ b/odc/stats/_cli_common.py @@ -2,7 +2,6 @@ import logging import sys -from typing import List, Tuple import click @@ -12,7 +11,7 @@ from urllib.parse import urlparse -TileIdx_txy = Tuple[str, int, int] # pylint: disable=invalid-name +TileIdx_txy = tuple[str, int, int] # pylint: disable=invalid-name def parse_task(s: str) -> TileIdx_txy: @@ -27,8 +26,8 @@ def parse_task(s: str) -> TileIdx_txy: 
def parse_all_tasks( - inputs: List[str], all_possible_tasks: List[TileIdx_txy] -) -> List[TileIdx_txy]: + inputs: list[str], all_possible_tasks: list[TileIdx_txy] +) -> list[TileIdx_txy]: """ Select a subset of all possible tasks given user input on cli. @@ -43,7 +42,7 @@ def parse_all_tasks( x+10/y-3/2019--P1Y """ - out: List[TileIdx_txy] = [] + out: list[TileIdx_txy] = [] full_set = set(all_possible_tasks) for s in inputs: @@ -68,7 +67,7 @@ def parse_all_tasks( return out -def parse_resolution(s: str, separator: str = ",") -> Tuple[float, float]: +def parse_resolution(s: str, separator: str = ",") -> tuple[float, float]: parts = [float(v) for v in split_and_check(s, separator, (1, 2))] if len(parts) == 1: diff --git a/odc/stats/_cli_publish_tasks.py b/odc/stats/_cli_publish_tasks.py index aed2ced7..794b7e24 100644 --- a/odc/stats/_cli_publish_tasks.py +++ b/odc/stats/_cli_publish_tasks.py @@ -1,6 +1,5 @@ import json import sys -from typing import List, Optional import click import fsspec @@ -28,7 +27,7 @@ def get_geometry(geojson_file: str) -> Geometry: ) -def filter_tasks(tasks: List[TileIdx_txy], geometry: Geometry, grid_name: str): +def filter_tasks(tasks: list[TileIdx_txy], geometry: Geometry, grid_name: str): for task in tasks: task_geometry = GRIDS[grid_name].tile_geobox((task[1], task[2])).extent if task_geometry.intersects(geometry): @@ -36,7 +35,7 @@ def filter_tasks(tasks: List[TileIdx_txy], geometry: Geometry, grid_name: str): def publish_tasks( - db: str, task_filter: str, geojson_filter: Optional[str], dryrun: bool, queue: str + db: str, task_filter: str, geojson_filter: str | None, dryrun: bool, queue: str ): reader = TaskReader(db) if len(task_filter) == 0: @@ -67,7 +66,7 @@ def publish_tasks( # We assume the db files are always be the S3 uri. If they are not, there is no need to use SQS queue to process. messages = ( - dict(Id=str(idx), MessageBody=json.dumps(render_sqs(tidx, db))) + {"Id": str(idx), "MessageBody": json.dumps(render_sqs(tidx, db))} for idx, tidx in enumerate(tasks) ) diff --git a/odc/stats/_gjson.py b/odc/stats/_gjson.py index 15db85f5..9824d00b 100644 --- a/odc/stats/_gjson.py +++ b/odc/stats/_gjson.py @@ -1,7 +1,7 @@ import math from copy import deepcopy import toolz -from typing import Tuple, Dict, Any +from typing import Any from datetime import timedelta from odc.geo.gridspec import GridSpec @@ -11,7 +11,7 @@ from .model import TileIdx_xy, TileIdx_txy -def gs_bounds(gs: GridSpec, tiles: Tuple[Tuple[int, int], Tuple[int, int]]) -> Geometry: +def gs_bounds(gs: GridSpec, tiles: tuple[tuple[int, int], tuple[int, int]]) -> Geometry: """ Compute Polygon for a selection of tiles. @@ -36,8 +36,8 @@ def timedelta_to_hours(td: timedelta) -> float: def compute_grid_info( - cells: Dict[TileIdx_xy, Any], resolution: float = math.inf, title_width: int = 0 -) -> Dict[TileIdx_xy, Any]: + cells: dict[TileIdx_xy, Any], resolution: float = math.inf, title_width: int = 0 +) -> dict[TileIdx_xy, Any]: """ Compute geojson feature for every cell in ``cells``. Where ``cells`` is produced by ``bin_dataset_stream`` @@ -75,8 +75,8 @@ def compute_grid_info( def gjson_from_tasks( - tasks: Dict[TileIdx_txy, Any], grid_info: Dict[TileIdx_xy, Any] -) -> Dict[str, Dict[str, Any]]: + tasks: dict[TileIdx_txy, Any], grid_info: dict[TileIdx_xy, Any] +) -> dict[str, dict[str, Any]]: """ Group tasks by time period and compute geosjon describing every tile covered by each time period. 
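
For reference, the typing changes in this patch swap typing.Optional/Tuple/Dict/List for builtin generics and PEP 604 unions, which require Python >= 3.10. An illustrative (non-project) pair of functions showing the old and new spellings side by side:

from typing import Optional, Tuple  # pre-3.10 spelling being removed


def bounds_old(tiles: Optional[Tuple[int, int]] = None) -> Tuple[str, ...]:
    return tuple(str(t) for t in (tiles or ()))


def bounds_new(tiles: tuple[int, int] | None = None) -> tuple[str, ...]:
    # identical behaviour, written with builtin generics and X | None
    return tuple(str(t) for t in (tiles or ()))


assert bounds_old((3, 4)) == bounds_new((3, 4)) == ("3", "4")

The runtime behaviour is unchanged; only the annotations are rewritten.
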
@@ -96,14 +96,14 @@ def _get(idx): dss = tasks[idx] utc_offset = timedelta(hours=geo["properties"]["utc_offset"]) - ndays = len(set((ds.time + utc_offset).date() for ds in dss)) + ndays = len({(ds.time + utc_offset).date() for ds in dss}) geo["properties"]["total"] = len(dss) geo["properties"]["days"] = ndays return geo def process(idxs): - return dict(type="FeatureCollection", features=[_get(idx) for idx in idxs]) + return {"type": "FeatureCollection", "features": [_get(idx) for idx in idxs]} return { t: process(idxs) diff --git a/odc/stats/_grouper.py b/odc/stats/_grouper.py index 958b9c82..a99dcffd 100644 --- a/odc/stats/_grouper.py +++ b/odc/stats/_grouper.py @@ -7,7 +7,7 @@ from __future__ import annotations from datetime import timedelta -from typing import TYPE_CHECKING, Any +from typing import Any, TYPE_CHECKING import numpy as np import pandas as pd diff --git a/odc/stats/_sqs.py b/odc/stats/_sqs.py index cddd9f5d..b7f04825 100644 --- a/odc/stats/_sqs.py +++ b/odc/stats/_sqs.py @@ -2,14 +2,13 @@ Work token for SQS based job control """ -from typing import Optional from datetime import timedelta, datetime import toolz from .model import WorkTokenInterface class SQSWorkToken(WorkTokenInterface): - def __init__(self, msg, timeout: int, t0: Optional[datetime] = None): + def __init__(self, msg, timeout: int, t0: datetime | None = None): super().__init__() if t0 is None: t0 = self.now() diff --git a/odc/stats/_text.py b/odc/stats/_text.py index c5c8191a..be3ece4b 100644 --- a/odc/stats/_text.py +++ b/odc/stats/_text.py @@ -1,28 +1,28 @@ from pathlib import Path -from typing import Union, Optional, Tuple, Dict, Any +from typing import Any -PathLike = Union[str, Path] +PathLike = str | Path # Copied from odc.io.text -def read_int(path: PathLike, default=None, base=10) -> Optional[int]: +def read_int(path: PathLike, default=None, base=10) -> int | None: """ Read single integer from a text file. Useful for things like parsing content of /sys/ or /proc. """ try: - with open(path, "rt", encoding="utf8") as f: + with open(path, encoding="utf8") as f: return int(f.read(), base) except (FileNotFoundError, ValueError): return default def split_and_check( - s: str, separator: str, n: Union[int, Tuple[int, ...]] -) -> Tuple[str, ...]: + s: str, separator: str, n: int | tuple[int, ...] +) -> tuple[str, ...]: """Turn string into tuple, checking that there are exactly as many parts as expected. :param s: String to parse :param separator: Separator character @@ -44,7 +44,7 @@ def parse_slice(s: str) -> slice: Examples "::4", "2:5", "2::10", "3:100:5" """ - def parse(part: str) -> Optional[int]: + def parse(part: str) -> int | None: if part == "": return None return int(part) @@ -57,32 +57,32 @@ def parse(part: str) -> Optional[int]: return slice(*parts) -def parse_yaml(s: str) -> Dict[str, Any]: +def parse_yaml(s: str) -> dict[str, Any]: # pylint: disable=import-outside-toplevel import yaml return yaml.load(s, Loader=getattr(yaml, "CSafeLoader", yaml.SafeLoader)) -def parse_yaml_file_or_inline(s: str) -> Dict[str, Any]: +def parse_yaml_file_or_inline(s: str) -> dict[str, Any]: """ Accept on input either a path to yaml file or yaml text, return parsed yaml document. 
""" try: # if file path = Path(s) - with open(path, "rt", encoding="utf8") as f: + with open(path, encoding="utf8") as f: txt = f.read() assert isinstance(txt, str) - except (FileNotFoundError, IOError, ValueError): + except (FileNotFoundError, OSError, ValueError): txt = s result = parse_yaml(txt) if isinstance(result, str): - raise IOError(f"No such file: {s}") + raise OSError(f"No such file: {s}") return result -def load_yaml_remote(yaml_url: str) -> Dict[str, Any]: +def load_yaml_remote(yaml_url: str) -> dict[str, Any]: """ Open a yaml file remotely and return the parsed yaml document """ @@ -97,7 +97,7 @@ def load_yaml_remote(yaml_url: str) -> Dict[str, Any]: raise -def parse_range2d_int(s: str) -> Tuple[Tuple[int, int], Tuple[int, int]]: +def parse_range2d_int(s: str) -> tuple[tuple[int, int], tuple[int, int]]: """Parse string like "0:3,4:5" -> ((0,3), (4,5))""" try: return tuple( diff --git a/odc/stats/io.py b/odc/stats/io.py index 7e6aafa5..ddc13d6a 100644 --- a/odc/stats/io.py +++ b/odc/stats/io.py @@ -4,7 +4,7 @@ from __future__ import annotations -from typing import Any, Dict, List, Optional, Union, cast, TYPE_CHECKING +from typing import Any, TYPE_CHECKING, cast from hashlib import sha1 from collections import namedtuple from collections.abc import Callable, Iterable, Sequence @@ -64,7 +64,7 @@ } -def dump_json(meta: Dict[str, Any]) -> str: +def dump_json(meta: dict[str, Any]) -> str: return json.dumps(meta, separators=(",", ":")) @@ -91,7 +91,7 @@ def _sha1_digest(*write_results): lines = [] for wr in write_results: if wr.error is not None: - raise IOError(f"Failed to write for: {wr.path}") + raise OSError(f"Failed to write for: {wr.path}") file = wr.path.split("/")[-1] lines.append(f"{wr.sha1} {file}\n") return "".join(lines) @@ -116,8 +116,8 @@ def save_with_s3_client(data, url, with_deps=None, **kw): class S3COGSink: def __init__( self, - cog_opts: Optional[Dict[str, Any]] = None, - acl: Optional[str] = None, + cog_opts: dict[str, Any] | None = None, + acl: str | None = None, public: bool = False, band_ext: str = "tif", ): @@ -153,7 +153,7 @@ def __init__( cog_opts = tmp cog_opts_per_band = cast( - Dict[str, Dict[str, Any]], cog_opts.pop("overrides", {}) + dict[str, dict[str, Any]], cog_opts.pop("overrides", {}) ) per_band_cfg = {k: v for k, v in cog_opts.items() if isinstance(v, dict)} if per_band_cfg: @@ -179,7 +179,7 @@ def __init__( def uri(self, task: Task) -> str: return task.metadata_path("absolute", ext=self._stac_meta_ext) - def verify_s3_credentials(self, test_uri: Optional[str] = None) -> bool: + def verify_s3_credentials(self, test_uri: str | None = None) -> bool: if test_uri is None: return True rr = self._write_blob(b"verifying S3 permissions", test_uri).compute() @@ -188,7 +188,7 @@ def verify_s3_credentials(self, test_uri: Optional[str] = None) -> bool: # pylint: disable=invalid-name def _write_blob( - self, data, url: str, ContentType: Optional[str] = None, with_deps=None + self, data, url: str, ContentType: str | None = None, with_deps=None ) -> Delayed: """ Returns Delayed WriteResult[path, sha1, error=None] @@ -217,7 +217,7 @@ def _write_blob( raise ValueError(f"Don't know how to save to '{url}'") # pylint: enable=invalid-name - def _ds_to_cog(self, ds: xr.Dataset, paths: Dict[str, str]) -> List[Delayed]: + def _ds_to_cog(self, ds: xr.Dataset, paths: dict[str, str]) -> list[Delayed]: out = [] for band, dv in ds.data_vars.items(): band = str(band) @@ -258,7 +258,7 @@ def _get_thumbnail( ContentType="image/jpeg", ) - def _ds_to_thumbnail_cog(self, ds: 
xr.Dataset, task: Task) -> List[Delayed]: + def _ds_to_thumbnail_cog(self, ds: xr.Dataset, task: Task) -> list[Delayed]: odc_file_path = task.metadata_path("absolute", ext=self._odc_meta_ext) thumbnail_cogs = [] @@ -296,7 +296,7 @@ def _ds_to_thumbnail_cog(self, ds: xr.Dataset, task: Task) -> List[Delayed]: return thumbnail_cogs - def cog_opts(self, band_name: str = "") -> Dict[str, Any]: + def cog_opts(self, band_name: str = "") -> dict[str, Any]: opts = dict(self._cog_opts) opts.update(self._cog_opts_per_band.get(band_name, {})) return opts @@ -305,7 +305,7 @@ def write_cog(self, da: xr.DataArray, url: str) -> Delayed: cog_bytes = to_cog(da, **self.cog_opts(str(da.name))) return self._write_blob(cog_bytes, url, ContentType="image/tiff") - def exists(self, task: Union[Task, str]) -> bool: + def exists(self, task: Task | str) -> bool: if isinstance(task, str): uri = task else: @@ -347,7 +347,7 @@ def get_eo3_stac_meta( ) # stac_meta is Python str, but content is 'Dict format' def dump_with_pystac( - self, task: Task, ds: Dataset, aux: Optional[Dataset] = None + self, task: Task, ds: Dataset, aux: Dataset | None = None ) -> Delayed: """ Dump files with STAC metadata file, which generated from PySTAC @@ -389,7 +389,7 @@ def dump_with_eodatasets3( self, task: Task, ds: Dataset, - aux: Optional[Dataset] = None, + aux: Dataset | None = None, proc: StatsPluginInterface = None, ) -> Delayed: """ @@ -532,9 +532,9 @@ def dump( self, task: Task, ds: Dataset, - aux: Optional[Dataset] = None, + aux: Dataset | None = None, proc: StatsPluginInterface = None, - apply_eodatasets3: Optional[bool] = False, + apply_eodatasets3: bool | None = False, ) -> Delayed: if apply_eodatasets3: return self.dump_with_eodatasets3(task, ds, aux, proc) diff --git a/odc/stats/model.py b/odc/stats/model.py index dcea8456..cd89424d 100644 --- a/odc/stats/model.py +++ b/odc/stats/model.py @@ -3,7 +3,8 @@ from copy import deepcopy from dataclasses import dataclass, field from datetime import datetime, timedelta, timezone -from typing import Any, Dict, List, Optional, Sequence, Tuple, Union +from typing import Any +from collections.abc import Sequence from uuid import UUID, uuid5 from pathlib import Path @@ -24,9 +25,9 @@ from .plugins import StatsPluginInterface -TileIdx_xy = Tuple[int, int] # pylint:disable=invalid-name -TileIdx_txy = Tuple[str, int, int] # pylint:disable=invalid-name -TileIdx = Union[TileIdx_txy, TileIdx_xy] +TileIdx_xy = tuple[int, int] # pylint:disable=invalid-name +TileIdx_txy = tuple[str, int, int] # pylint:disable=invalid-name +TileIdx = TileIdx_txy | TileIdx_xy DEFAULT_HREF_PREFIX = "https://collections.dea.ga.gov.au/product" EXT_TIFF = "tif" # because "consistency" @@ -76,7 +77,7 @@ def format_datetime(dt: datetime, with_tz=True, timespec="microseconds") -> str: class DateTimeRange: __slots__ = ("start", "end", "freq") - def __init__(self, start: Union[str, datetime], freq: Optional[str] = None): + def __init__(self, start: str | datetime, freq: str | None = None): """ DateTimeRange('2019-03--P3M') @@ -114,9 +115,7 @@ def __str__(self): def __repr__(self): return f"DateTimeRange({repr(self.start)}, {repr(self.freq)})" - def dc_query( - self, pad: Optional[Union[timedelta, float, int]] = None - ) -> Dict[str, Any]: + def dc_query(self, pad: timedelta | float | int | None = None) -> dict[str, Any]: """ Transform to form understood by datacube @@ -178,19 +177,19 @@ class OutputProduct: # pylint:disable=too-many-instance-attributes version: str short_name: str location: str - properties: Dict[str, str] - 
measurements: Tuple[str, ...] + properties: dict[str, str] + measurements: tuple[str, ...] href: str = "" region_code_format: str = "x{x:02d}y{y:02d}" cfg: Any = None naming_conventions_values: str = "dea_c3" explorer_path: str = "https://explorer.dea.ga.gov.au/" - inherit_skip_properties: Optional[List[str]] = None - preview_image_ows_style: Optional[Dict[str, Any]] = None + inherit_skip_properties: list[str] | None = None + preview_image_ows_style: dict[str, Any] | None = None classifier: str = "level3" maturity: str = "final" collection_number: int = 3 - nodata: Optional[Dict[str, int]] = None + nodata: dict[str, int] | None = None def __post_init__(self): if self.href == "": @@ -205,7 +204,7 @@ def region_code(self, tidx: TileIdx_xy) -> str: @staticmethod def dummy( - measurements: Tuple[str, ...] = ("red", "green", "blue") + measurements: tuple[str, ...] = ("red", "green", "blue"), ) -> "OutputProduct": version = "0.0.0" name = "dummy" @@ -286,10 +285,10 @@ class Task: tile_index: TileIdx_xy geobox: GeoBox time_range: DateTimeRange - datasets: Tuple[Dataset, ...] = field(repr=False) + datasets: tuple[Dataset, ...] = field(repr=False) uuid: UUID = UUID(int=0) short_time: str = field(init=False, repr=False) - source: Optional[WorkTokenInterface] = field(init=True, repr=False, default=None) + source: WorkTokenInterface | None = field(init=True, repr=False, default=None) def __post_init__(self): self.short_time = self.time_range.short @@ -313,11 +312,11 @@ def location(self) -> str: p1, p2 = rc[:mid], rc[mid:] return "/".join([p1, p2, self.short_time]) - def _lineage(self) -> Tuple[UUID, ...]: + def _lineage(self) -> tuple[UUID, ...]: ds, *_ = self.datasets if ds.metadata_doc["properties"].get("fused", False): - lineage = tuple(set(x for ds in self.datasets for x in ds.metadata.sources)) + lineage = tuple({x for ds in self.datasets for x in ds.metadata.sources}) else: lineage = tuple(ds.id for ds in self.datasets) @@ -349,7 +348,7 @@ def _prefix(self, relative_to: str = "dataset") -> str: def paths( self, relative_to: str = "dataset", ext: str = EXT_TIFF - ) -> Dict[str, str]: + ) -> dict[str, str]: """ Compute dictionary mapping band name to paths. @@ -381,7 +380,7 @@ def render_assembler_metadata( self, ext: str = EXT_TIFF, output_dataset: xr.Dataset = None, - processing_dt: Optional[datetime] = None, + processing_dt: datetime | None = None, ) -> DatasetAssembler: """ Put together metadata document for the output of this task. It needs the source_dataset to inherit @@ -496,8 +495,8 @@ def render_assembler_metadata( return dataset_assembler def render_metadata( - self, ext: str = EXT_TIFF, processing_dt: Optional[datetime] = None - ) -> Dict[str, Any]: + self, ext: str = EXT_TIFF, processing_dt: datetime | None = None + ) -> dict[str, Any]: """ Put together STAC metadata document for the output of this task. 
""" @@ -509,7 +508,7 @@ def render_metadata( region_code = product.region_code(self.tile_index) inputs = list(map(str, self._lineage())) - properties: Dict[str, Any] = deepcopy(product.properties) + properties: dict[str, Any] = deepcopy(product.properties) properties["dtr:start_datetime"] = format_datetime(self.time_range.start) properties["dtr:end_datetime"] = format_datetime(self.time_range.end) @@ -574,22 +573,22 @@ def product_for_plugin( # pylint:disable=too-many-arguments,too-many-locals plugin: StatsPluginInterface, location: str, *, - name: Optional[str] = None, - short_name: Optional[str] = None, - version: Optional[str] = None, - product_family: Optional[str] = None, + name: str | None = None, + short_name: str | None = None, + version: str | None = None, + product_family: str | None = None, collections_site: str = "collections.dea.ga.gov.au", producer: str = "ga.gov.au", - properties: Dict[str, Any] = None, + properties: dict[str, Any] = None, region_code_format: str = "x{x:02d}y{y:02d}", naming_conventions_values: str = "dea_c3", explorer_path: str = "https://explorer.dea.ga.gov.au", - inherit_skip_properties: Optional[List[str]] = None, - preview_image_ows_style: Optional[Dict[str, Any]] = None, + inherit_skip_properties: list[str] | None = None, + preview_image_ows_style: dict[str, Any] | None = None, classifier: str = "level3", - maturity: Optional[str] = None, + maturity: str | None = None, collection_number: int = 3, - nodata: Optional[Dict[str, int]] = None, + nodata: dict[str, int] | None = None, ) -> OutputProduct: """ :param plugin: An instance of a subclass of StatsPluginInterface, used for name defaults. @@ -667,7 +666,7 @@ class TaskResult: task: Task result_location: str = "" skipped: bool = False - error: Optional[str] = None + error: str | None = None meta: Any = field(init=True, repr=False, default=None) def __bool__(self): @@ -692,12 +691,12 @@ def default_cog_settings(): # Plugin plugin: str = "" - plugin_config: Dict[str, Any] = field(init=True, repr=True, default_factory=dict) + plugin_config: dict[str, Any] = field(init=True, repr=True, default_factory=dict) # Output Product # .{name| short_name| version| product_family| # collections_site| producer| properties: Dict[str, Any]} - product: Dict[str, Any] = field(init=True, repr=True, default_factory=dict) + product: dict[str, Any] = field(init=True, repr=True, default_factory=dict) # Dask config threads: int = -1 @@ -705,14 +704,14 @@ def default_cog_settings(): # S3/Output config output_location: str = "" - s3_acl: Optional[str] = None + s3_acl: str | None = None # s3_public is deprecated, use s3_acl="public-read" instead s3_public: bool = False - cog_opts: Dict[str, Any] = field(init=True, repr=True, default_factory=dict) + cog_opts: dict[str, Any] = field(init=True, repr=True, default_factory=dict) overwrite: bool = False # Heartbeat filepath - heartbeat_filepath: Optional[str] = None + heartbeat_filepath: str | None = None # Terminate task if running longer than this amount (seconds) max_processing_time: int = 0 diff --git a/odc/stats/plugins/_base.py b/odc/stats/plugins/_base.py index f48bb6c6..ebf534b5 100644 --- a/odc/stats/plugins/_base.py +++ b/odc/stats/plugins/_base.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Mapping, Optional, Sequence, Tuple +from collections.abc import Mapping, Sequence import xarray as xr import numpy as np @@ -20,16 +20,16 @@ class StatsPluginInterface(ABC): def __init__( self, resampling: str = "bilinear", - input_bands: Optional[Sequence[str]] 
= None, - optional_bands: Optional[Sequence[str]] = None, - chunks: Optional[Mapping[str, int]] = None, - basis: Optional[str] = None, + input_bands: Sequence[str] | None = None, + optional_bands: Sequence[str] | None = None, + chunks: Mapping[str, int] | None = None, + basis: str | None = None, group_by: str = "solar_day", - rgb_bands: Optional[Sequence[str]] = None, - rgb_clamp: Tuple[float, float] = (1.0, 3_000.0), - transform_code: Optional[str] = None, - area_of_interest: Optional[Sequence[float]] = None, - measurements: Optional[Sequence[str]] = None, + rgb_bands: Sequence[str] | None = None, + rgb_clamp: tuple[float, float] = (1.0, 3_000.0), + transform_code: str | None = None, + area_of_interest: Sequence[float] | None = None, + measurements: Sequence[str] | None = None, ): self.resampling = resampling self.input_bands = input_bands if input_bands is not None else [] @@ -45,7 +45,7 @@ def __init__( self.dask_worker_plugin = None @property - def measurements(self) -> Tuple[str, ...]: + def measurements(self) -> tuple[str, ...]: if self._measurements is None: raise NotImplementedError("Plugins must provide 'measurements'") return self._measurements @@ -84,7 +84,7 @@ def input_data( def reduce(self, xx: xr.Dataset) -> xr.Dataset: pass - def rgba(self, xx: xr.Dataset) -> Optional[xr.DataArray]: + def rgba(self, xx: xr.Dataset) -> xr.DataArray | None: """ Given result of ``.reduce(..)`` optionally produce RGBA preview image """ diff --git a/odc/stats/plugins/_registry.py b/odc/stats/plugins/_registry.py index f9d8c649..b6757b8d 100644 --- a/odc/stats/plugins/_registry.py +++ b/odc/stats/plugins/_registry.py @@ -1,5 +1,5 @@ import pydoc -from typing import Callable, Dict, Type +from collections.abc import Callable from functools import partial from ._base import StatsPluginInterface @@ -7,7 +7,7 @@ PluginFactory = Callable[..., StatsPluginInterface] -_plugins: Dict[str, PluginFactory] = {} +_plugins: dict[str, PluginFactory] = {} def _new(plugin_class, *args, **kwargs) -> StatsPluginInterface: @@ -30,7 +30,7 @@ def resolve(name: str) -> PluginFactory: return maker -def register(name: str, plugin_class: Type[StatsPluginInterface]): +def register(name: str, plugin_class: type[StatsPluginInterface]): _plugins[name] = partial(_new, plugin_class) diff --git a/odc/stats/plugins/fc_percentiles.py b/odc/stats/plugins/fc_percentiles.py index c70375dd..46e704d4 100644 --- a/odc/stats/plugins/fc_percentiles.py +++ b/odc/stats/plugins/fc_percentiles.py @@ -4,7 +4,7 @@ from functools import partial from itertools import product -from typing import Tuple, Dict, Iterable, Optional +from collections.abc import Iterable import numpy as np import xarray as xr @@ -23,15 +23,16 @@ class StatsFCP(StatsPluginInterface): VERSION = "0.0.3" PRODUCT_FAMILY = "fc_percentiles" - BAD_BITS_MASK = dict(cloud=(1 << 6), cloud_shadow=(1 << 5), terrain_shadow=(1 << 3)) + BAD_BITS_MASK = {"cloud": 1 << 6, "cloud_shadow": 1 << 5, "terrain_shadow": 1 << 3} def __init__( self, - max_sum_limit: Optional[int] = None, - clip_range: Optional[Tuple] = None, - ue_threshold: Optional[int] = None, - count_valid: Optional[bool] = False, - cloud_filters: Dict[str, Iterable[Tuple[str, int]]] = None, + *, + max_sum_limit: int | None = None, + clip_range: tuple | None = None, + ue_threshold: int | None = None, + count_valid: bool | None = False, + cloud_filters: dict[str, Iterable[tuple[str, int]]] = None, **kwargs, ): super().__init__(input_bands=["water", "pv", "bs", "npv", "ue"], **kwargs) @@ -43,7 +44,7 @@ def __init__( 
self.cloud_filters = cloud_filters if cloud_filters is not None else {} @property - def measurements(self) -> Tuple[str, ...]: + def measurements(self) -> tuple[str, ...]: _measurements = [ f"{b}_pc_{p}" for b, p in product(["pv", "bs", "npv"], ["10", "50", "90"]) ] diff --git a/odc/stats/plugins/gm.py b/odc/stats/plugins/gm.py index 301eae80..f12ce734 100644 --- a/odc/stats/plugins/gm.py +++ b/odc/stats/plugins/gm.py @@ -2,7 +2,7 @@ Geomedian """ -from typing import Optional, Union, Tuple, Iterable, Dict +from collections.abc import Iterable import xarray as xr from odc.algo import geomedian_with_mads from ._registry import StatsPluginInterface, register @@ -21,16 +21,15 @@ class StatsGM(StatsPluginInterface): def __init__( self, - bands: Tuple[str, ...], + bands: tuple[str, ...], mask_band: str, - contiguity_band: Optional[str] = None, - nodata_classes: Optional[Tuple[str, ...]] = None, - cloud_filters: Dict[ - Union[str, Tuple[str, ...]], Iterable[Tuple[str, int]] - ] = None, + *, + contiguity_band: str | None = None, + nodata_classes: tuple[str, ...] | None = None, + cloud_filters: dict[str | tuple[str, ...], Iterable[tuple[str, int]]] = None, basis_band=None, - aux_names: Dict[str, str] = None, - work_chunks: Tuple[int, int] = (400, 400), + aux_names: dict[str, str] = None, + work_chunks: tuple[int, int] = (400, 400), **kwargs, ): aux_names = ( @@ -71,7 +70,7 @@ def __init__( self._work_chunks = work_chunks @property - def measurements(self) -> Tuple[str, ...]: + def measurements(self) -> tuple[str, ...]: return self.bands + self.aux_bands def native_transform(self, xx: xr.Dataset) -> xr.Dataset: @@ -137,13 +136,12 @@ class StatsGMS2(StatsGM): def __init__( self, - bands: Optional[Tuple[str, ...]] = None, + *, + bands: tuple[str, ...] | None = None, mask_band: str = "SCL", - nodata_classes: Optional[Tuple[str, ...]] = ("no data",), - cloud_filters: Dict[ - Union[str, Tuple[str, ...]], Iterable[Tuple[str, int]] - ] = None, - aux_names: Dict[str, str] = None, + nodata_classes: tuple[str, ...] | None = ("no data",), + cloud_filters: dict[str | tuple[str, ...], Iterable[tuple[str, int]]] = None, + aux_names: dict[str, str] = None, rgb_bands=None, **kwargs, ): @@ -204,14 +202,13 @@ class StatsGMLS(StatsGM): def __init__( self, - bands: Optional[Tuple[str, ...]] = None, + *, + bands: tuple[str, ...] | None = None, mask_band: str = "fmask", contiguity_band: str = "nbart_contiguity", - nodata_classes: Optional[Tuple[str, ...]] = ("nodata",), - cloud_filters: Dict[ - Union[str, Tuple[str, ...]], Iterable[Tuple[str, int]] - ] = None, - aux_names: Dict[str, str] = None, + nodata_classes: tuple[str, ...] 
| None = ("nodata",), + cloud_filters: dict[str | tuple[str, ...], Iterable[tuple[str, int]]] = None, + aux_names: dict[str, str] = None, rgb_bands=None, **kwargs, ): @@ -256,7 +253,7 @@ def __init__( ) @property - def measurements(self) -> Tuple[str, ...]: + def measurements(self) -> tuple[str, ...]: return ( tuple(b for b in self.bands if b != self._contiguity_band) + self.aux_bands ) diff --git a/odc/stats/plugins/gm_ls_bitmask.py b/odc/stats/plugins/gm_ls_bitmask.py index 8577ca3c..5451dad8 100644 --- a/odc/stats/plugins/gm_ls_bitmask.py +++ b/odc/stats/plugins/gm_ls_bitmask.py @@ -3,7 +3,8 @@ """ from functools import partial -from typing import Iterable, Optional, Sequence, Tuple, Dict, Any +from typing import Any +from collections.abc import Iterable, Sequence import xarray as xr import numpy as np @@ -20,16 +21,17 @@ class StatsGMLSBitmask(StatsPluginInterface): def __init__( # pylint:disable=too-many-arguments self, - bands: Optional[Sequence[str]] = None, + *, + bands: Sequence[str] | None = None, mask_band: str = "QA_PIXEL", # provide flags with high cloud bits definition - flags: Dict[str, Optional[Any]] = None, - nodata_flags: Dict[str, Optional[Any]] = None, - filters: Optional[ - Iterable[Tuple[str, int]] - ] = None, # e.g. [("closing", 10),("opening", 2),("dilation", 2)] + flags: dict[str, Any | None] = None, + nodata_flags: dict[str, Any | None] = None, + filters: None | ( + Iterable[tuple[str, int]] + ) = None, # e.g. [("closing", 10),("opening", 2),("dilation", 2)] aux_names=None, - work_chunks: Tuple[int, int] = (400, 400), + work_chunks: tuple[int, int] = (400, 400), scale: float = 0.0000275, offset: float = -0.2, output_scale: int = 10000, # gm rescaling - making SR range match sentinel-2 gm @@ -37,14 +39,16 @@ def __init__( # pylint:disable=too-many-arguments **kwargs, ): if aux_names is None: - aux_names = dict(smad="smad", emad="emad", bcmad="bcmad", count="count") + aux_names = { + "smad": "smad", + "emad": "emad", + "bcmad": "bcmad", + "count": "count", + } if nodata_flags is None: - nodata_flags = dict(nodata=False) + nodata_flags = {"nodata": False} if flags is None: - flags = dict( - cloud="high_confidence", - cirrus="high_confidence", - ) + flags = {"cloud": "high_confidence", "cirrus": "high_confidence"} if bands is None: self.bands = ( "red", @@ -71,7 +75,7 @@ def __init__( # pylint:disable=too-many-arguments self.output_nodata = 0 @property - def measurements(self) -> Tuple[str, ...]: + def measurements(self) -> tuple[str, ...]: return self.bands + self.aux_bands def native_transform(self, xx): @@ -133,17 +137,17 @@ def native_transform(self, xx): def reduce(self, xx: xr.Dataset) -> xr.Dataset: cloud_mask = xx["cloud_mask"] - cfg = dict( - maxiters=1000, - num_threads=1, - scale=self.scale, - offset=self.offset, - reshape_strategy="mem", - out_chunks=(-1, -1, -1), - work_chunks=self.work_chunks, - compute_count=True, - compute_mads=True, - ) + cfg = { + "maxiters": 1000, + "num_threads": 1, + "scale": self.scale, + "offset": self.offset, + "reshape_strategy": "mem", + "out_chunks": (-1, -1, -1), + "work_chunks": self.work_chunks, + "compute_count": True, + "compute_mads": True, + } if self.filters is not None: cloud_mask = mask_cleanup(xx["cloud_mask"], mask_filters=self.filters) diff --git a/odc/stats/plugins/lc_fc_wo_a0.py b/odc/stats/plugins/lc_fc_wo_a0.py index 81e9bb30..c30f3418 100644 --- a/odc/stats/plugins/lc_fc_wo_a0.py +++ b/odc/stats/plugins/lc_fc_wo_a0.py @@ -3,7 +3,7 @@ """ from functools import partial -from typing import Tuple, Dict, 
Iterable, Optional +from collections.abc import Iterable import numpy as np import xarray as xr @@ -34,9 +34,9 @@ class StatsVegCount(StatsPluginInterface): def __init__( self, - ue_threshold: Optional[int] = None, - veg_threshold: Optional[int] = None, - cloud_filters: Dict[str, Iterable[Tuple[str, int]]] = None, + ue_threshold: int | None = None, + veg_threshold: int | None = None, + cloud_filters: dict[str, Iterable[tuple[str, int]]] = None, **kwargs, ): super().__init__(input_bands=["water", "pv", "bs", "npv", "ue"], **kwargs) @@ -137,7 +137,6 @@ def native_transform(self, xx): return xx def fuser(self, xx): - wet_clear = xx["wet_clear"] wet_valid = xx["wet_valid"] @@ -307,7 +306,6 @@ def _max_consecutive_months(self, data, nodata, normalize=False): return max_count def reduce(self, xx: xr.Dataset) -> xr.Dataset: - xx = xx.groupby("time.month").map(median_ds, dim="spec") # consecutive observation of veg @@ -349,7 +347,7 @@ def reduce(self, xx: xr.Dataset) -> xr.Dataset: self.measurements, [max_count_veg, max_count_water, wet_percent] ) } - coords = dict((dim, xx.coords[dim]) for dim in xx["pv"].dims[1:]) + coords = {dim: xx.coords[dim] for dim in xx["pv"].dims[1:]} return xr.Dataset(data_vars=data_vars, coords=coords, attrs=xx.attrs) diff --git a/odc/stats/plugins/lc_level34.py b/odc/stats/plugins/lc_level34.py index 6eb47745..4e9dc187 100644 --- a/odc/stats/plugins/lc_level34.py +++ b/odc/stats/plugins/lc_level34.py @@ -2,8 +2,6 @@ Plugin of Module A3 in LandCover PipeLine """ -from typing import Optional, Dict, List - import xarray as xr import s3fs import os @@ -30,11 +28,11 @@ def __init__( self, *, class_def_path: str = None, - class_condition: Dict[str, List] = None, + class_condition: dict[str, list] = None, urban_mask: str = None, filter_expression: str = None, - mask_threshold: Optional[float] = None, - data_var_condition: Optional[Dict] = None, + mask_threshold: float | None = None, + data_var_condition: dict | None = None, **kwargs, ): super().__init__(**kwargs) @@ -148,7 +146,7 @@ def reduce(self, xx: xr.Dataset) -> xr.Dataset: attrs = xx.attrs.copy() attrs["nodata"] = NODATA dims = xx.level_3_4.dims[1:] - coords = dict((dim, xx.coords[dim]) for dim in dims) + coords = {dim: xx.coords[dim] for dim in dims} xx["level3"] = xr.DataArray( level3.squeeze(), dims=dims, attrs=attrs, coords=coords ) diff --git a/odc/stats/plugins/lc_ml_treelite.py b/odc/stats/plugins/lc_ml_treelite.py index b17746fd..2f2203c3 100644 --- a/odc/stats/plugins/lc_ml_treelite.py +++ b/odc/stats/plugins/lc_ml_treelite.py @@ -3,7 +3,7 @@ """ from abc import abstractmethod -from typing import Dict, Sequence, Optional +from collections.abc import Sequence import os import sys @@ -64,10 +64,10 @@ class StatsMLTree(StatsPluginInterface): def __init__( self, - output_classes: Dict, + output_classes: dict, model_path: str, - mask_bands: Optional[Dict] = None, - temporal_coverage: Optional[Dict] = None, + mask_bands: dict | None = None, + temporal_coverage: dict | None = None, **kwargs, ): super().__init__(**kwargs) @@ -122,7 +122,7 @@ def input_data( name=ds.type.name + "_yxt", ).squeeze("spec") - coords = dict((dim, input_array.coords[dim]) for dim in input_array.dims) + coords = {dim: input_array.coords[dim] for dim in input_array.dims} return xr.Dataset(data_vars=data_vars, coords=coords) def impute_missing_values(self, xx: xr.Dataset, image): diff --git a/odc/stats/plugins/lc_tf_urban.py b/odc/stats/plugins/lc_tf_urban.py index 50a7c0f9..fae6ba67 100644 --- a/odc/stats/plugins/lc_tf_urban.py +++ 
b/odc/stats/plugins/lc_tf_urban.py @@ -2,7 +2,7 @@ Plugin of TF urban model in LandCover PipeLine """ -from typing import Dict, Sequence +from collections.abc import Sequence import os import numpy as np @@ -76,7 +76,7 @@ class StatsUrbanClass(StatsPluginInterface): def __init__( self, - output_classes: Dict, + output_classes: dict, model_path: str, crop_size=None, **kwargs, @@ -118,7 +118,7 @@ def input_data( ).squeeze("spec", drop=True) data_vars[ds.type.name] = input_array - coords = dict((dim, input_array.coords[dim]) for dim in input_array.dims) + coords = {dim: input_array.coords[dim] for dim in input_array.dims} return xr.Dataset(data_vars=data_vars, coords=coords) def urban_class(self, input_array): diff --git a/odc/stats/plugins/lc_veg_class_a1.py b/odc/stats/plugins/lc_veg_class_a1.py index 697756e1..48801e53 100644 --- a/odc/stats/plugins/lc_veg_class_a1.py +++ b/odc/stats/plugins/lc_veg_class_a1.py @@ -2,8 +2,6 @@ Plugin of Module A1 in LandCover PipeLine """ -from typing import Optional, Dict - import numpy as np import xarray as xr from odc.algo import expr_eval @@ -53,12 +51,12 @@ class StatsVegClassL1(StatsPluginInterface): def __init__( self, *, - output_classes: Dict, - dem_threshold: Optional[int] = None, - mudflat_threshold: Optional[int] = None, - saltpan_threshold: Optional[int] = None, - water_threshold: Optional[float] = None, - veg_threshold: Optional[int] = None, + output_classes: dict, + dem_threshold: int | None = None, + mudflat_threshold: int | None = None, + saltpan_threshold: int | None = None, + water_threshold: float | None = None, + veg_threshold: int | None = None, **kwargs, ): super().__init__(**kwargs) @@ -233,7 +231,7 @@ def reduce(self, xx: xr.Dataset) -> xr.Dataset: k: xr.DataArray(v, dims=xx["veg_frequency"].dims[1:], attrs=attrs) for k, v in zip(self.measurements, [l3_mask]) } - coords = dict((dim, xx.coords[dim]) for dim in xx["veg_frequency"].dims[1:]) + coords = {dim: xx.coords[dim] for dim in xx["veg_frequency"].dims[1:]} return xr.Dataset(data_vars=data_vars, coords=coords, attrs=xx.attrs) diff --git a/odc/stats/plugins/mangroves.py b/odc/stats/plugins/mangroves.py index e8ede56c..ef05eaea 100644 --- a/odc/stats/plugins/mangroves.py +++ b/odc/stats/plugins/mangroves.py @@ -2,8 +2,6 @@ Mangroves canopy cover classes """ -from typing import Tuple - import numpy as np import xarray as xr import dask @@ -40,7 +38,7 @@ def __init__( super().__init__(input_bands=["pv_pc_10", "qa", "wet_pc_10"], **kwargs) @property - def measurements(self) -> Tuple[str, ...]: + def measurements(self) -> tuple[str, ...]: _measurements = ["canopy_cover_class"] return _measurements diff --git a/odc/stats/plugins/pq.py b/odc/stats/plugins/pq.py index da85fb05..4303fae4 100644 --- a/odc/stats/plugins/pq.py +++ b/odc/stats/plugins/pq.py @@ -2,7 +2,8 @@ Sentinel 2 pixel quality stats """ -from typing import Dict, Iterable, Optional, Tuple, cast +from typing import cast +from collections.abc import Iterable import xarray as xr from odc.algo import enum_to_bool, mask_cleanup @@ -33,9 +34,9 @@ class StatsPQ(StatsPluginInterface): def __init__( self, - filters: Optional[Dict[str, Iterable[Tuple[str, int]]]] = None, + filters: dict[str, Iterable[tuple[str, int]]] | None = None, resampling: str = "nearest", - **kwargs + **kwargs, ): super().__init__(input_bands=["SCL"], resampling=resampling, **kwargs) if filters is None: @@ -43,7 +44,7 @@ def __init__( self.filters = filters @property - def measurements(self) -> Tuple[str, ...]: + def measurements(self) -> tuple[str, ...]: 
measurements = ["total", "clear", *list(self.filters)] return tuple(measurements) @@ -83,7 +84,7 @@ def native_transform(self, xx: xr.Dataset) -> xr.Dataset: valid = scl != scl.nodata erased = enum_to_bool(scl, cloud_classes) return xr.Dataset( - dict(valid=valid, erased=erased), + {"valid": valid, "erased": erased}, attrs={"native": True}, # <- native flag needed for fuser ) @@ -96,7 +97,7 @@ def reduce(self, xx: xr.Dataset) -> xr.Dataset: valid = xx.valid erased_bands = [str(n) for n in xx.data_vars if str(n).startswith("erased")] total = valid.sum(axis=0, dtype="uint16") - pq = xr.Dataset(dict(total=total)) + pq = xr.Dataset({"total": total}) for band in erased_bands: erased: xr.DataArray = cast(xr.DataArray, xx[band]) diff --git a/odc/stats/plugins/pq_bitmask.py b/odc/stats/plugins/pq_bitmask.py index b408bf96..330bfad5 100644 --- a/odc/stats/plugins/pq_bitmask.py +++ b/odc/stats/plugins/pq_bitmask.py @@ -34,7 +34,7 @@ """ from functools import partial -from typing import Dict, Optional, Tuple, Iterable +from collections.abc import Iterable import dask.array as da import xarray as xr @@ -54,23 +54,21 @@ class StatsPQLSBitmask(StatsPluginInterface): def __init__( self, + *, pq_band: str = "QA_PIXEL", - aerosol_band: Optional[str] = None, + aerosol_band: str | None = None, # provide flags with high cloud bits definition flags=None, nodata_flags=None, - filters: Optional[Dict[str, Iterable[Tuple[str, int]]]] = None, - aerosol_filters: Optional[Dict[str, Iterable[Tuple[str, int]]]] = None, + filters: dict[str, Iterable[tuple[str, int]]] | None = None, + aerosol_filters: dict[str, Iterable[tuple[str, int]]] | None = None, resampling: str = "nearest", **kwargs, ): if nodata_flags is None: - nodata_flags = dict(nodata=False) + nodata_flags = {"nodata": False} if flags is None: - flags = dict( - cloud="high_confidence", - cirrus="high_confidence", - ) + flags = {"cloud": "high_confidence", "cirrus": "high_confidence"} self.pq_band = pq_band self.aerosol_band = aerosol_band input_bands = [self.pq_band] @@ -83,7 +81,7 @@ def __init__( self.aerosol_filters = aerosol_filters or {} @property - def measurements(self) -> Tuple[str, ...]: + def measurements(self) -> tuple[str, ...]: """ Output product measurements """ diff --git a/odc/stats/plugins/tcw_percentiles.py b/odc/stats/plugins/tcw_percentiles.py index c635b919..ae35511d 100644 --- a/odc/stats/plugins/tcw_percentiles.py +++ b/odc/stats/plugins/tcw_percentiles.py @@ -3,7 +3,7 @@ """ from functools import partial -from typing import Sequence, Tuple, Iterable, Dict +from collections.abc import Sequence, Iterable import xarray as xr import numpy as np import logging @@ -29,10 +29,10 @@ class StatsTCWPC(StatsPluginInterface): def __init__( self, - coefficients: Dict[str, Dict[str, float]] = None, + coefficients: dict[str, dict[str, float]] = None, input_bands: Sequence[str] = None, output_bands: Sequence[str] = None, - cloud_filters: Dict[str, Iterable[Tuple[str, int]]] = None, + cloud_filters: dict[str, Iterable[tuple[str, int]]] = None, **kwargs, ): self.cloud_filters = cloud_filters @@ -101,7 +101,7 @@ def __init__( ) @property - def measurements(self) -> Tuple[str, ...]: + def measurements(self) -> tuple[str, ...]: _measurments = [] for band in self.output_bands: _measurments += [f"{band}_pc_10", f"{band}_pc_50", f"{band}_pc_90"] diff --git a/odc/stats/plugins/wofs.py b/odc/stats/plugins/wofs.py index bd552b8d..4753bc76 100644 --- a/odc/stats/plugins/wofs.py +++ b/odc/stats/plugins/wofs.py @@ -17,7 +17,7 @@ """ -from typing import Dict, Tuple, 
Iterable +from collections.abc import Iterable import numpy as np import xarray as xr from odc.algo import safe_div, apply_numexpr, keep_good_only @@ -53,13 +53,13 @@ class StatsWofs(StatsPluginInterface): } # Cloud/Shadow + Terrain Shadow def __init__( - self, cloud_filters: Dict[str, Iterable[Tuple[str, int]]] = None, **kwargs + self, cloud_filters: dict[str, Iterable[tuple[str, int]]] = None, **kwargs ): super().__init__(input_bands=["water"], **kwargs) self.cloud_filters = cloud_filters if cloud_filters is not None else {} @property - def measurements(self) -> Tuple[str, ...]: + def measurements(self) -> tuple[str, ...]: return "count_wet", "count_clear", "frequency" def native_transform(self, xx): @@ -176,7 +176,7 @@ def __init__(self, **kwargs): super().__init__(input_bands=["count_wet", "count_clear"]) @property - def measurements(self) -> Tuple[str, ...]: + def measurements(self) -> tuple[str, ...]: return "count_wet", "count_clear", "frequency" def fuser(self, xx): diff --git a/odc/stats/proc.py b/odc/stats/proc.py index 3b326856..14509c84 100644 --- a/odc/stats/proc.py +++ b/odc/stats/proc.py @@ -1,13 +1,6 @@ import logging -from typing import ( - Iterable, - Iterator, - Optional, - List, - Any, - Tuple, - Union, -) +from typing import Any +from collections.abc import Iterable, Iterator from dask.distributed import Client, WorkerPlugin from datetime import datetime, timezone import xarray as xr @@ -41,8 +34,8 @@ class TaskRunner: def __init__( self, cfg: TaskRunnerConfig, - resolution: Optional[Tuple[float, float]] = None, - from_sqs: Optional[str] = "", + resolution: tuple[float, float] | None = None, + from_sqs: str | None = "", ): """ """ _log = logging.getLogger(__name__) @@ -91,7 +84,7 @@ def _init_dask(self) -> Client: if nthreads <= 0: nthreads = get_max_cpu() - memory_limit: Union[str, int] = cfg.memory_limit + memory_limit: str | int = cfg.memory_limit if memory_limit == "": mem_1g = 1 << 30 memory_limit = get_max_mem() @@ -137,8 +130,8 @@ def verify_setup(self) -> bool: # pylint: disable=import-outside-toplevel def tasks( self, - tasks: List[str], - ds_filters: Optional[str] = None, + tasks: list[str], + ds_filters: str | None = None, ) -> Iterator[Task]: from ._cli_common import parse_all_tasks @@ -153,9 +146,9 @@ def tasks( # pylint: enable=import-outside-toplevel def dry_run( self, - tasks: List[str], + tasks: list[str], check_exists: bool = True, - ds_filters: Optional[str] = None, + ds_filters: str | None = None, ) -> Iterator[TaskResult]: sink = self.sink overwrite = self._cfg.overwrite @@ -176,7 +169,7 @@ def dry_run( skipped = (overwrite is False) and (exists is True) nds = len(task.datasets) # TODO: take care of utc offset for day boundaries when computing ndays - ndays = len(set(ds.center_time.date() for ds in task.datasets)) + ndays = len({ds.center_time.date() for ds in task.datasets}) flag = flag_mapping.get(exists, "") msg = f"{task.location} days={ndays:03} ds={nds:04} {uri}{flag}" @@ -248,7 +241,7 @@ def _run(self, tasks: Iterable[Task], apply_eodatasets3) -> Iterator[TaskResult] _log.debug("Submitting to Dask (%s)", task.location) ds = client.persist(ds, fifo_timeout="1ms") - aux: Optional[xr.Dataset] = None + aux: xr.Dataset | None = None # if no rgba setting in cog_ops:overrides, no rgba tif as ouput if "overrides" in cfg.cog_opts and "rgba" in cfg.cog_opts["overrides"]: @@ -298,10 +291,10 @@ def _run(self, tasks: Iterable[Task], apply_eodatasets3) -> Iterator[TaskResult] # pylint: enable=too-many-locals, too-many-branches, too-many-statements def 
run( self, - tasks: Optional[List[str]] = None, - sqs: Optional[str] = None, - ds_filters: Optional[str] = None, - apply_eodatasets3: Optional[bool] = False, + tasks: list[str] | None = None, + sqs: str | None = None, + ds_filters: str | None = None, + apply_eodatasets3: bool | None = False, ) -> Iterator[TaskResult]: cfg = self._cfg _log = self._log @@ -355,7 +348,7 @@ def get_max_cpu() -> int: return psutil.cpu_count() -def get_cpu_quota() -> Optional[float]: +def get_cpu_quota() -> float | None: """ :returns: ``None`` if unconstrained or there is an error :returns: maximum amount of CPU this pod is allowed to use @@ -369,7 +362,7 @@ def get_cpu_quota() -> Optional[float]: return quota / period -def get_mem_quota() -> Optional[int]: +def get_mem_quota() -> int | None: """ :returns: ``None`` if there was some error :returns: maximum RAM, in bytes, this pod can use according to Linux cgroups diff --git a/odc/stats/tasks.py b/odc/stats/tasks.py index 79fd68b2..ebce4f8e 100644 --- a/odc/stats/tasks.py +++ b/odc/stats/tasks.py @@ -1,5 +1,6 @@ import random -from typing import Optional, Tuple, Union, Callable, Any, Dict, List, Iterable, Iterator +from typing import Any +from collections.abc import Callable, Iterable, Iterator from types import SimpleNamespace from collections import namedtuple from datetime import datetime @@ -44,7 +45,7 @@ ) from ._stac_fetch import s3_fetch_dss -TilesRange2d = Tuple[Tuple[int, int], Tuple[int, int]] +TilesRange2d = tuple[tuple[int, int], tuple[int, int]] CompressedDataset = namedtuple("CompressedDataset", ["id", "time"]) _log = logging.getLogger(__name__) @@ -58,7 +59,7 @@ def compress_ds(ds: Dataset) -> CompressedDataset: return CompressedDataset(ds.id, dt) -def is_tile_in(tidx: Tuple[int, int], tiles: TilesRange2d) -> bool: +def is_tile_in(tidx: tuple[int, int], tiles: TilesRange2d) -> bool: (x0, x1), (y0, y1) = tiles x, y = tidx return (x0 <= x < x1) and (y0 <= y < y1) @@ -94,7 +95,7 @@ def parse_task(s: str) -> TileIdx_txy: return (t, int(x.lstrip("x")), int(y.lstrip("y"))) -def render_sqs(tidx: TileIdx_txy, filedb: str) -> Dict[str, str]: +def render_sqs(tidx: TileIdx_txy, filedb: str) -> dict[str, str]: """ Add extra layer to render task. Convert it to JSON for SQS message body. """ @@ -102,7 +103,7 @@ def render_sqs(tidx: TileIdx_txy, filedb: str) -> Dict[str, str]: return {"filedb": filedb, "tile_idx": f"{period}/{xi:02d}/{yi:02d}"} -def parse_sqs(s: str) -> Tuple[TileIdx_txy, str]: +def parse_sqs(s: str) -> tuple[TileIdx_txy, str]: """ Add extra layer to parse task. Convert it from JSON for SQS message body. 
""" @@ -159,6 +160,7 @@ def __init__( self, output: str, grid: str, + *, frequency: str = "annual", overwrite: bool = False, complevel: int = 6, @@ -183,8 +185,9 @@ def ds_align( cls, dss: Iterable, group_size: int, - dss_extra: Optional[Iterable] = None, - optional_products: Optional[Iterable] = None, + *, + dss_extra: Iterable | None = None, + optional_products: Iterable | None = None, fuse_dss: bool = True, ): def pack_dss(grouped_dss, group_size): @@ -282,13 +285,14 @@ def _find_dss( cls, dc: Datacube, products: str, - query: Dict[str, Any], - cfg: Dict[str, Any], + query: dict[str, Any], + cfg: dict[str, Any], + *, dataset_filter=None, predicate=None, fuse_dss: bool = True, - ignore_time: Optional[Iterable] = None, - optional_products: Optional[Iterable] = None, + ignore_time: Iterable | None = None, + optional_products: Iterable | None = None, ): """ query and filter the datasets with a string composed by products name @@ -369,7 +373,11 @@ def _find_dss( if group_size > 0: dss = cls.ds_align( - dss, group_size + 1, dss_extra, optional_products, fuse_dss + dss, + group_size + 1, + dss_extra=dss_extra, + optional_products=optional_products, + fuse_dss=fuse_dss, ) if predicate is not None: @@ -380,12 +388,11 @@ def _find_dss( @classmethod def create_dss_by_stac( cls, - s3_path: List[str], + s3_path: list[str], pattern: str = "*.stac-item.json", tiles=None, temporal_range=None, ): - if tiles is not None: glob_path = [ "x" + str(x) + "/" + "y" + str(y) + "/" + "*" @@ -429,12 +436,13 @@ def get_dss_by_grid( dc: Datacube, products: str, msg: Callable[[str], Any], + *, dataset_filter=None, predicate=None, - temporal_range: Optional[DateTimeRange] = None, - tiles: Optional[TilesRange2d] = None, - ignore_time: Optional[Iterable] = None, - optional_products: Optional[Iterable] = None, + temporal_range: DateTimeRange | None = None, + tiles: TilesRange2d | None = None, + ignore_time: Iterable | None = None, + optional_products: Iterable | None = None, ): """ This returns a tuple containing: @@ -444,7 +452,7 @@ def get_dss_by_grid( """ # pylint:disable=too-many-locals - cfg: Dict[str, Any] = { + cfg: dict[str, Any] = { "grid": self._grid, "freq": self._frequency, } @@ -469,8 +477,8 @@ def get_dss_by_grid( products, query, cfg, - dataset_filter, - predicate, + dataset_filter=dataset_filter, + predicate=predicate, ignore_time=ignore_time, optional_products=optional_products, ) @@ -485,13 +493,14 @@ def save( self, dc: Datacube, products: str, + *, dataset_filter=None, - temporal_range: Union[str, DateTimeRange, None] = None, - tiles: Optional[TilesRange2d] = None, - predicate: Optional[Callable[[Dataset], bool]] = None, - ignore_time: Optional[Iterable] = None, - optional_products: Optional[Iterable] = None, - msg: Optional[Callable[[str], Any]] = None, + temporal_range: str | DateTimeRange | None = None, + tiles: TilesRange2d | None = None, + predicate: Callable[[Dataset], bool] | None = None, + ignore_time: Iterable | None = None, + optional_products: Iterable | None = None, + msg: Callable[[str], Any] | None = None, debug: bool = False, ) -> bool: """ @@ -535,12 +544,12 @@ def msg_default(msg): dc, products, msg, - dataset_filter, - predicate, - temporal_range, - tiles, - ignore_time, - optional_products, + dataset_filter=dataset_filter, + predicate=predicate, + temporal_range=temporal_range, + tiles=tiles, + ignore_time=ignore_time, + optional_products=optional_products, ) dss_slice = list(islice(dss, 0, 100)) @@ -567,7 +576,7 @@ def msg_default(msg): cache.add_grid(self._gridspec, self._grid) 
cache.append_info_dict("stats/", {"config": cfg}) - cells: Dict[Tuple[int, int], Any] = {} + cells: dict[tuple[int, int], Any] = {} dss = cache.tee(dss) dss = bin_dataset_stream(self._gridspec, dss, cells, persist=persist) @@ -649,13 +658,13 @@ def _write_info(self, tasks, msg, cells, debug): # pylint:disable=too-many-locals csv_path = self.out_path(".csv") msg(f"Writing summary to {csv_path}") - with open(csv_path, "wt", encoding="utf8") as f: + with open(csv_path, "w", encoding="utf8") as f: f.write('"T","X","Y","datasets","days"\n') for p, x, y in sorted(tasks): dss = tasks[(p, x, y)] n_dss = len(dss) - n_days = len(set(ds.time.date() for ds in dss)) + n_days = len({ds.time.date() for ds in dss}) line = f'"{p}", {x:+05d}, {y:+05d}, {n_dss:4d}, {n_days:4d}\n' f.write(line) @@ -667,7 +676,7 @@ def _write_info(self, tasks, msg, cells, debug): for temporal_range, gjson in tasks_geo.items(): fname = self.out_path(f"-{temporal_range}.geojson") msg(f"..writing to {fname}") - with open(fname, "wt", encoding="utf8") as f: + with open(fname, "w", encoding="utf8") as f: json.dump(gjson, f) if debug: @@ -687,9 +696,9 @@ def _write_info(self, tasks, msg, cells, debug): class TaskReader: def __init__( self, - cache: Union[str, DatasetCache], - product: Optional[OutputProduct] = None, - resolution: Optional[Tuple[float, float]] = None, + cache: str | DatasetCache, + product: OutputProduct | None = None, + resolution: tuple[float, float] | None = None, ): self._cache_path = None self.s3_client = S3Client() @@ -717,7 +726,7 @@ def __init__( self._dscache = cache self._cfg = cfg - def is_compatible_resolution(self, resolution: Tuple[float, float], tol=1e-8): + def is_compatible_resolution(self, resolution: tuple[float, float], tol=1e-8): for res, sz in zip(resolution, self._gridspec.tile_size): res = abs(res) npix = int(sz / res) @@ -725,7 +734,7 @@ def is_compatible_resolution(self, resolution: Tuple[float, float], tol=1e-8): return False return True - def change_resolution(self, resolution: Tuple[float, float]): + def change_resolution(self, resolution: tuple[float, float]): """ Modify GridSpec to have different pixel resolution but still covering same tiles as the original. 
""" @@ -784,7 +793,7 @@ def __repr__(self) -> str: grid, path, n = self._grid, str(self._dscache.path), len(self._all_tiles) return f"<{path}> grid:{grid} n:{n:,d}" - def _resolve_product(self, product: Optional[OutputProduct]) -> OutputProduct: + def _resolve_product(self, product: OutputProduct | None) -> OutputProduct: if product is None: product = self._product @@ -805,10 +814,10 @@ def product(self) -> OutputProduct: return self._resolve_product(None) @property - def all_tiles(self) -> List[TileIdx_txy]: + def all_tiles(self) -> list[TileIdx_txy]: return self._all_tiles - def datasets(self, tile_index: TileIdx_txy) -> Tuple[Dataset, ...]: + def datasets(self, tile_index: TileIdx_txy) -> tuple[Dataset, ...]: return tuple( ds for ds in self._dscache.stream_grid_tile(tile_index, self._grid) ) @@ -816,9 +825,9 @@ def datasets(self, tile_index: TileIdx_txy) -> Tuple[Dataset, ...]: def load_task( self, tile_index: TileIdx_txy, - product: Optional[OutputProduct] = None, + product: OutputProduct | None = None, source: Any = None, - ds_filters: Optional[str] = None, + ds_filters: str | None = None, ) -> Task: product = self._resolve_product(product) @@ -840,8 +849,8 @@ def load_task( def stream( self, tiles: Iterable[TileIdx_txy], - product: Optional[OutputProduct] = None, - ds_filters: Optional[str] = None, + product: OutputProduct | None = None, + ds_filters: str | None = None, ) -> Iterator[Task]: product = self._resolve_product(product) for tidx in tiles: @@ -850,9 +859,9 @@ def stream( def stream_from_sqs( self, sqs_queue, - product: Optional[OutputProduct] = None, + product: OutputProduct | None = None, visibility_timeout: int = 300, - ds_filters: Optional[str] = None, + ds_filters: str | None = None, **kw, ) -> Iterator[Task]: from odc.aws.queue import get_messages, get_queue diff --git a/odc/stats/utils.py b/odc/stats/utils.py index c3f19858..13de5c60 100644 --- a/odc/stats/utils.py +++ b/odc/stats/utils.py @@ -1,5 +1,6 @@ import toolz -from typing import Dict, Tuple, List, Any, Callable, Optional +from typing import Any +from collections.abc import Callable from collections import namedtuple, defaultdict from datetime import datetime from dateutil.relativedelta import relativedelta @@ -14,14 +15,14 @@ def _bin_generic( - dss: List[CompressedDataset], bins: List[DateTimeRange] -) -> Dict[str, List[CompressedDataset]]: + dss: list[CompressedDataset], bins: list[DateTimeRange] +) -> dict[str, list[CompressedDataset]]: """ Dumb O(NM) implementation, N number of dataset, M number of bins. For every bin find all datasets that fall in there, and if not empty keep that bin. 
""" - out: Dict[str, List[CompressedDataset]] = {} + out: dict[str, list[CompressedDataset]] = {} for b in bins: _dss = [ds for ds in dss if ds.time in b] if len(_dss) > 0: @@ -31,9 +32,9 @@ def _bin_generic( def bin_generic( - cells: Dict[Tuple[int, int], Cell], bins: List[DateTimeRange] -) -> Dict[Tuple[str, int, int], List[CompressedDataset]]: - tasks: Dict[Tuple[str, int, int], List[CompressedDataset]] = {} + cells: dict[tuple[int, int], Cell], bins: list[DateTimeRange] +) -> dict[tuple[str, int, int], list[CompressedDataset]]: + tasks: dict[tuple[str, int, int], list[CompressedDataset]] = {} for tidx, cell in cells.items(): _bins = _bin_generic(cell.dss, bins) for t, dss in _bins.items(): @@ -43,11 +44,11 @@ def bin_generic( def bin_seasonal( - cells: Dict[Tuple[int, int], Cell], + cells: dict[tuple[int, int], Cell], months: int, anchor: int, extract_single_season=False, -) -> Dict[Tuple[str, int, int], List[CompressedDataset]]: +) -> dict[tuple[str, int, int], list[CompressedDataset]]: # mk_single_season_rules is different from mk_season_rules # because the mk_season_rules will split the whole year to 2/3/4 seasons # but mk_single_season_rules only extract a single season from the whole year @@ -72,11 +73,11 @@ def bin_seasonal( def bin_rolling_seasonal( - cells: Dict[Tuple[int, int], Cell], + cells: dict[tuple[int, int], Cell], temporal_range, months: int, interval: int, -) -> Dict[Tuple[str, int, int], List[CompressedDataset]]: +) -> dict[tuple[str, int, int], list[CompressedDataset]]: binner = rolling_season_binner( mk_rolling_season_rules(temporal_range, months, interval) ) @@ -102,16 +103,16 @@ def bin_rolling_seasonal( def bin_full_history( - cells: Dict[Tuple[int, int], Cell], start: datetime, end: datetime -) -> Dict[Tuple[str, int, int], List[CompressedDataset]]: + cells: dict[tuple[int, int], Cell], start: datetime, end: datetime +) -> dict[tuple[str, int, int], list[CompressedDataset]]: duration = end.year - start.year + 1 temporal_key = (f"{start.year}--P{duration}Y",) return {temporal_key + k: cell.dss for k, cell in cells.items()} def bin_annual( - cells: Dict[Tuple[int, int], Cell], -) -> Dict[Tuple[str, int, int], List[CompressedDataset]]: + cells: dict[tuple[int, int], Cell], +) -> dict[tuple[str, int, int], list[CompressedDataset]]: """ Annual binning :param cells: (x,y) -> Cell(dss: List[CompressedDataset], geobox: GeoBox, idx: Tuple[int, int]) @@ -131,7 +132,7 @@ def bin_annual( return tasks -def mk_single_season_rules(months: int, anchor: int) -> Dict[int, str]: +def mk_single_season_rules(months: int, anchor: int) -> dict[int, str]: """ Construct rules for a each year single season summary :param months: Length of season in months can be one of [1, 12] @@ -140,7 +141,7 @@ def mk_single_season_rules(months: int, anchor: int) -> Dict[int, str]: assert 1 <= months <= 12 assert 1 <= anchor <= 12 - rules: Dict[int, str] = {} + rules: dict[int, str] = {} start_month = anchor @@ -157,7 +158,7 @@ def mk_single_season_rules(months: int, anchor: int) -> Dict[int, str]: return rules -def mk_season_rules(months: int, anchor: int) -> Dict[int, str]: +def mk_season_rules(months: int, anchor: int) -> dict[int, str]: """ Construct rules for a regular seasons :param months: Length of season in months can be one of (1,2,3,4,6,12) @@ -166,7 +167,7 @@ def mk_season_rules(months: int, anchor: int) -> Dict[int, str]: assert months in (1, 2, 3, 4, 6, 12) assert 1 <= anchor <= 12 - rules: Dict[int, str] = {} + rules: dict[int, str] = {} for i in range(12 // months): start_month = 
anchor + i * months if start_month > 12: @@ -201,18 +202,18 @@ def mk_rolling_season_rules(temporal_range, months, interval): rules = {} season_start = start_date while ( - DateTimeRange(f'{season_start.strftime("%Y-%m-%d")}--P{months}M').end + DateTimeRange(f"{season_start.strftime('%Y-%m-%d')}--P{months}M").end <= end_date ): - rules[f'{season_start.strftime("%Y-%m")}--P{months}M'] = DateTimeRange( - f'{season_start.strftime("%Y-%m-%d")}--P{months}M' + rules[f"{season_start.strftime('%Y-%m')}--P{months}M"] = DateTimeRange( + f"{season_start.strftime('%Y-%m-%d')}--P{months}M" ) season_start += season_start_interval return rules -def season_binner(rules: Dict[int, str]) -> Callable[[datetime], str]: +def season_binner(rules: dict[int, str]) -> Callable[[datetime], str]: """ Construct mapping from datetime to a string in the form like 2010-06--P3M @@ -221,7 +222,7 @@ def season_binner(rules: Dict[int, str]) -> Callable[[datetime], str]: month of the season and ``N`` is a duration of the season in months. """ - _rules: Dict[int, Tuple[str, int]] = {} + _rules: dict[int, tuple[str, int]] = {} for month in range(1, 12 + 1): season = rules.get(month, "") @@ -241,7 +242,7 @@ def label(dt: datetime) -> str: return label -def rolling_season_binner(rules: Dict[int, str]) -> Callable[[datetime], list]: +def rolling_season_binner(rules: dict[int, str]) -> Callable[[datetime], list]: """ Construct mapping from datetime to a string in the form like 2010-06--P3M @@ -357,7 +358,7 @@ def fuse_products(*ds_types) -> Product: def fuse_ds( *dss, - product: Optional[Product] = None, + product: Product | None = None, ) -> Dataset: """ This function fuses two datasets. It requires that: diff --git a/scripts/patch_version.py b/scripts/patch_version.py index f2e3ba6b..c08e2ae0 100644 --- a/scripts/patch_version.py +++ b/scripts/patch_version.py @@ -32,9 +32,9 @@ def patch_version_lines(lines, patch_number): def patch_file(fname, patch_number): - with open(fname, "rt", encoding="utf-8") as src: + with open(fname, encoding="utf-8") as src: lines = list(patch_version_lines(src, patch_number)) - with open(fname, "wt", encoding="utf-8") as dst: + with open(fname, "w", encoding="utf-8") as dst: dst.writelines(lines) diff --git a/tests/conftest.py b/tests/conftest.py index 3675c165..602ea39f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,6 @@ import pathlib import pytest -from mock import MagicMock +from unittest.mock import MagicMock import boto3 from moto import mock_aws from odc.stats.plugins import register diff --git a/tests/test_fc_percentiles.py b/tests/test_fc_percentiles.py index f4086f2c..a5184532 100644 --- a/tests/test_fc_percentiles.py +++ b/tests/test_fc_percentiles.py @@ -179,9 +179,13 @@ def test_reduce(dataset): print(result) assert (result == expected_result).all() - assert set(xx.data_vars.keys()) == set( - ["band_1_pc_10", "band_1_pc_50", "band_1_pc_90", "qa", "count_valid"] - ) + assert set(xx.data_vars.keys()) == { + "band_1_pc_10", + "band_1_pc_50", + "band_1_pc_90", + "qa", + "count_valid", + } for band_name in xx.data_vars.keys(): if band_name not in ["count_valid"]: diff --git a/tests/test_gm_ls.py b/tests/test_gm_ls.py index cc45504a..91250f87 100644 --- a/tests/test_gm_ls.py +++ b/tests/test_gm_ls.py @@ -115,20 +115,18 @@ def test_result_bands_to_match_inputs(dataset): xx = stats_gmls.native_transform(dataset) result = stats_gmls.reduce(xx) - assert set(result.data_vars.keys()) == set( - [ - "nbart_red", - "nbart_green", - "nbart_blue", - "nbart_nir", - "nbart_swir_1", - 
"nbart_swir_2", - "sdev", - "edev", - "bcdev", - "count", - ] - ) + assert set(result.data_vars.keys()) == { + "nbart_red", + "nbart_green", + "nbart_blue", + "nbart_nir", + "nbart_swir_1", + "nbart_swir_2", + "sdev", + "edev", + "bcdev", + "count", + } def test_result_aux_bands_to_match_inputs(dataset): @@ -152,20 +150,18 @@ def test_result_aux_bands_to_match_inputs(dataset): xx = stats_gmls.native_transform(dataset) result = stats_gmls.reduce(xx) - assert set(result.data_vars.keys()) == set( - [ - "nbart_red", - "nbart_green", - "nbart_blue", - "nbart_nir", - "nbart_swir_1", - "nbart_swir_2", - "SDEV", - "EDEV", - "BCDEV", - "COUNT", - ] - ) + assert set(result.data_vars.keys()) == { + "nbart_red", + "nbart_green", + "nbart_blue", + "nbart_nir", + "nbart_swir_1", + "nbart_swir_2", + "SDEV", + "EDEV", + "BCDEV", + "COUNT", + } def test_resampling(dataset): diff --git a/tests/test_gm_ls_bitmask.py b/tests/test_gm_ls_bitmask.py index d1dba5e5..58201516 100644 --- a/tests/test_gm_ls_bitmask.py +++ b/tests/test_gm_ls_bitmask.py @@ -119,9 +119,7 @@ def test_reduce(dataset): result = xx.compute() - assert set(xx.data_vars.keys()) == set( - ["band_red", "smad", "emad", "bcmad", "count"] - ) + assert set(xx.data_vars.keys()) == {"band_red", "smad", "emad", "bcmad", "count"} assert result["band_red"].dtype == np.uint16 assert result["emad"].dtype == np.uint16 @@ -157,9 +155,7 @@ def test_reduce_with_filters(dataset): result = xx.compute() - assert set(xx.data_vars.keys()) == set( - ["band_red", "smad", "emad", "bcmad", "count"] - ) + assert set(xx.data_vars.keys()) == {"band_red", "smad", "emad", "bcmad", "count"} expected_result = np.array([[229, 36], [48, 58]]) band_red = result["band_red"].data @@ -190,6 +186,4 @@ def test_aux_result_bands_to_match_inputs(dataset): xx = gm.native_transform(dataset) xx = gm.reduce(xx) - assert set(xx.data_vars.keys()) == set( - ["band_red", "SMAD", "EMAD", "BCMAD", "COUNT"] - ) + assert set(xx.data_vars.keys()) == {"band_red", "SMAD", "EMAD", "BCMAD", "COUNT"} diff --git a/tests/test_pq_bitmask.py b/tests/test_pq_bitmask.py index 3e74f7eb..a73ca579 100644 --- a/tests/test_pq_bitmask.py +++ b/tests/test_pq_bitmask.py @@ -179,7 +179,7 @@ def test_reduce(dataset): xx = pq.reduce(xx) reduce_result = xx.compute() - assert set(reduce_result.data_vars.keys()) == set(["total", "clear"]) + assert set(reduce_result.data_vars.keys()) == {"total", "clear"} expected_result = np.array([[2, 3], [3, 2]]) total = reduce_result["total"].data @@ -201,9 +201,12 @@ def test_reduce_with_filter(dataset): xx = pq.reduce(xx) reduce_result = xx.compute() - assert set(reduce_result.data_vars.keys()) == set( - ["total", "clear", "clear_1_1", "clear_2_1_1"] - ) + assert set(reduce_result.data_vars.keys()) == { + "total", + "clear", + "clear_1_1", + "clear_2_1_1", + } expected_result = np.array([[2, 3], [3, 2]]) total = reduce_result["total"].data @@ -265,9 +268,7 @@ def test_reduce_for_aerosol(dataset_with_aerosol_band): xx = pq.reduce(xx) reduce_result = xx.compute() - assert set(reduce_result.data_vars.keys()) == set( - ["total", "clear", "clear_aerosol"] - ) + assert set(reduce_result.data_vars.keys()) == {"total", "clear", "clear_aerosol"} expected_result = np.array([[2, 2], [1, 2]]) clear = reduce_result["clear"].data @@ -294,9 +295,13 @@ def test_reduce_for_aerosol_with_filter(dataset_with_aerosol_band): xx = pq.reduce(xx) reduce_result = xx.compute() - assert set(reduce_result.data_vars.keys()) == set( - ["total", "clear", "clear_0_1_1", "clear_aerosol", "clear_0_1_1_aerosol"] - ) + 
assert set(reduce_result.data_vars.keys()) == { + "total", + "clear", + "clear_0_1_1", + "clear_aerosol", + "clear_0_1_1_aerosol", + } expected_result = np.array([[2, 3], [3, 2]]) clear_1_1_0 = reduce_result["clear_0_1_1"].data @@ -350,9 +355,7 @@ def test_reduce_for_atmos_opacity(dataset_with_atmos_opacity_band): xx = pq.reduce(xx) reduce_result = xx.compute() - assert set(reduce_result.data_vars.keys()) == set( - ["total", "clear", "clear_aerosol"] - ) + assert set(reduce_result.data_vars.keys()) == {"total", "clear", "clear_aerosol"} expected_result = np.array([[2, 2], [1, 2]]) clear = reduce_result["clear"].data diff --git a/tests/test_save_tasks.py b/tests/test_save_tasks.py index 66887805..72f7d2ca 100644 --- a/tests/test_save_tasks.py +++ b/tests/test_save_tasks.py @@ -143,7 +143,7 @@ def test_create_dss_by_stac(s3_path): ) dss = list(dss) assert len(dss) == 4 * 2 * len(s3_path) - products = list(set([d.product for d in dss])) + products = list({d.product for d in dss}) assert len(products) == len(s3_path) for d in dss: with_uris = False diff --git a/tests/test_utils.py b/tests/test_utils.py index a3d54a36..4541733e 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -72,7 +72,7 @@ def verify(task): dss1 = tasks[k] dss2 = tasks_y[k] - assert set(ds.id for ds in dss1) == set(ds.id for ds in dss2) + assert {ds.id for ds in dss1} == {ds.id for ds in dss2} tasks = bin_seasonal(cells, 6, 1) verify(tasks) @@ -320,7 +320,7 @@ def test_fuse_products(wo_definition, fc_definition): def _get_msr_paths(ds): - return set(m["path"] for m in ds.metadata_doc["measurements"].values()) + return {m["path"] for m in ds.metadata_doc["measurements"].values()} def test_fuse_dss(wo_definition, fc_definition): From 224543812cbee9710050ec5cfbe7e43b12781eae Mon Sep 17 00:00:00 2001 From: Emma Ai Date: Thu, 19 Jun 2025 00:00:42 +0000 Subject: [PATCH 37/37] unhacking lineage and metadata assemble --- odc/stats/model.py | 81 ++++++++++++++++++---------------------------- odc/stats/utils.py | 1 - 2 files changed, 32 insertions(+), 50 deletions(-) diff --git a/odc/stats/model.py b/odc/stats/model.py index cd89424d..5997ea3e 100644 --- a/odc/stats/model.py +++ b/odc/stats/model.py @@ -12,6 +12,7 @@ import pystac import xarray as xr from datacube.model import Dataset +from datacube.model.lineage import LineageTree from datacube.utils.dates import normalise_dt from odc.geo.geobox import GeoBox from ._text import split_and_check @@ -20,7 +21,7 @@ from rasterio.crs import CRS import warnings -from eodatasets3.assemble import DatasetAssembler, serialise +from eodatasets3.assemble import DatasetAssembler, serialise, _validate_property_name from eodatasets3.images import GridSpec from .plugins import StatsPluginInterface @@ -313,14 +314,13 @@ def location(self) -> str: return "/".join([p1, p2, self.short_time]) def _lineage(self) -> tuple[UUID, ...]: - ds, *_ = self.datasets - - if ds.metadata_doc["properties"].get("fused", False): - lineage = tuple({x for ds in self.datasets for x in ds.metadata.sources}) - else: - lineage = tuple(ds.id for ds in self.datasets) - - return lineage + lineage = set() + for ds in self.datasets: + tree = LineageTree.from_eo3_doc(ds.metadata_doc) + lineage |= ( + tree.child_datasets() if tree.child_datasets() else {tree.dataset_id} + ) + return tuple(lineage) def _prefix(self, relative_to: str = "dataset") -> str: product = self.product @@ -386,7 +386,7 @@ def render_assembler_metadata( Put together metadata document for the output of this task. 
It needs the source_dataset to inherit several properties and lineages. It also needs the output_dataset to get the measurement information. """ - # pylint:disable=too-many-branches + # pylint:disable=too-many-branches,protected-access dataset_assembler = DatasetAssembler( naming_conventions=self.product.naming_conventions_values, dataset_location=Path(self.product.explorer_path), @@ -398,47 +398,30 @@ def render_assembler_metadata( platforms, instruments = ([], []) + _validate_property_name(self.product.classifier) for dataset in self.datasets: - if dataset.metadata_doc["properties"].get("fused", False): - if dataset.metadata_doc["properties"].get("eo:platform") is not None: - platforms.append(dataset.metadata_doc["properties"]["eo:platform"]) - if dataset.metadata_doc["properties"].get("eo:instrument") is not None: - if isinstance( - dataset.metadata_doc["properties"]["eo:instrument"], list - ): - instruments += dataset.metadata_doc["properties"][ - "eo:instrument" - ] - else: - instruments += [ - dataset.metadata_doc["properties"]["eo:instrument"] - ] - dataset_assembler.note_source_datasets( - self.product.classifier, *dataset.metadata.sources - ) - else: - dataset.metadata_doc.setdefault("$schema", "") - source_datasetdoc = serialise.from_doc( - dataset.metadata_doc, skip_validation=True - ) - dataset_assembler.add_source_dataset( - source_datasetdoc, - classifier=self.product.classifier, - auto_inherit_properties=True, # it will grab all useful input dataset preperties - inherit_geometry=False, - inherit_skip_properties=self.product.inherit_skip_properties, - ) - - if source_datasetdoc.properties.get("eo:platform") is not None: - platforms.append(source_datasetdoc.properties["eo:platform"]) - if source_datasetdoc.properties.get("eo:instrument") is not None: - if isinstance(source_datasetdoc.properties["eo:instrument"], list): - instruments += source_datasetdoc.properties["eo:instrument"] - else: - instruments.append( - source_datasetdoc.properties["eo:instrument"] - ) + if dataset.metadata_doc["properties"].get("eo:platform") is not None: + platforms.append(dataset.metadata_doc["properties"]["eo:platform"]) + if dataset.metadata_doc["properties"].get("eo:instrument") is not None: + if isinstance( + dataset.metadata_doc["properties"]["eo:instrument"], list + ): + instruments += dataset.metadata_doc["properties"]["eo:instrument"] + else: + instruments += [dataset.metadata_doc["properties"]["eo:instrument"]] + + dataset.metadata_doc.setdefault("$schema", "") + source_datasetdoc = serialise.from_doc( + dataset.metadata_doc, skip_validation=True + ) + # it will grab all useful input dataset preperties + dataset_assembler._inherit_properties_from( + source_datasetdoc, self.product.inherit_skip_properties + ) + dataset_assembler.note_source_datasets( + self.product.classifier, *self._lineage() + ) dataset_assembler.platform = ",".join(sorted(set(platforms))) dataset_assembler.instrument = "_".join(sorted(set(instruments))) diff --git a/odc/stats/utils.py b/odc/stats/utils.py index 13de5c60..7e20caab 100644 --- a/odc/stats/utils.py +++ b/odc/stats/utils.py @@ -455,6 +455,5 @@ def fuse_ds( for key, path in {**measurement_paths(ds)}.items(): fused_doc["measurements"][key]["path"] = path - fused_doc["properties"]["fused"] = "True" fused_ds = Dataset(product, prep_eo3(fused_doc), uri="fake") return fused_ds
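
The lineage rework in this final patch collects the source ids of every input dataset and de-duplicates them across the whole task, falling back to a dataset's own id when it has no recorded sources. A rough standalone sketch of that set-union behaviour, using plain UUIDs and dicts in place of the real datacube Dataset / LineageTree objects (the names below are illustrative stand-ins, not the actual API), might look like:

from uuid import UUID, uuid4

def collect_lineage(datasets: list[dict]) -> tuple[UUID, ...]:
    # Each fake "dataset" record carries its own id plus the ids of its
    # source datasets, loosely mimicking what LineageTree.from_eo3_doc()
    # exposes via child_datasets(); when a dataset has no sources we keep
    # the dataset id itself, matching the fallback in _lineage().
    lineage: set[UUID] = set()
    for ds in datasets:
        sources = ds.get("sources") or set()
        lineage |= (sources if sources else {ds["id"]})
    return tuple(lineage)

shared_source = uuid4()
dss = [
    {"id": uuid4(), "sources": {shared_source, uuid4()}},
    {"id": uuid4(), "sources": {shared_source}},  # duplicate source folded in
    {"id": uuid4(), "sources": set()},            # no sources: keep the dataset id
]
print(len(collect_lineage(dss)))  # 3 unique ids; the shared source is counted once

The point of the set union is exactly this de-duplication: a source referenced by several fused inputs appears once in the output dataset's "odc:lineage", which is why the earlier "fused" property hack in fuse_ds is no longer needed.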