def _convex_hull_area(points) -> Optional[float]:
    """
    Determine the area of the 2D convex hull around the given points.

    :param points: array-like of shape ``(n, 2)`` with point coordinates
    :return: area of the convex hull, or ``None`` if it can not be determined:
        fewer than 3 points, all points on a single line (degenerate hull),
        or scipy not being available.
    """
    import numpy as np

    points = np.asarray(points, dtype=float)
    # Qhull fails on degenerate input (e.g. a single row/column of pixels),
    # so detect the "all points on one line" case up front.
    if len(points) < 3 or np.linalg.matrix_rank(points - points[0]) < 2:
        return None
    try:
        # Imported lazily: scipy is only needed for this optional part of the report,
        # so importing this module should not require it.
        from scipy.spatial import ConvexHull
    except ImportError:
        return None
    # For 2D input, `volume` is the enclosed area (`area` would be the perimeter).
    return float(ConvexHull(points).volume)


def _compare_xarray_dataarray_xy(
    actual: Union[xarray.DataArray, str, Path],
    expected: Union[xarray.DataArray, str, Path],
    *,
    rtol: float = _DEFAULT_RTOL,
    atol: float = _DEFAULT_ATOL,
) -> List[str]:
    """
    Compare two xarray DataArrays with tolerance and report mismatch issues (as strings),
    with statistics per spatial ("x"/"y") slice.

    Checks that are done:

    - Dimensions, coordinates and shape must be identical.
      If not, these issues are reported and the value comparison is skipped.
    - For each combination of values of the non-spatial dimensions (e.g. each timestamp or band),
      pixels with ``abs(expected - actual) > abs(expected) * rtol + atol`` are considered different.
      For each slice with differing pixels, the report lists min/max/mean/variance of the differences,
      the number and percentage of differing pixels and, if it can be determined,
      the percentage of the x/y extent covered by the convex hull of the differing pixels.
    - Pixels that are NaN in only one of both arrays are reported separately.

    If the data has no "x" and "y" dimensions, a plain ``xarray.testing.assert_allclose`` is done instead.

    :param actual: actual data, as DataArray or path to a file to load it from
    :param expected: expected/reference data, as DataArray or path to a file to load it from
    :param rtol: relative tolerance
    :param atol: absolute tolerance
    :return: list of issues (empty if no issues)
    """
    # TODO: make this a public function?
    # TODO: option for nodata fill value?
    # TODO: option to include data type check?
    # TODO: option to cast to some data type (or even rescale) before comparison?
    # TODO: also compare attributes of the DataArray?
    import numpy as np

    actual = _as_xarray_dataarray(actual)
    expected = _as_xarray_dataarray(expected)
    issues = []

    if actual.dims != expected.dims:
        issues.append(f"Dimension mismatch: {actual.dims} != {expected.dims}")
    for dim in sorted(set(expected.dims).intersection(actual.dims)):
        acs = actual.coords[dim].values
        ecs = expected.coords[dim].values
        if not (acs.shape == ecs.shape and (acs == ecs).all()):
            issues.append(f"Coordinates mismatch for dimension {dim!r}: {acs} != {ecs}")
    if actual.shape != expected.shape:
        issues.append(f"Shape mismatch: {actual.shape} != {expected.shape}")
    if issues:
        # Structural mismatch: comparing values pixel by pixel would not be meaningful.
        return issues

    if not {"x", "y"} <= set(expected.dims):
        # No spatial plane to build statistics on: fall back to the generic comparison.
        try:
            xarray.testing.assert_allclose(a=actual, b=expected, rtol=rtol, atol=atol)
        except AssertionError as e:
            issues.append(str(e).strip())
        return issues

    # Work on floats: subtraction of unsigned integers would wrap around instead of going negative.
    expected_values = expected.astype("float64")
    actual_values = actual.astype("float64")
    threshold = abs(expected_values * rtol) + atol
    diff_exact = abs(expected_values - actual_values)
    # Keep only the differences exceeding the tolerance (everything else becomes NaN).
    diff_lenient = diff_exact.where(diff_exact > threshold)
    # NaN on just one side gives a NaN difference, which never exceeds the threshold: track separately.
    nan_mismatch = expected_values.isnull() != actual_values.isnull()

    # Preserve the dimension order of the data so the report is deterministic.
    slice_dims = [d for d in expected.dims if d not in ("x", "y")]
    slice_labels = {d: expected[d].values for d in slice_dims}
    slice_shape = tuple(expected.sizes[d] for d in slice_dims)

    for position in np.ndindex(*slice_shape):
        # Positional selection, so duplicate or NaN coordinate values can not cause trouble.
        indexers = dict(zip(slice_dims, position))
        key = ",".join(f"{d} {str(slice_labels[d][i])}" for d, i in indexers.items())

        diff_data = diff_lenient.isel(indexers)
        total_pixel_count = expected_values.isel(indexers).count().item()
        diff_pixel_count = diff_data.count().item()
        nan_pixel_count = int(nan_mismatch.isel(indexers).sum().item())

        if diff_pixel_count > 0:
            diff_pixel_percentage = round(diff_pixel_count * 100 / total_pixel_count, 1)
            diff_mean = round(diff_data.mean().item(), 1)
            diff_var = round(diff_data.var().item(), 1)
            issues.append(
                f"{key}: value difference min:{diff_data.min().data}, max: {diff_data.max().data}, mean: {diff_mean}, var: {diff_var}"
            )

            # Coordinates of the differing pixels, independent of the x/y order of the data.
            mask = diff_data.notnull().transpose("x", "y").values
            x_values = diff_data.coords["x"].values
            y_values = diff_data.coords["y"].values
            xx, yy = np.meshgrid(x_values, y_values, indexing="ij")
            hull_area = _convex_hull_area(np.column_stack((xx[mask], yy[mask])))

            x_extent = float(x_values[-1]) - float(x_values[0])
            y_extent = float(y_values[-1]) - float(y_values[0])
            total_area = abs(x_extent * y_extent)

            if hull_area is not None and total_area > 0:
                area_percentage = round(hull_area * 100 / total_area, 1)
                issues.append(
                    f"{key}: differing pixels: {diff_pixel_count}/{total_pixel_count} ({diff_pixel_percentage}%), spread over {area_percentage}% of the area"
                )
            else:
                issues.append(
                    f"{key}: differing pixels: {diff_pixel_count}/{total_pixel_count} ({diff_pixel_percentage}%)"
                )

        if nan_pixel_count > 0:
            issues.append(f"{key}: NaN mismatch: {nan_pixel_count} pixels are NaN in only one of actual/expected")

    return issues


def assert_xarray_dataarray_allclose_xy(
    actual: Union[xarray.DataArray, str, Path],
    expected: Union[xarray.DataArray, str, Path],
    *,
    rtol: float = _DEFAULT_RTOL,
    atol: float = _DEFAULT_ATOL,
):
    """
    Assert that two Xarray ``DataArray`` instances are equal (with tolerance),
    reporting differences per spatial ("x"/"y") slice.

    :param actual: actual data, provided as Xarray DataArray object or path to NetCDF/GeoTIFF file.
    :param expected: expected or reference data, provided as Xarray DataArray object or path to NetCDF/GeoTIFF file.
    :param rtol: relative tolerance
    :param atol: absolute tolerance
    :raises AssertionError: if not equal within the given tolerance

    .. versionadded:: 0.31.0

    .. warning::
        This function is experimental and subject to change.
    """
    issues = _compare_xarray_dataarray_xy(actual=actual, expected=expected, rtol=rtol, atol=atol)
    if issues:
        raise AssertionError("\n".join(issues))
def test_allclose_xy_success(self, tmp_path, actual_dir, expected_dir):
    """Band "b1" deviates by 1 everywhere, which is accepted with ``rtol=1``."""
    dims = ["t", "x", "y"]
    cube_shape = (3, 4, 5)
    coords = {
        "t": range(0, 3),
        "x": range(4, 8),
        "y": range(5, 10),
    }

    reference = xarray.Dataset(
        {
            "b1": xarray.Variable(dims=dims, data=2 * numpy.ones(cube_shape)),
            "b2": xarray.Variable(dims=dims, data=3 * numpy.ones(cube_shape)),
        },
        coords=coords,
    )
    reference.to_netcdf(expected_dir / "data.nc")

    result = xarray.Dataset(
        {
            "b1": xarray.Variable(dims=dims, data=1 * numpy.ones(cube_shape)),
            "b2": xarray.Variable(dims=dims, data=3 * numpy.ones(cube_shape)),
        },
        coords=coords,
    )
    result.to_netcdf(actual_dir / "data.nc")

    assert_job_results_allclose(actual=actual_dir, expected=expected_dir, tmp_path=tmp_path, rtol=1)


def test_allclose_minimal_xy_different(self, tmp_path, actual_dir, expected_dir):
    """With default tolerances, a band that differs in every pixel is reported for each "t" slice."""
    dims = ["t", "x", "y"]
    cube_shape = (3, 4, 5)
    coords = {
        "t": range(0, 3),
        "x": range(4, 8),
        "y": range(5, 10),
    }

    reference = xarray.Dataset(
        {
            "b1": xarray.Variable(dims=dims, data=2 * numpy.ones(cube_shape)),
            "b2": xarray.Variable(dims=dims, data=3 * numpy.ones(cube_shape)),
        },
        coords=coords,
    )
    reference.to_netcdf(expected_dir / "data.nc")

    result = xarray.Dataset(
        {
            "b1": xarray.Variable(dims=dims, data=1 * numpy.ones(cube_shape)),
            "b2": xarray.Variable(dims=dims, data=3 * numpy.ones(cube_shape)),
        },
        coords=coords,
    )
    result.to_netcdf(actual_dir / "data.nc")

    expected_message = (
        r"Issues for file 'data.nc'.*"
        r"Issues for variable 'b1'.*"
        r"t 0: value difference min:1.0, max: 1.0, mean: 1.0, var: 0.0.*"
        r"t 0: differing pixels: 20/20 \(100.0%\), spread over 100.0% of the area.*"
        r"t 1: value difference min:1.0, max: 1.0, mean: 1.0, var: 0.0.*"
        r"t 1: differing pixels: 20/20 \(100.0%\), spread over 100.0% of the area.*"
        r"t 2: value difference min:1.0, max: 1.0, mean: 1.0, var: 0.0.*"
        r"t 2: differing pixels: 20/20 \(100.0%\), spread over 100.0% of the area"
    )
    with raises_assertion_error_or_not(expected_message):
        assert_job_results_allclose(actual=actual_dir, expected=expected_dir, tmp_path=tmp_path)


def test_allclose_minimal_xy_different_small_area(self, tmp_path, actual_dir, expected_dir):
    """Only a 2x2 pixel patch of band "b2" differs (in the last "t" slice): just that slice is reported."""
    dims = ["t", "x", "y"]
    cube_shape = (3, 4, 5)
    coords = {
        "t": range(0, 3),
        "x": range(4, 8),
        "y": range(5, 10),
    }

    reference = xarray.Dataset(
        {
            "b1": xarray.Variable(dims=dims, data=2 * numpy.ones(cube_shape)),
            "b2": xarray.Variable(dims=dims, data=3 * numpy.ones(cube_shape)),
        },
        coords=coords,
    )
    reference.to_netcdf(expected_dir / "data.nc")

    # Blow up four neighbouring pixels of the last time slice, each by a different factor.
    b2_modified_data = 3 * numpy.ones(cube_shape)
    for (ix, iy), factor in {(2, 2): 15, (2, 3): 14, (3, 2): 13, (3, 3): 12}.items():
        b2_modified_data[2, ix, iy] *= factor

    result = xarray.Dataset(
        {
            "b1": xarray.Variable(dims=dims, data=2 * numpy.ones(cube_shape)),
            "b2": xarray.Variable(dims=dims, data=b2_modified_data),
        },
        coords=coords,
    )
    result.to_netcdf(actual_dir / "data.nc")

    expected_message = (
        r"Issues for file 'data.nc'.*"
        r"Issues for variable 'b2'.*"
        r"t 2: value difference min:33.0, max: 42.0, mean: 37.5, var: 11.2.*"
        r"t 2: differing pixels: 4/20 \(20.0%\), spread over 8.3% of the area"
    )
    with raises_assertion_error_or_not(expected_message):
        assert_job_results_allclose(actual=actual_dir, expected=expected_dir, tmp_path=tmp_path)
b/setup.py index d2c22b39f..bd48199b6 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,6 @@ with open("openeo/_version.py") as fp: exec(fp.read(), _version) - with open("README.md", "r") as fh: long_description = fh.read() @@ -22,7 +21,8 @@ "mock", "requests-mock>=1.8.0", "httpretty>=1.1.4", - "urllib3<2.3.0", # httpretty doesn't work properly with urllib3>=2.3.0. See #700 and https://github.com/gabrielfalcao/HTTPretty/issues/484 + "urllib3<2.3.0", + # httpretty doesn't work properly with urllib3>=2.3.0. See #700 and https://github.com/gabrielfalcao/HTTPretty/issues/484 "netCDF4>=1.7.0", "matplotlib", # TODO: eliminate matplotlib as test dependency # TODO #717 Simplify geopandas constraints when Python 3.8 support is dropped @@ -35,6 +35,7 @@ "pyarrow>=10.0.1", # For Parquet read/write support in pandas "python-dateutil>=2.7.0", "pystac-client>=0.7.5", + "scipy", # for Convex Hull algorithm ] docs_require = [ @@ -56,7 +57,6 @@ "ipython", ] - name = "openeo" setup( name=name,