Skip to content

Issue #761 better diff for apex reference check #765

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
137 changes: 131 additions & 6 deletions openeo/testing/results.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@
from pathlib import Path
from typing import List, Optional, Union

import numpy as np
import xarray
import xarray.testing
from scipy.spatial import ConvexHull

from openeo.rest.job import DEFAULT_JOB_RESULTS_FILENAME, BatchJob, JobResults
from openeo.util import repr_truncate
Expand Down Expand Up @@ -88,6 +90,97 @@ def _as_xarray_dataarray(data: Union[str, Path, xarray.DataArray]) -> xarray.Dat
return data


def _mismatch_area_percentage(diff_data: xarray.DataArray) -> Optional[float]:
    """
    Estimate which share of the x/y extent is covered by the mismatching pixels.

    The coordinates of all non-null pixels of ``diff_data`` are wrapped in a convex hull,
    and the hull area is compared to the bounding rectangle spanned by the "x" and "y" coordinates.

    :param diff_data: single slice with exactly the dimensions "x" and "y" (in any order),
        holding the differences that exceed the tolerance and NaN everywhere else.
    :return: percentage rounded to one decimal, or ``None`` when no meaningful hull can be built
        (dimensions are not exactly "x" and "y", fewer than 3 points, all points on one line,
        or a zero-area extent).
    """
    if set(diff_data.dims) != {"x", "y"}:
        return None

    x_values = diff_data.coords["x"].values
    y_values = diff_data.coords["y"].values
    # Force the axis order to (x, y) so the mask lines up with the "ij"-indexed grids below,
    # regardless of whether the data itself is laid out as (x, y) or (y, x).
    mask = diff_data.notnull().transpose("x", "y").values
    x_grid, y_grid = np.meshgrid(x_values, y_values, indexing="ij")
    points = np.column_stack((y_grid[mask], x_grid[mask])).astype(float)

    # A 2D convex hull needs at least 3 points that are not all on a single line;
    # Qhull raises an error on such degenerate input, so filter it out up front.
    if len(points) < 3 or np.linalg.matrix_rank(points - points[0]) < 2:
        return None

    # For 2D input, `ConvexHull.volume` is the enclosed area (`area` would be the perimeter).
    hull_area = ConvexHull(points).volume
    total_area = abs(float(x_values.max() - x_values.min()) * float(y_values.max() - y_values.min()))
    if total_area == 0:
        return None
    return round(hull_area * 100 / total_area, 1)


def _compare_xarray_dataarray_xy(
    actual: Union[xarray.DataArray, str, Path],
    expected: Union[xarray.DataArray, str, Path],
    *,
    rtol: float = _DEFAULT_RTOL,
    atol: float = _DEFAULT_ATOL,
) -> List[str]:
    """
    Compare two xarray DataArrays with tolerance and report mismatch issues (as strings),
    with per-slice statistics along the "x" and "y" dimensions.

    Checks that are done:

    - Dimension names, coordinates and shape must match.
      If any of these differ, only these structural issues are reported.
    - For each combination of the non-x/y dimension values (e.g. each timestamp or band),
      pixels where ``abs(expected - actual) > abs(expected * rtol) + atol`` are collected and reported:
      min/max/mean/variance of the differences, number of differing pixels,
      and (when a convex hull can be built) the share of the x/y extent these pixels are spread over.
    - Pixels that are NaN in only one of both arrays are reported as nodata mismatch
      (a plain difference check can not see these, as comparisons against NaN are always false).

    :param actual: actual data, as DataArray or anything accepted by ``_as_xarray_dataarray``
    :param expected: expected/reference data, as DataArray or anything accepted by ``_as_xarray_dataarray``
    :param rtol: relative tolerance
    :param atol: absolute tolerance
    :return: list of issues (empty if no issues)
    """
    # TODO: make this a public function?
    # TODO: option for nodata fill value?
    # TODO: option to include data type check?
    # TODO: option to cast to some data type (or even rescale) before comparison?
    # TODO: also compare attributes of the DataArray?
    actual = _as_xarray_dataarray(actual)
    expected = _as_xarray_dataarray(expected)
    issues = []

    if actual.dims != expected.dims:
        issues.append(f"Dimension mismatch: {actual.dims} != {expected.dims}")
    for dim in sorted(set(expected.dims).intersection(actual.dims)):
        acs = actual.coords[dim].values
        ecs = expected.coords[dim].values
        if not (acs.shape == ecs.shape and (acs == ecs).all()):
            issues.append(f"Coordinates mismatch for dimension {dim!r}: {acs} != {ecs}")
    if actual.shape != expected.shape:
        issues.append(f"Shape mismatch: {actual.shape} != {expected.shape}")

    if issues:
        # Element-wise comparison is meaningless when the structure differs.
        return issues

    threshold = abs(expected * rtol) + atol
    diff_exact = abs(expected - actual)
    # Keep only the differences exceeding the tolerance; everything else becomes NaN.
    diff_lenient = diff_exact.where(diff_exact > threshold)
    nodata_mismatch = expected.isnull() != actual.isnull()

    # Follow the dimension order of the data (instead of a set) for deterministic reporting order.
    slice_dims = [d for d in expected.dims if d not in ("x", "y")]
    slice_labels = {d: expected.coords[d].values for d in slice_dims}
    slice_shape = tuple(expected.sizes[d] for d in slice_dims)

    for position in np.ndindex(*slice_shape):
        # Select positionally (robust against duplicate labels), but report by label.
        indexers = dict(zip(slice_dims, position))
        key = ",".join(f"{d} {str(slice_labels[d][i])}" for d, i in indexers.items())

        diff_data = diff_lenient.isel(indexers)
        total_pixel_count = expected.isel(indexers).count().item()
        diff_pixel_count = diff_data.count().item()

        if diff_pixel_count > 0:
            diff_pixel_percentage = round(diff_pixel_count * 100 / total_pixel_count, 1)
            diff_mean = round(diff_data.mean().item(), 1)
            diff_var = round(diff_data.var().item(), 1)
            issues.append(
                f"{key}: value difference min:{diff_data.min().data}, max: {diff_data.max().data}, mean: {diff_mean}, var: {diff_var}"
            )

            area_percentage = _mismatch_area_percentage(diff_data)
            if area_percentage is not None:
                issues.append(
                    f"{key}: differing pixels: {diff_pixel_count}/{total_pixel_count} ({diff_pixel_percentage}%), spread over {area_percentage}% of the area"
                )
            else:
                issues.append(
                    f"{key}: differing pixels: {diff_pixel_count}/{total_pixel_count} ({diff_pixel_percentage}%)"
                )

        nodata_pixel_count = int(nodata_mismatch.isel(indexers).sum().item())
        if nodata_pixel_count > 0:
            issues.append(f"{key}: nodata mismatch: {nodata_pixel_count} pixels are NaN in only one of actual/expected")

    return issues


def _compare_xarray_dataarray(
actual: Union[xarray.DataArray, str, Path],
expected: Union[xarray.DataArray, str, Path],
Expand Down Expand Up @@ -128,11 +221,15 @@ def _compare_xarray_dataarray(
if actual.shape != expected.shape:
issues.append(f"Shape mismatch: {actual.shape} != {expected.shape}")

try:
xarray.testing.assert_allclose(a=actual, b=expected, rtol=rtol, atol=atol)
except AssertionError as e:
# TODO: message of `assert_allclose` is typically multiline, split it again or make it one line?
issues.append(str(e).strip())
if not issues:
if {"x", "y"} <= set(expected.dims):
issues = _compare_xarray_dataarray_xy(actual=actual, expected=expected, rtol=rtol, atol=atol)
else:
try:
xarray.testing.assert_allclose(a=actual, b=expected, rtol=rtol, atol=atol)
except AssertionError as e:
# TODO: message of `assert_allclose` is typically multiline, split it again or make it one line?
issues.append(str(e).strip())

return issues

Expand Down Expand Up @@ -163,6 +260,31 @@ def assert_xarray_dataarray_allclose(
raise AssertionError("\n".join(issues))


def assert_xarray_dataarray_allclose_xy(
    actual: Union[xarray.DataArray, str, Path],
    expected: Union[xarray.DataArray, str, Path],
    *,
    rtol: float = _DEFAULT_RTOL,
    atol: float = _DEFAULT_ATOL,
):
    """
    Assert that two Xarray ``DataArray`` instances are equal (with tolerance),
    using the x/y oriented comparison of ``_compare_xarray_dataarray_xy``.

    All issues found by that comparison are joined (one per line)
    into the message of the raised ``AssertionError``.

    :param actual: actual data, provided as Xarray DataArray object or path to NetCDF/GeoTIFF file.
    :param expected: expected or reference data, provided as Xarray DataArray object or path to NetCDF/GeoTIFF file.
    :param rtol: relative tolerance
    :param atol: absolute tolerance
    :raises AssertionError: if not equal within the given tolerance

    .. versionadded:: 0.31.0

    .. warning::
        This function is experimental and subject to change.
    """
    problems = _compare_xarray_dataarray_xy(actual=actual, expected=expected, rtol=rtol, atol=atol)
    if not problems:
        return
    raise AssertionError("\n".join(problems))

def _compare_xarray_datasets(
actual: Union[xarray.Dataset, str, Path],
expected: Union[xarray.Dataset, str, Path],
Expand Down Expand Up @@ -250,7 +372,10 @@ def assert_xarray_allclose(
# Dispatch on the (matching) container type of both inputs.
if isinstance(actual, xarray.Dataset) and isinstance(expected, xarray.Dataset):
    assert_xarray_dataset_allclose(actual, expected, rtol=rtol, atol=atol)
elif isinstance(actual, xarray.DataArray) and isinstance(expected, xarray.DataArray):
    # Arrays with "x", "y" and "band" dimensions get the x/y oriented comparison.
    # Note: a Python list has no `elements_in` method (it raises AttributeError),
    # so the check is a set-subset test on the dimension names.
    if {"x", "y", "band"}.issubset(expected.dims):
        assert_xarray_dataarray_allclose_xy(actual, expected, rtol=rtol, atol=atol)
    else:
        assert_xarray_dataarray_allclose(actual, expected, rtol=rtol, atol=atol)
else:
    raise ValueError(f"Unsupported types: {type(actual)} and {type(expected)}")

Expand Down
6 changes: 3 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
with open("openeo/_version.py") as fp:
exec(fp.read(), _version)


with open("README.md", "r") as fh:
long_description = fh.read()

Expand All @@ -22,7 +21,8 @@
"mock",
"requests-mock>=1.8.0",
"httpretty>=1.1.4",
"urllib3<2.3.0", # httpretty doesn't work properly with urllib3>=2.3.0. See #700 and https://github.com/gabrielfalcao/HTTPretty/issues/484
"urllib3<2.3.0",
# httpretty doesn't work properly with urllib3>=2.3.0. See #700 and https://github.com/gabrielfalcao/HTTPretty/issues/484
"netCDF4>=1.7.0",
"matplotlib", # TODO: eliminate matplotlib as test dependency
# TODO #717 Simplify geopandas constraints when Python 3.8 support is dropped
Expand All @@ -35,6 +35,7 @@
"pyarrow>=10.0.1", # For Parquet read/write support in pandas
"python-dateutil>=2.7.0",
"pystac-client>=0.7.5",
"scipy", # for Convex Hull algorithm
]

docs_require = [
Expand All @@ -56,7 +57,6 @@
"ipython",
]


name = "openeo"
setup(
name=name,
Expand Down
118 changes: 102 additions & 16 deletions tests/testing/test_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
from openeo.testing.results import (
_compare_xarray_dataarray,
assert_job_results_allclose,
assert_xarray_allclose,
assert_xarray_dataarray_allclose,
assert_xarray_dataset_allclose,
)
Expand All @@ -36,7 +35,6 @@ def test_simple_defaults(self):
[
"Coordinates mismatch for dimension 'dim_0': [0 1 2 3] != [0 1 2]",
"Shape mismatch: (4,) != (3,)",
dirty_equals.IsStr(regex="Left and right DataArray objects are not close.*", regex_flags=re.DOTALL),
],
),
(
Expand All @@ -45,15 +43,13 @@ def test_simple_defaults(self):
"Dimension mismatch: ('dim_0', 'dim_1') != ('dim_0',)",
"Coordinates mismatch for dimension 'dim_0': [0 1] != [0 1 2]",
"Shape mismatch: (2, 3) != (3,)",
dirty_equals.IsStr(regex="Left and right DataArray objects are not close.*", regex_flags=re.DOTALL),
],
),
(
xarray.DataArray([[1], [2], [3]]),
[
"Dimension mismatch: ('dim_0', 'dim_1') != ('dim_0',)",
"Shape mismatch: (3, 1) != (3,)",
dirty_equals.IsStr(regex="Left and right DataArray objects are not close.*", regex_flags=re.DOTALL),
],
),
],
Expand All @@ -75,20 +71,12 @@ def test_simple_shape_mismatch(self, actual, expected_issues):
"Dimension mismatch: ('y', 'x') != ('x', 'y')",
"Coordinates mismatch for dimension 'x': [0 1 2] != [0 1]",
"Coordinates mismatch for dimension 'y': [0 1] != [0 1 2]",
dirty_equals.IsStr(
regex=r"Left and right DataArray objects are not close.*Differing dimensions:.*\(y: 2, x: 3\) != \(x: 2, y: 3\)",
regex_flags=re.DOTALL,
),
],
),
(
xarray.DataArray([[1, 2, 3], [4, 5, 6]], dims=["x", "z"]),
[
"Dimension mismatch: ('x', 'z') != ('x', 'y')",
dirty_equals.IsStr(
regex=r"Left and right DataArray objects are not close.*Differing dimensions:.*\(x: 2, z: 3\) != \(x: 2, y: 3\)",
regex_flags=re.DOTALL,
),
],
),
],
Expand All @@ -108,10 +96,6 @@ def test_simple_dims_mismatch(self, actual, expected_issues):
xarray.DataArray([[1, 2, 3], [4, 5, 6]], coords=[("x", [111, 222]), ("y", [33, 44, 55])]),
[
"Coordinates mismatch for dimension 'x': [111 222] != [11 22]",
dirty_equals.IsStr(
regex=r"Left and right DataArray objects are not close.*Differing coordinates:.*L \* x\s+\(x\).*?111 222.*R \* x\s+\(x\).*?11 22",
regex_flags=re.DOTALL,
),
],
),
],
Expand Down Expand Up @@ -351,6 +335,108 @@ def test_allclose_minimal_success(self, tmp_path, actual_dir, expected_dir):
ds.to_netcdf(actual_dir / "data.nc")
assert_job_results_allclose(actual=actual_dir, expected=expected_dir, tmp_path=tmp_path)

def test_allclose_xy_success(self, tmp_path, actual_dir, expected_dir):
    """
    Variable "b1" differs between actual (all 1) and expected (all 2),
    but with ``rtol=1`` the job results comparison is expected to raise nothing.
    """
    dims = ["t", "x", "y"]
    coords = {
        "t": range(0, 3),
        "x": range(4, 8),
        "y": range(5, 10),
    }

    reference = xarray.Dataset(
        {
            "b1": xarray.Variable(dims=dims, data=2 * numpy.ones((3, 4, 5))),
            "b2": xarray.Variable(dims=dims, data=3 * numpy.ones((3, 4, 5))),
        },
        coords=coords,
    )
    reference.to_netcdf(expected_dir / "data.nc")

    result = xarray.Dataset(
        {
            "b1": xarray.Variable(dims=dims, data=1 * numpy.ones((3, 4, 5))),
            "b2": xarray.Variable(dims=dims, data=3 * numpy.ones((3, 4, 5))),
        },
        coords=coords,
    )
    result.to_netcdf(actual_dir / "data.nc")

    assert_job_results_allclose(actual=actual_dir, expected=expected_dir, tmp_path=tmp_path, rtol=1)

def test_allclose_minimal_xy_different(self, tmp_path, actual_dir, expected_dir):
    """
    Variable "b1" differs everywhere (actual 1 vs expected 2) with default tolerances:
    each "t" slice should be reported with difference statistics and a 100% pixel/area mismatch.
    """
    dims = ["t", "x", "y"]
    coords = {
        "t": range(0, 3),
        "x": range(4, 8),
        "y": range(5, 10),
    }

    reference = xarray.Dataset(
        {
            "b1": xarray.Variable(dims=dims, data=2 * numpy.ones((3, 4, 5))),
            "b2": xarray.Variable(dims=dims, data=3 * numpy.ones((3, 4, 5))),
        },
        coords=coords,
    )
    reference.to_netcdf(expected_dir / "data.nc")

    result = xarray.Dataset(
        {
            "b1": xarray.Variable(dims=dims, data=1 * numpy.ones((3, 4, 5))),
            "b2": xarray.Variable(dims=dims, data=3 * numpy.ones((3, 4, 5))),
        },
        coords=coords,
    )
    result.to_netcdf(actual_dir / "data.nc")

    expected_message = (
        r"Issues for file 'data.nc'.*"
        r"Issues for variable 'b1'.*"
        r"t 0: value difference min:1.0, max: 1.0, mean: 1.0, var: 0.0.*"
        r"t 0: differing pixels: 20/20 \(100.0%\), spread over 100.0% of the area.*"
        r"t 1: value difference min:1.0, max: 1.0, mean: 1.0, var: 0.0.*"
        r"t 1: differing pixels: 20/20 \(100.0%\), spread over 100.0% of the area.*"
        r"t 2: value difference min:1.0, max: 1.0, mean: 1.0, var: 0.0.*"
        r"t 2: differing pixels: 20/20 \(100.0%\), spread over 100.0% of the area"
    )
    with raises_assertion_error_or_not(expected_message):
        assert_job_results_allclose(actual=actual_dir, expected=expected_dir, tmp_path=tmp_path)

def test_allclose_minimal_xy_different_small_area(self, tmp_path, actual_dir, expected_dir):
    """
    Only a 2x2 pixel patch of variable "b2" in the last "t" slice is modified:
    the report should mention 4/20 differing pixels, spread over 8.3% of the area.
    """
    dims = ["t", "x", "y"]
    coords = {
        "t": range(0, 3),
        "x": range(4, 8),
        "y": range(5, 10),
    }

    reference = xarray.Dataset(
        {
            "b1": xarray.Variable(dims=dims, data=2 * numpy.ones((3, 4, 5))),
            "b2": xarray.Variable(dims=dims, data=3 * numpy.ones((3, 4, 5))),
        },
        coords=coords,
    )
    reference.to_netcdf(expected_dir / "data.nc")

    # Scale four neighbouring pixels of the last time slice, each with its own factor.
    patched_b2 = 3 * numpy.ones((3, 4, 5))
    patched_b2[2][2][2] *= 15
    patched_b2[2][2][3] *= 14
    patched_b2[2][3][2] *= 13
    patched_b2[2][3][3] *= 12

    result = xarray.Dataset(
        {
            "b1": xarray.Variable(dims=dims, data=2 * numpy.ones((3, 4, 5))),
            "b2": xarray.Variable(dims=dims, data=patched_b2),
        },
        coords=coords,
    )
    result.to_netcdf(actual_dir / "data.nc")

    expected_message = (
        r"Issues for file 'data.nc'.*"
        r"Issues for variable 'b2'.*"
        r"t 2: value difference min:33.0, max: 42.0, mean: 37.5, var: 11.2.*"
        r"t 2: differing pixels: 4/20 \(20.0%\), spread over 8.3% of the area"
    )
    with raises_assertion_error_or_not(expected_message):
        assert_job_results_allclose(actual=actual_dir, expected=expected_dir, tmp_path=tmp_path)

def test_allclose_basic_fail(self, tmp_path, actual_dir, expected_dir):
expected_ds = xarray.Dataset({"a": (["time"], [1, 2, 3])}, coords={"time": [11, 22, 33]})
expected_ds.to_netcdf(expected_dir / "data.nc")
Expand Down