Skip to content

Issue #761 better diff for apex reference check #765

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 103 additions & 7 deletions openeo/testing/results.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,22 @@
from pathlib import Path
from typing import List, Optional, Union

import numpy as np
import xarray
import xarray.testing
from xarray import DataArray

from openeo.rest.job import DEFAULT_JOB_RESULTS_FILENAME, BatchJob, JobResults
from openeo.util import repr_truncate

_log = logging.getLogger(__name__)


# Default relative/absolute tolerances, used as parameter defaults by the comparison helpers below.
_DEFAULT_RTOL = 1e-6
_DEFAULT_ATOL = 1e-6

# Character ramps for ASCII-art rendering, ordered from "blank" (index 0) to "dense" (last index).
# https://paulbourke.net/dataformats/asciiart
# The 70-character ramp is written dense-to-blank and reversed with `[::-1]`.
# The backslash is escaped explicitly (`\\|`): a bare `\|` is an invalid escape sequence
# that only happens to yield the same string and triggers a warning on recent Python versions.
DEFAULT_GRAYSCALE_70_CHARACTERS = "$@B%8&WM#*oahkbdpqwmZO0QLCJUYXzcvunxrjft/\\|()1{}[]?-_+~<>i!lI;:,\"^`'. "[::-1]
DEFAULT_GRAYSCALE_10_CHARACTERS = " .:-=+*#%@"

def _load_xarray_netcdf(path: Union[str, Path], **kwargs) -> xarray.Dataset:
"""
Expand Down Expand Up @@ -88,12 +92,105 @@ def _as_xarray_dataarray(data: Union[str, Path, xarray.DataArray]) -> xarray.Dat
return data


def _ascii_art(
    diff_data: DataArray,
    *,
    max_width: int = 60,
    y_vs_x_aspect_ratio=2.5,
    grayscale_characters: str = DEFAULT_GRAYSCALE_70_CHARACTERS,
) -> str:
    """
    Render a data array with "x" and "y" dimensions as a framed ASCII-art image.

    The data is down-sampled (block-wise mean) along "x" and "y", normalized
    against the maximum value of the input and mapped on `grayscale_characters`.

    :param diff_data: data to render; must have "x" and "y" dimensions.
    :param max_width: maximum number of character columns of the image (excluding the frame).
    :param y_vs_x_aspect_ratio: height/width ratio of a character cell,
        used to shrink the number of rows relative to the number of columns.
    :param grayscale_characters: character ramp, from lowest (index 0) to highest value.
    :return: multi-line string: the rendered rows (top row is the first "y" index),
        surrounded by a box-drawing frame.
    """
    max_grayscale_idx = len(grayscale_characters) - 1
    # Round the down-sampling factors *up*: flooring them would allow the rendered
    # image to become almost twice as wide/high as requested
    # (e.g. 100 columns with `max_width=60` would not be down-sampled at all).
    x_scale: int = max(1, int(np.ceil(diff_data.sizes["x"] / max_width)))
    y_scale: int = max(1, int(np.ceil(diff_data.sizes["y"] / (max_width / y_vs_x_aspect_ratio))))
    data_max = diff_data.max().item()
    if data_max == 0:
        # Avoid division by zero when normalizing an all-zero array.
        data_max = 1
    coarsened = diff_data.coarsen(dim={"x": x_scale, "y": y_scale}, boundary="pad").mean()
    # One text line per "y" index, one character per "x" index.
    coarsened = coarsened.transpose("y", "x", ...)
    horizontal_edge = "─" * coarsened.sizes["x"]
    top = "┌" + horizontal_edge + "┐\n"
    bottom = "\n└" + horizontal_edge + "┘"

    def _pixel_char(v) -> str:
        """Map a single (possibly NaN) value on a character of the ramp."""
        if np.isnan(v):
            # No data (e.g. no difference above the threshold): lowest character of the ramp.
            return grayscale_characters[0]
        i = int(v * max_grayscale_idx / data_max)
        if v > 0 and i == 0:
            i = 1  # don't show a blank for a difference above the threshold
        # Clamp in all cases, so a single-character ramp does not raise an IndexError.
        return grayscale_characters[min(max_grayscale_idx, i)]

    # Iterate over plain numpy values instead of 0-dimensional DataArray objects.
    rows = ["│" + "".join(_pixel_char(v) for v in row) + "│" for row in coarsened.values]
    return top + "\n".join(rows) + bottom


def _compare_xarray_dataarray_xy(
    actual: Union[xarray.DataArray, str, Path],
    expected: Union[xarray.DataArray, str, Path],
    *,
    rtol: float = _DEFAULT_RTOL,
    atol: float = _DEFAULT_ATOL,
    name: Optional[str] = None,
) -> List[str]:
    """
    Additional compare for two compatible spatial xarray DataArrays with tolerance (rtol, atol).

    Difference with `_compare_xarray_dataarray`: that function reports
    dimension/coordinate/shape mismatches and the generic message of
    `xarray.testing.assert_allclose`. This helper assumes the arrays are already
    compatible and have "x" and "y" dimensions, and reports, for each combination
    of the remaining dimensions (e.g. per timestamp), *where* and *how much* the
    values differ: min/max/mean/variance of the out-of-tolerance differences,
    the number of differing pixels and their bounding box.
    An ASCII-art rendering of each differing slice is logged as a warning.

    :param actual: actual data (used as-is in arithmetic, no file loading is done here)
    :param expected: expected/reference data, same remark as for `actual`
    :param rtol: relative tolerance
    :param atol: absolute tolerance
    :param name: optional label (e.g. variable name) to prefix the reported issues with
    :return: list of issues (empty if no issues)
    """
    # NOTE(review): reviewers asked (1) to document the difference with the existing
    # `_compare_xarray_dataarray` (see docstring above) and (2) whether this feature could be
    # integrated in `_compare_xarray_dataarray` instead of duplicating most of it.
    issues = []
    threshold = abs(expected * rtol) + atol
    diff_exact = abs(expected - actual)
    diff_mask = diff_exact > threshold
    # Only keep the differences above the tolerance threshold (NaN elsewhere).
    diff_lenient = diff_exact.where(diff_mask)

    # Keep the dimension order of `expected` (going through a `set` would make
    # the order of the reported labels nondeterministic).
    non_x_y_dims = [d for d in expected.dims if d not in ("x", "y")]
    value_mapping = {d: expected[d].data for d in non_x_y_dims}
    shape = tuple(len(value_mapping[d]) for d in non_x_y_dims)

    # Walk over all index combinations of the non-spatial dimensions
    # (a single, empty combination when there are only "x" and "y").
    for shape_index in np.ndindex(*shape):
        indexers = {d: value_mapping[d][i] for d, i in zip(non_x_y_dims, shape_index)}
        diff_data = diff_lenient.sel(indexers=indexers)
        total_pixel_count = expected.sel(indexers).count().item()
        diff_pixel_count = diff_data.count().item()

        if diff_pixel_count > 0:
            diff_pixel_percentage = round(diff_pixel_count * 100 / total_pixel_count, 1)
            diff_mean = round(diff_data.mean().item(), 2)
            diff_var = round(diff_data.var().item(), 2)

            key = name + ": " if name else ""
            key += ",".join([f"{k} {str(v1)}" for k, v1 in indexers.items()])
            issues.append(
                f"{key}: value difference exceeds tolerance (rtol {rtol}, atol {atol}), min:{diff_data.min().data}, max: {diff_data.max().data}, mean: {diff_mean}, var: {diff_var}"
            )

            _log.warning(f"Difference (ascii art) for {key}:\n{_ascii_art(diff_data)}")

            # `np.meshgrid(x, y)` produces arrays of shape (len(y), len(x)),
            # so the mask has to be laid out as ("y", "x") too.
            x_grid, y_grid = np.meshgrid(diff_data.coords["x"], diff_data.coords["y"])
            mask = diff_data.notnull().transpose("y", "x").values
            x_coords = x_grid[mask]
            y_coords = y_grid[mask]
            x_min, x_max = x_coords.min(), x_coords.max()
            y_min, y_max = y_coords.min(), y_coords.max()

            diff_bbox = ((x_min.item(), y_min.item()), (x_max.item(), y_max.item()))
            diff_area = (x_max - x_min) * (y_max - y_min)
            total_area = abs(
                (diff_data.coords["y"][-1].data - diff_data.coords["y"][0].data)
                * (diff_data.coords["x"][-1].data - diff_data.coords["x"][0].data)
            )
            # A raster of a single row or column has no coordinate extent:
            # report NaN instead of dividing by zero.
            area_percentage = round(diff_area * 100 / total_area, 1) if total_area else float("nan")
            issues.append(
                f"{key}: differing pixels: {diff_pixel_count}/{total_pixel_count} ({diff_pixel_percentage}%), bbox {diff_bbox} - {area_percentage}% of the area"
            )
    return issues


def _compare_xarray_dataarray(
actual: Union[xarray.DataArray, str, Path],
expected: Union[xarray.DataArray, str, Path],
*,
rtol: float = _DEFAULT_RTOL,
atol: float = _DEFAULT_ATOL,
name: str = None,
) -> List[str]:
"""
Compare two xarray DataArrays with tolerance and report mismatch issues (as strings)
Expand All @@ -116,7 +213,7 @@ def _compare_xarray_dataarray(
issues = []

# `xarray.testing.assert_allclose` currently does not always
# provides detailed information about shape/dimension mismatches
# provide detailed information about shape/dimension mismatches
# so we enrich the issue listing with some more details
if actual.dims != expected.dims:
issues.append(f"Dimension mismatch: {actual.dims} != {expected.dims}")
Expand All @@ -127,13 +224,14 @@ def _compare_xarray_dataarray(
issues.append(f"Coordinates mismatch for dimension {dim!r}: {acs} != {ecs}")
if actual.shape != expected.shape:
issues.append(f"Shape mismatch: {actual.shape} != {expected.shape}")

compatible = len(issues) == 0
try:
xarray.testing.assert_allclose(a=actual, b=expected, rtol=rtol, atol=atol)
except AssertionError as e:
# TODO: message of `assert_allclose` is typically multiline, split it again or make it one line?
issues.append(str(e).strip())

if compatible and {"x", "y"} <= set(expected.dims):
issues.extend(_compare_xarray_dataarray_xy(actual=actual, expected=expected, rtol=rtol, atol=atol, name=name))
return issues


Expand Down Expand Up @@ -162,7 +260,6 @@ def assert_xarray_dataarray_allclose(
if issues:
raise AssertionError("\n".join(issues))


def _compare_xarray_datasets(
actual: Union[xarray.Dataset, str, Path],
expected: Union[xarray.Dataset, str, Path],
Expand All @@ -180,15 +277,14 @@ def _compare_xarray_datasets(
expected = _as_xarray_dataset(expected)

all_issues = []
# TODO: just leverage DataSet support in xarray.testing.assert_allclose for all this?
actual_vars = set(actual.data_vars)
expected_vars = set(expected.data_vars)
_log.debug(f"_compare_xarray_datasets: actual_vars={actual_vars!r} expected_vars={expected_vars!r}")
if actual_vars != expected_vars:
all_issues.append(f"Xarray DataSet variables mismatch: {actual_vars} != {expected_vars}")
for var in expected_vars.intersection(actual_vars):
_log.debug(f"_compare_xarray_datasets: comparing variable {var!r}")
issues = _compare_xarray_dataarray(actual[var], expected[var], rtol=rtol, atol=atol)
issues = _compare_xarray_dataarray(actual[var], expected[var], rtol=rtol, atol=atol, name=var)
if issues:
all_issues.append(f"Issues for variable {var!r}:")
all_issues.extend(issues)
Expand Down
2 changes: 0 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
with open("openeo/_version.py") as fp:
exec(fp.read(), _version)


with open("README.md", "r") as fh:
long_description = fh.read()

Expand Down Expand Up @@ -56,7 +55,6 @@
"ipython",
]


name = "openeo"
setup(
name=name,
Expand Down
105 changes: 104 additions & 1 deletion tests/testing/test_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
from openeo.testing.results import (
_compare_xarray_dataarray,
assert_job_results_allclose,
assert_xarray_allclose,
assert_xarray_dataarray_allclose,
assert_xarray_dataset_allclose,
)
Expand Down Expand Up @@ -351,6 +350,110 @@ def test_allclose_minimal_success(self, tmp_path, actual_dir, expected_dir):
ds.to_netcdf(actual_dir / "data.nc")
assert_job_results_allclose(actual=actual_dir, expected=expected_dir, tmp_path=tmp_path)

def test_allclose_xy_success(self, tmp_path, actual_dir, expected_dir):
    """Band "b1" differs by 1 everywhere, which is accepted with `rtol=1`."""
    cube_dims = ["t", "x", "y"]
    cube_shape = (3, 4, 5)
    cube_coords = {
        "t": range(0, 3),
        "x": range(4, 8),
        "y": range(5, 10),
    }

    def build_dataset(b1_value, b2_value) -> xarray.Dataset:
        # Constant-valued bands "b1" and "b2" on the shared (t, x, y) grid.
        return xarray.Dataset(
            {
                "b1": xarray.Variable(dims=cube_dims, data=b1_value * numpy.ones(cube_shape)),
                "b2": xarray.Variable(dims=cube_dims, data=b2_value * numpy.ones(cube_shape)),
            },
            coords=cube_coords,
        )

    build_dataset(2, 3).to_netcdf(expected_dir / "data.nc")
    build_dataset(1, 3).to_netcdf(actual_dir / "data.nc")
    assert_job_results_allclose(actual=actual_dir, expected=expected_dir, tmp_path=tmp_path, rtol=1)

def test_allclose_minimal_xy_different(self, tmp_path, actual_dir, expected_dir):
    """Band "b1" differs by 1 in every pixel: each "t" slice must be reported with stats and bbox."""
    cube_dims = ["t", "x", "y"]
    cube_shape = (3, 4, 5)
    cube_coords = {
        "t": range(0, 3),
        "x": range(4, 8),
        "y": range(5, 10),
    }

    def build_dataset(b1_value, b2_value) -> xarray.Dataset:
        # Constant-valued bands "b1" and "b2" on the shared (t, x, y) grid.
        return xarray.Dataset(
            {
                "b1": xarray.Variable(dims=cube_dims, data=b1_value * numpy.ones(cube_shape)),
                "b2": xarray.Variable(dims=cube_dims, data=b2_value * numpy.ones(cube_shape)),
            },
            coords=cube_coords,
        )

    build_dataset(2, 3).to_netcdf(expected_dir / "data.nc")
    build_dataset(1, 3).to_netcdf(actual_dir / "data.nc")

    expected_message = (
        r"Issues for file 'data.nc'.*"
        r"Issues for variable 'b1'.*"
        r"Left and right DataArray objects are not close.*Differing values:.*"
        r"t 0: value difference exceeds tolerance \(rtol 1e-06, atol 1e-06\), min:1.0, max: 1.0, mean: 1.0, var: 0.0.*"
        r"t 0: differing pixels: 20/20 \(100.0%\), bbox \(\(4, 5\), \(7, 9\)\) - 100.0% of the area.*"
        r"t 1: value difference exceeds tolerance \(rtol 1e-06, atol 1e-06\), min:1.0, max: 1.0, mean: 1.0, var: 0.0.*"
        r"t 1: differing pixels: 20/20 \(100.0%\), bbox \(\(4, 5\), \(7, 9\)\) - 100.0% of the area.*"
        r"t 2: value difference exceeds tolerance \(rtol 1e-06, atol 1e-06\), min:1.0, max: 1.0, mean: 1.0, var: 0.0.*"
        r"t 2: differing pixels: 20/20 \(100.0%\), bbox \(\(4, 5\), \(7, 9\)\) - 100.0% of the area"
    )
    with raises_assertion_error_or_not(expected_message):
        assert_job_results_allclose(actual=actual_dir, expected=expected_dir, tmp_path=tmp_path)

def test_allclose_minimal_xy_different_small_area(self, tmp_path, actual_dir, expected_dir):
    """Only a 2x2 pixel patch of band "b2" at the last "t" index differs: only that slice is reported."""
    cube_dims = ["t", "x", "y"]
    cube_shape = (3, 4, 5)
    cube_coords = {
        "t": range(0, 3),
        "x": range(4, 8),
        "y": range(5, 10),
    }

    def build_dataset(b1_data, b2_data) -> xarray.Dataset:
        # Bands "b1" and "b2" on the shared (t, x, y) grid.
        return xarray.Dataset(
            {
                "b1": xarray.Variable(dims=cube_dims, data=b1_data),
                "b2": xarray.Variable(dims=cube_dims, data=b2_data),
            },
            coords=cube_coords,
        )

    build_dataset(2 * numpy.ones(cube_shape), 3 * numpy.ones(cube_shape)).to_netcdf(expected_dir / "data.nc")

    # Scale a 2x2 patch (x index 2..3, y index 2..3) of the last time step.
    b2_modified_data = 3 * numpy.ones(cube_shape)
    patch_factors = {(2, 2): 15, (2, 3): 14, (3, 2): 13, (3, 3): 12}
    for (x_index, y_index), factor in patch_factors.items():
        b2_modified_data[2, x_index, y_index] *= factor

    build_dataset(2 * numpy.ones(cube_shape), b2_modified_data).to_netcdf(actual_dir / "data.nc")

    expected_message = (
        r"Issues for file 'data.nc'.*"
        r"Issues for variable 'b2'.*"
        r"Left and right DataArray objects are not close.*Differing values:.*"
        r"t 2: value difference exceeds tolerance \(rtol 1e-06, atol 1e-06\), min:33.0, max: 42.0, mean: 37.5, var: 11.2.*"
        r"t 2: differing pixels: 4/20 \(20.0%\), bbox \(\(6, 7\), \(7, 8\)\) - 8.3% of the area"
    )
    with raises_assertion_error_or_not(expected_message):
        assert_job_results_allclose(actual=actual_dir, expected=expected_dir, tmp_path=tmp_path)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you also add a simple test for ascii_art (e.g. a simple 10-by-5 use case or something like that)?
It's currently a public function, so people/projects might start depending on it.

def test_allclose_basic_fail(self, tmp_path, actual_dir, expected_dir):
expected_ds = xarray.Dataset({"a": (["time"], [1, 2, 3])}, coords={"time": [11, 22, 33]})
expected_ds.to_netcdf(expected_dir / "data.nc")
Expand Down