From 9282fc455c70d4c3f54ef566757b30bdf78a3c6b Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Wed, 7 May 2025 15:14:03 +0200 Subject: [PATCH 01/23] add IntervalIndex Co-authored-by: Deepak Cherian --- xarray/indexes/interval_index.py | 112 +++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 xarray/indexes/interval_index.py diff --git a/xarray/indexes/interval_index.py b/xarray/indexes/interval_index.py new file mode 100644 index 00000000000..fd171bb5c93 --- /dev/null +++ b/xarray/indexes/interval_index.py @@ -0,0 +1,112 @@ +from collections.abc import Hashable + +import numpy as np +import pandas as pd + +from xarray import Variable +from xarray.core.indexes import Index, PandasIndex + + +class IntervalIndex(Index): + def __init__(self, index: PandasIndex, bounds_name: Hashable, bounds_dim: str): + assert isinstance(index.index, pd.IntervalIndex) + self._index = index + self._bounds_name = bounds_name + self._bounds_dim = bounds_dim + + @classmethod + def from_variables(cls, variables, options): + assert len(variables) == 2 + + for k, v in variables.items(): + if v.ndim == 2: + bounds_name, bounds = k, v + elif v.ndim == 1: + dim, _ = k, v + + bounds = bounds.transpose(..., dim) + left, right = bounds.data.tolist() + index = PandasIndex(pd.IntervalIndex.from_arrays(left, right), dim) + bounds_dim = (set(bounds.dims) - set(dim)).pop() + + return cls(index, bounds_name, bounds_dim) + + @classmethod + def concat(cls, indexes, dim, positions=None): + new_index = PandasIndex.concat( + [idx._index for idx in indexes], dim, positions=positions + ) + + if indexes: + bounds_name = indexes[0]._bounds_name + bounds_dim = indexes[0]._bounds_dim + if any( + idx._bounds_name != bounds_name or idx._bounds_dim != bounds_dim + for idx in indexes + ): + raise ValueError( + f"Cannot concatenate along dimension {dim!r} indexes with different " + "boundary coordinate or dimension names" + ) + else: + bounds_name = new_index.index.name + "_bounds" + 
bounds_dim = "bnd" + + return cls(new_index, bounds_name, bounds_dim) + + def create_variables(self, variables=None): + empty_var = Variable((), 0) + bounds_attrs = variables.get(self._bounds_name, empty_var).attrs + mid_attrs = variables.get(self._index.dim, empty_var).attrs + + bounds_var = Variable( + dims=(self._bounds_dim, self._index.dim), + data=np.stack([self._index.index.left, self._index.index.right], axis=0), + attrs=bounds_attrs, + ) + mid_var = Variable( + dims=(self._index.dim,), + data=self._index.index.mid, + attrs=mid_attrs, + ) + + return {self._index.dim: mid_var, self._bounds_name: bounds_var} + + def should_add_coord_to_array(self, name, var, dims): + # add both the mid and boundary coordinates if the index dimension + # is present in the array dimensions + if self._index.dim in dims: + return True + else: + return False + + def equals(self, other): + if not isinstance(other, IntervalIndex): + return False + return self._index.equals(other._index, exclude_dims=frozenset()) + + def sel(self, labels, **kwargs): + return self._index.sel(labels, **kwargs) + + def isel(self, indexers): + new_index = self._index.isel(indexers) + if new_index is not None: + return type(self)(new_index, self._bounds_name, self._bounds_dim) + else: + return None + + def roll(self, shifts): + new_index = self._index.roll(shifts) + return type(self)(new_index, self._bounds_name, self._bounds_dim) + + def rename(self, name_dict, dims_dict): + new_index = self._index.rename(name_dict, dims_dict) + + bounds_name = name_dict.get(self._bounds_name, self._bounds_name) + bounds_dim = dims_dict.get(self._bounds_dim, self._bounds_dim) + + return type(self)(new_index, bounds_name, bounds_dim) + + def __repr__(self): + string = f"{self._index!r}" + return string From f71f76768292324ae15100f81e5a4277583041c0 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Wed, 7 May 2025 16:17:57 +0200 Subject: [PATCH 02/23] add index description (docstrings) --- xarray/indexes/interval_index.py | 
16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/xarray/indexes/interval_index.py b/xarray/indexes/interval_index.py index fd171bb5c93..c37301a1c9e 100644 --- a/xarray/indexes/interval_index.py +++ b/xarray/indexes/interval_index.py @@ -8,6 +8,22 @@ class IntervalIndex(Index): + """Xarray index of 1-dimensional intervals. + + This index is built on top of :py:class:`~xarray.indexes.PandasIndex` and + wraps a :py:class:`pandas.IntervalIndex`. It is associated with two + coordinate variables: + + - a 1-dimensional coordinate where each label represents an interval that is + materialized by its midpoint (i.e., the average of its left and right + boundaries) + + - a 2-dimensional coordinate that represents the left and right boundaries + of each interval. One of the two dimensions is shared with the + aforementioned coordinate and the other one has length 2. + + """ + def __init__(self, index: PandasIndex, bounds_name: Hashable, bounds_dim: str): assert isinstance(index.index, pd.IntervalIndex) self._index = index From f7041fd0784756a09368f646d0adab2e22903fb0 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Wed, 7 May 2025 17:01:34 +0200 Subject: [PATCH 03/23] add type annotations --- xarray/core/indexes.py | 7 +-- xarray/indexes/interval_index.py | 78 +++++++++++++++++++++++++------- 2 files changed, 63 insertions(+), 22 deletions(-) diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py index 8babb885a5e..519303dd63b 100644 --- a/xarray/core/indexes.py +++ b/xarray/core/indexes.py @@ -612,9 +612,6 @@ def get_indexer_nd(index: pd.Index, labels, method=None, tolerance=None) -> np.n return indexer -T_PandasIndex = TypeVar("T_PandasIndex", bound="PandasIndex") - - class PandasIndex(Index): """Wrap a pandas.Index as an xarray compatible index.""" @@ -912,9 +909,7 @@ def rename(self, name_dict, dims_dict): new_dim = dims_dict.get(self.dim, self.dim) return self._replace(index, dim=new_dim) - def _copy( - self: T_PandasIndex, deep: bool = True, memo: 
dict[int, Any] | None = None - ) -> T_PandasIndex: + def _copy(self, deep: bool = True, memo: dict[int, Any] | None = None) -> Self: if deep: # pandas is not using the memo index = self.index.copy(deep=True) diff --git a/xarray/indexes/interval_index.py b/xarray/indexes/interval_index.py index c37301a1c9e..815ec9aff62 100644 --- a/xarray/indexes/interval_index.py +++ b/xarray/indexes/interval_index.py @@ -1,10 +1,17 @@ -from collections.abc import Hashable +from __future__ import annotations + +from collections.abc import Hashable, Iterable, Mapping, Sequence +from typing import TYPE_CHECKING, Any, cast import numpy as np import pandas as pd from xarray import Variable from xarray.core.indexes import Index, PandasIndex +from xarray.core.indexing import IndexSelResult + +if TYPE_CHECKING: + from xarray.core.types import Self class IntervalIndex(Index): @@ -24,6 +31,10 @@ class IntervalIndex(Index): """ + _index: PandasIndex + _bounds_name: Hashable + _bounds_dim: str + def __init__(self, index: PandasIndex, bounds_name: Hashable, bounds_dim: str): assert isinstance(index.index, pd.IntervalIndex) self._index = index @@ -31,7 +42,12 @@ def __init__(self, index: PandasIndex, bounds_name: Hashable, bounds_dim: str): self._bounds_dim = bounds_dim @classmethod - def from_variables(cls, variables, options): + def from_variables( + cls, + variables: Mapping[Any, Variable], + *, + options: Mapping[str, Any], + ) -> Self: assert len(variables) == 2 for k, v in variables.items(): @@ -45,10 +61,15 @@ def from_variables(cls, variables, options): index = PandasIndex(pd.IntervalIndex.from_arrays(left, right), dim) bounds_dim = (set(bounds.dims) - set(dim)).pop() - return cls(index, bounds_name, bounds_dim) + return cls(index, bounds_name, str(bounds_dim)) @classmethod - def concat(cls, indexes, dim, positions=None): + def concat( + cls, + indexes: Sequence[IntervalIndex], + dim: Hashable, + positions: Iterable[Iterable[int]] | None = None, + ) -> IntervalIndex: new_index = 
PandasIndex.concat( [idx._index for idx in indexes], dim, positions=positions ) @@ -70,25 +91,41 @@ def concat(cls, indexes, dim, positions=None): return cls(new_index, bounds_name, bounds_dim) - def create_variables(self, variables=None): + @property + def _pd_index(self) -> pd.IntervalIndex: + # For typing purpose only + # TODO: cleaner to make PandasIndex a generic class, i.e., PandasIndex[pd.IntervalIndex] + # will be easier once PEP 696 is fully supported (starting from Python 3.13) + return cast(pd.IntervalIndex, self._index.index) + + def create_variables( + self, variables: Mapping[Any, Variable] | None = None + ) -> dict[Any, Variable]: + if variables is None: + variables = {} empty_var = Variable((), 0) bounds_attrs = variables.get(self._bounds_name, empty_var).attrs mid_attrs = variables.get(self._index.dim, empty_var).attrs bounds_var = Variable( dims=(self._bounds_dim, self._index.dim), - data=np.stack([self._index.index.left, self._index.index.right], axis=0), + data=np.stack([self._pd_index.left, self._pd_index.right], axis=0), attrs=bounds_attrs, ) mid_var = Variable( dims=(self._index.dim,), - data=self._index.index.mid, + data=self._pd_index.mid, attrs=mid_attrs, ) return {self._index.dim: mid_var, self._bounds_name: bounds_var} - def should_add_coord_to_array(self, name, var, dims): + def should_add_coord_to_array( + self, + name: Hashable, + var: Variable, + dims: set[Hashable], + ) -> bool: # add both the mid and boundary coordinates if the index dimension # is present in the array dimensions if self._index.dim in dims: @@ -96,33 +133,42 @@ def should_add_coord_to_array(self, name, var, dims): else: return False - def equals(self, other): + def to_pandas_index(self) -> pd.Index: + return self._pd_index + + def equals(self, other: Index) -> bool: if not isinstance(other, IntervalIndex): return False - return self._index.equals(other._index, exclude_dims=frozenset()) + return self._index.equals(other._index) - def sel(self, labels, **kwargs): + 
def sel(self, labels: dict[Any, Any], **kwargs) -> IndexSelResult: return self._index.sel(labels, **kwargs) - def isel(self, indexers): + def isel( + self, indexers: Mapping[Any, int | slice | np.ndarray | Variable] + ) -> Self | None: new_index = self._index.isel(indexers) if new_index is not None: return type(self)(new_index, self._bounds_name, self._bounds_dim) else: return None - def roll(self, shifts): + def roll(self, shifts: Mapping[Any, int]) -> Self | None: new_index = self._index.roll(shifts) return type(self)(new_index, self._bounds_name, self._bounds_dim) - def rename(self, name_dict, dims_dict): + def rename( + self, + name_dict: Mapping[Any, Hashable], + dims_dict: Mapping[Any, Hashable], + ) -> Self: new_index = self._index.rename(name_dict, dims_dict) bounds_name = name_dict.get(self._bounds_name, self._bounds_name) bounds_dim = dims_dict.get(self._bounds_dim, self._bounds_dim) - return type(self)(new_index, bounds_name, bounds_dim) + return type(self)(new_index, bounds_name, str(bounds_dim)) - def __repr__(self): + def __repr__(self) -> str: string = f"{self._index!r}" return string From 940177473432872cf741f466670f4cbf630b7335 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Wed, 7 May 2025 17:28:02 +0200 Subject: [PATCH 04/23] expose IntervalIndex publicly via xarray.indexes --- xarray/indexes/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xarray/indexes/__init__.py b/xarray/indexes/__init__.py index fafdb49c7e1..5430c6c5401 100644 --- a/xarray/indexes/__init__.py +++ b/xarray/indexes/__init__.py @@ -8,6 +8,7 @@ PandasIndex, PandasMultiIndex, ) +from xarray.indexes.interval_index import IntervalIndex from xarray.indexes.range_index import RangeIndex -__all__ = ["Index", "PandasIndex", "PandasMultiIndex", "RangeIndex"] +__all__ = ["Index", "IntervalIndex", "PandasIndex", "PandasMultiIndex", "RangeIndex"] From 781d33f71a590a9c99ad4a521d09ac9757650481 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Wed, 7 May 2025 
17:28:27 +0200 Subject: [PATCH 05/23] add a few TODOs --- xarray/indexes/interval_index.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/xarray/indexes/interval_index.py b/xarray/indexes/interval_index.py index 815ec9aff62..e9ed13f0a95 100644 --- a/xarray/indexes/interval_index.py +++ b/xarray/indexes/interval_index.py @@ -48,16 +48,22 @@ def from_variables( *, options: Mapping[str, Any], ) -> Self: + # TODO: allow set the index from one variable? Guess bounds like cf_xarray's add_bounds assert len(variables) == 2 for k, v in variables.items(): if v.ndim == 2: + # TODO: be flexible with dimension order? Check which dim has length 2 bounds_name, bounds = k, v elif v.ndim == 1: dim, _ = k, v bounds = bounds.transpose(..., dim) left, right = bounds.data.tolist() + # TODO: support non-dimension coordinates (pass variable name to pd.IntervalIndex.from_arrays) + # TODO: propagate coordinate dtype (pass it to PandasIndex constructor) + # TODO: add "closed" build option (maybe choose "closed='both'" as default here? to be consistent with + # CF conventions: https://cfconventions.org/cf-conventions/cf-conventions.html#bounds-one-d) index = PandasIndex(pd.IntervalIndex.from_arrays(left, right), dim) bounds_dim = (set(bounds.dims) - set(dim)).pop() @@ -107,11 +113,16 @@ def create_variables( bounds_attrs = variables.get(self._bounds_name, empty_var).attrs mid_attrs = variables.get(self._index.dim, empty_var).attrs + # TODO: create a PandasIndexingAdapter subclass for the boundary variable + # and wrap it here (avoid data copy) bounds_var = Variable( dims=(self._bounds_dim, self._index.dim), data=np.stack([self._pd_index.left, self._pd_index.right], axis=0), attrs=bounds_attrs, ) + # TODO: use PandasIndexingAdapter directly (avoid data copy) + # and/or maybe add an index build option to preserve original labels? 
+ # (if those differ from interval midpoints as defined by pd.IntervalIndex) mid_var = Variable( dims=(self._index.dim,), data=self._pd_index.mid, From 48dc0bde1e1876b9c58f1ded7cf4a3c637046186 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Thu, 8 May 2025 09:53:31 +0200 Subject: [PATCH 06/23] clean-up Co-authored-by: Deepak Cherian --- xarray/indexes/interval_index.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/xarray/indexes/interval_index.py b/xarray/indexes/interval_index.py index e9ed13f0a95..217046332e2 100644 --- a/xarray/indexes/interval_index.py +++ b/xarray/indexes/interval_index.py @@ -139,10 +139,7 @@ def should_add_coord_to_array( ) -> bool: # add both the mid and boundary coordinates if the index dimension # is present in the array dimensions - if self._index.dim in dims: - return True - else: - return False + return self._index.dim in dims def to_pandas_index(self) -> pd.Index: return self._pd_index From b424b12ee7a594a3b18dfa6f7a96de9a8906e247 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Thu, 8 May 2025 09:53:46 +0200 Subject: [PATCH 07/23] better docstrings Co-authored-by: Deepak Cherian --- xarray/indexes/interval_index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/indexes/interval_index.py b/xarray/indexes/interval_index.py index 217046332e2..b676c26ab26 100644 --- a/xarray/indexes/interval_index.py +++ b/xarray/indexes/interval_index.py @@ -22,7 +22,7 @@ class IntervalIndex(Index): coordinate variables: - a 1-dimensional coordinate where each label represents an interval that is - materialized by its midpoint (i.e., the average of its left and right + materialized by a central value (commonly the average of its left and right boundaries) - a 2-dimensional coordinate that represents the left and right boundaries From 8d80e7122a157ca5024b18a84f7d2c91d949c366 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Thu, 8 May 2025 13:50:28 +0200 Subject: [PATCH 08/23] refactor: use two 
sub-indexes - a PandasIndex for central values - another PandasIndex (with pd.IntervalIndex) for boundaries --- xarray/indexes/interval_index.py | 229 +++++++++++++++++++------------ 1 file changed, 143 insertions(+), 86 deletions(-) diff --git a/xarray/indexes/interval_index.py b/xarray/indexes/interval_index.py index b676c26ab26..efaffea2642 100644 --- a/xarray/indexes/interval_index.py +++ b/xarray/indexes/interval_index.py @@ -17,9 +17,7 @@ class IntervalIndex(Index): """Xarray index of 1-dimensional intervals. - This index is built on top of :py:class:`~xarray.indexes.PandasIndex` and - wraps a :py:class:`pandas.IntervalIndex`. It is associated with two - coordinate variables: + This index is associated with two coordinate variables: - a 1-dimensional coordinate where each label represents an interval that is materialized by a central value (commonly the average of its left and right @@ -27,18 +25,29 @@ class IntervalIndex(Index): - a 2-dimensional coordinate that represents the left and right boundaries of each interval. One of the two dimensions is shared with the - aforementioned coordinate and the other one has length 2. + aforementioned coordinate and the other one has length 2 + + Interval boundaries are wrapped in a :py:class:`pandas.IntervalIndex` and + central values are wrapped in a separate :py:class:`pandas.Index`. 
""" - _index: PandasIndex - _bounds_name: Hashable + _mid_index: PandasIndex + _bounds_index: PandasIndex _bounds_dim: str - def __init__(self, index: PandasIndex, bounds_name: Hashable, bounds_dim: str): - assert isinstance(index.index, pd.IntervalIndex) - self._index = index - self._bounds_name = bounds_name + def __init__( + self, + mid_index: PandasIndex, + bounds_index: PandasIndex, + bounds_dim: str | None = None, + ): + assert isinstance(bounds_index.index, pd.IntervalIndex) + self._mid_index = mid_index + self._bounds_index = bounds_index + + if bounds_dim is None: + bounds_dim = "bounds" self._bounds_dim = bounds_dim @classmethod @@ -48,26 +57,67 @@ def from_variables( *, options: Mapping[str, Any], ) -> Self: - # TODO: allow set the index from one variable? Guess bounds like cf_xarray's add_bounds - assert len(variables) == 2 - - for k, v in variables.items(): - if v.ndim == 2: - # TODO: be flexible with dimension order? Check which dim has length 2 - bounds_name, bounds = k, v - elif v.ndim == 1: - dim, _ = k, v - - bounds = bounds.transpose(..., dim) - left, right = bounds.data.tolist() - # TODO: support non-dimension coordinates (pass variable name to pd.IntervalIndex.from_arrays) - # TODO: propagate coordinate dtype (pass it to PandasIndex constructor) - # TODO: add "closed" build option (maybe choose "closed='both'" as default here? 
to be consistent with - # CF conventions: https://cfconventions.org/cf-conventions/cf-conventions.html#bounds-one-d) - index = PandasIndex(pd.IntervalIndex.from_arrays(left, right), dim) - bounds_dim = (set(bounds.dims) - set(dim)).pop() - - return cls(index, bounds_name, str(bounds_dim)) + if len(variables) == 2: + mid_var: Variable | None = None + bounds_var: Variable | None = None + + for name, var in variables.items(): + if var.ndim == 1: + mid_name = name + mid_var = var + elif var.ndim == 2: + bounds_name = name + bounds_var = var + + if mid_var is None or bounds_var is None: + raise ValueError( + "invalid coordinates given to IntervalIndex. When two coordinates are given, " + "one must be 1-dimensional (central values) and the other must be " + "2-dimensional (boundaries). Actual coordinate variables:\n" + + "\n".join(variables.values()) + ) + + if mid_var.dims[0] == bounds_var.dims[0]: + dim, bounds_dim = bounds_var.dims + elif mid_var.dims[0] == bounds_var.dims[1]: + bounds_dim, dim = bounds_var.dims + else: + raise ValueError( + "dimension names mismatch between " + f"the central coordinate {mid_name!r} {mid_var.dims!r} and " + f"the boundary coordinate {bounds_name!r} {bounds_var.dims!r} " + "given to IntervalIndex" + ) + + if bounds_var.sizes[bounds_dim] != 2: + raise ValueError( + f"invalid shape for the boundary coordinate given to IntervalIndex (expected dimension {bounds_dim!r} of size 2)" + ) + + pd_mid_index = pd.Index(mid_var.values, name=mid_name) + mid_index = PandasIndex(pd_mid_index, dim, coord_dtype=mid_var.dtype) + + left, right = bounds_var.transpose(..., dim).values.tolist() + # TODO: make closed configurable + pd_bounds_index = pd.IntervalIndex.from_arrays( + left, right, name=bounds_name + ) + bounds_index = PandasIndex( + pd_bounds_index, dim, coord_dtype=bounds_var.dtype + ) + + elif len(variables) == 1: + # TODO: allow setting the index from one variable? 
Perhaps in this fallback order: + # - check if the coordinate wraps a pd.IntervalIndex + # - look after the CF `bounds` attribute + # - guess bounds like cf_xarray's add_bounds + raise ValueError( + "Setting an IntervalIndex from one coordinate is not yet supported" + ) + else: + raise ValueError("Too many coordinate variables given to IntervalIndex") + + return cls(mid_index, bounds_index, bounds_dim=str(bounds_dim)) @classmethod def concat( @@ -76,60 +126,62 @@ def concat( dim: Hashable, positions: Iterable[Iterable[int]] | None = None, ) -> IntervalIndex: - new_index = PandasIndex.concat( - [idx._index for idx in indexes], dim, positions=positions + new_mid_index = PandasIndex.concat( + [idx._mid_index for idx in indexes], dim, positions=positions + ) + new_bounds_index = PandasIndex.concat( + [idx._bounds_index for idx in indexes], dim, positions=positions ) if indexes: - bounds_name = indexes[0]._bounds_name bounds_dim = indexes[0]._bounds_dim - if any( - idx._bounds_name != bounds_name or idx._bounds_dim != bounds_dim - for idx in indexes - ): + # TODO: check whether this may actually happen or concat fails early during alignment + if any(idx._bounds_dim != bounds_dim for idx in indexes): raise ValueError( f"Cannot concatenate along dimension {dim!r} indexes with different " "boundary coordinate or dimension names" ) else: - bounds_name = new_index.index.name + "_bounds" - bounds_dim = "bnd" + bounds_dim = "bounds" + + return cls(new_mid_index, new_bounds_index, bounds_dim) - return cls(new_index, bounds_name, bounds_dim) + @property + def dim(self) -> Hashable: + return self._bounds_index.dim @property - def _pd_index(self) -> pd.IntervalIndex: - # For typing purpose only - # TODO: cleaner to make PandasIndex a generic class, i.e., PandasIndex[pd.IntervalIndex] - # will be easier once PEP 696 is fully supported (starting from Python 3.13) - return cast(pd.IntervalIndex, self._index.index) + def bounds_dim(self) -> Hashable: + return self._bounds_dim def 
create_variables( self, variables: Mapping[Any, Variable] | None = None ) -> dict[Any, Variable]: - if variables is None: - variables = {} - empty_var = Variable((), 0) - bounds_attrs = variables.get(self._bounds_name, empty_var).attrs - mid_attrs = variables.get(self._index.dim, empty_var).attrs - - # TODO: create a PandasIndexingAdapter subclass for the boundary variable - # and wrap it here (avoid data copy) - bounds_var = Variable( - dims=(self._bounds_dim, self._index.dim), - data=np.stack([self._pd_index.left, self._pd_index.right], axis=0), - attrs=bounds_attrs, - ) - # TODO: use PandasIndexingAdapter directly (avoid data copy) - # and/or maybe add an index build option to preserve original labels? - # (if those differ from interval midpoints as defined by pd.IntervalIndex) - mid_var = Variable( - dims=(self._index.dim,), - data=self._pd_index.mid, - attrs=mid_attrs, + new_variables = self._mid_index.create_variables(variables) + + # boundary variable (we cannot just defer to self._bounds_index.create_variables()) + bounds_pd_index = cast(pd.IntervalIndex, self._bounds_index.index) + bounds_varname = bounds_pd_index.name + attrs: Mapping[Hashable, Any] | None + encoding: Mapping[Hashable, Any] | None + + if variables is not None and bounds_varname in variables: + var = variables[bounds_varname] + attrs = var.attrs + encoding = var.encoding + else: + attrs = None + encoding = None + + # TODO: wrap index data in a PandasIndexingAdapter subclass instead + # TODO: do we want to preserve the original dimension order for the boundary coordinate? 
+ # (using CF-compliant order below) + data = np.stack([bounds_pd_index.left, bounds_pd_index.right], axis=-1) + new_variables[bounds_varname] = Variable( + (self.dim, self.bounds_dim), data, attrs=attrs, encoding=encoding ) - return {self._index.dim: mid_var, self._bounds_name: bounds_var} + return new_variables def should_add_coord_to_array( self, @@ -137,46 +189,51 @@ def should_add_coord_to_array( var: Variable, dims: set[Hashable], ) -> bool: - # add both the mid and boundary coordinates if the index dimension - # is present in the array dimensions - return self._index.dim in dims - - def to_pandas_index(self) -> pd.Index: - return self._pd_index + # add both the central and boundary coordinates if the dimension + # that they both share is present in the array dimensions + return self.dim in dims def equals(self, other: Index) -> bool: if not isinstance(other, IntervalIndex): return False - return self._index.equals(other._index) + return self._mid_index.equals(other._mid_index) and self._bounds_index.equals( + other._bounds_index + ) def sel(self, labels: dict[Any, Any], **kwargs) -> IndexSelResult: - return self._index.sel(labels, **kwargs) + return self._bounds_index.sel(labels, **kwargs) def isel( self, indexers: Mapping[Any, int | slice | np.ndarray | Variable] ) -> Self | None: - new_index = self._index.isel(indexers) - if new_index is not None: - return type(self)(new_index, self._bounds_name, self._bounds_dim) - else: + new_mid_index = self._mid_index.isel(indexers) + new_bounds_index = self._bounds_index.isel(indexers) + + if new_mid_index is None or new_bounds_index is None: return None + else: + return type(self)(new_mid_index, new_bounds_index, self.bounds_dim) def roll(self, shifts: Mapping[Any, int]) -> Self | None: - new_index = self._index.roll(shifts) - return type(self)(new_index, self._bounds_name, self._bounds_dim) + new_mid_index = self._mid_index.roll(shifts) + new_bounds_index = self._bounds_index.roll(shifts) + + return 
type(self)(new_mid_index, new_bounds_index, self._bounds_dim) def rename( self, name_dict: Mapping[Any, Hashable], dims_dict: Mapping[Any, Hashable], ) -> Self: - new_index = self._index.rename(name_dict, dims_dict) + new_mid_index = self._mid_index.rename(name_dict, dims_dict) + new_bounds_index = self._bounds_index.rename(name_dict, dims_dict) - bounds_name = name_dict.get(self._bounds_name, self._bounds_name) - bounds_dim = dims_dict.get(self._bounds_dim, self._bounds_dim) + bounds_dim = dims_dict.get(self.bounds_dim, self.bounds_dim) - return type(self)(new_index, bounds_name, str(bounds_dim)) + return type(self)(new_mid_index, new_bounds_index, str(bounds_dim)) def __repr__(self) -> str: - string = f"{self._index!r}" - return string + text = "IntervalIndex\n" + text += f"- central values:\n{self._mid_index!r}\n" + text += f"- boundaries:\n{self._bounds_index!r}\n" + return text From e60a1a4db00fea6901a94c2fa576a112c9331477 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Thu, 8 May 2025 14:05:48 +0200 Subject: [PATCH 09/23] check consistent central values vs. intervals --- xarray/indexes/interval_index.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/xarray/indexes/interval_index.py b/xarray/indexes/interval_index.py index efaffea2642..70a3270327f 100644 --- a/xarray/indexes/interval_index.py +++ b/xarray/indexes/interval_index.py @@ -106,6 +106,14 @@ def from_variables( pd_bounds_index, dim, coord_dtype=bounds_var.dtype ) + actual_indexer = pd_bounds_index.get_indexer(pd_mid_index) + expected_indexer = np.arange(pd_mid_index.size) + if not np.array_equal(actual_indexer, expected_indexer): + raise ValueError( + "invalid coordinates given to IntervalIndex. Not all central values are " + "in their corresponding interval" + ) + elif len(variables) == 1: # TODO: allow setting the index from one variable? 
Perhaps in this fallback order: # - check if the coordinate wraps a pd.IntervalIndex From 8918fe83934e49eeedd8b05d8b3a4e2a2186804b Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Thu, 8 May 2025 14:16:49 +0200 Subject: [PATCH 10/23] fix mypy --- xarray/indexes/interval_index.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/xarray/indexes/interval_index.py b/xarray/indexes/interval_index.py index 70a3270327f..f2ddfdbe430 100644 --- a/xarray/indexes/interval_index.py +++ b/xarray/indexes/interval_index.py @@ -74,7 +74,7 @@ def from_variables( "invalid coordinates given to IntervalIndex. When two coordinates are given, " "one must be 1-dimensional (central values) and the other must be " "2-dimensional (boundaries). Actual coordinate variables:\n" - + "\n".join(variables.values()) + + "\n".join(str(var) for var in variables.values()) ) if mid_var.dims[0] == bounds_var.dims[0]: @@ -204,6 +204,7 @@ def should_add_coord_to_array( def equals(self, other: Index) -> bool: if not isinstance(other, IntervalIndex): return False + return self._mid_index.equals(other._mid_index) and self._bounds_index.equals( other._bounds_index ) @@ -220,7 +221,7 @@ def isel( if new_mid_index is None or new_bounds_index is None: return None else: - return type(self)(new_mid_index, new_bounds_index, self.bounds_dim) + return type(self)(new_mid_index, new_bounds_index, str(self.bounds_dim)) def roll(self, shifts: Mapping[Any, int]) -> Self | None: new_mid_index = self._mid_index.roll(shifts) From c722a2ed7e6a4b651759432cb1a540f2db7bb673 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Thu, 8 May 2025 15:41:04 +0200 Subject: [PATCH 11/23] implement join and reindex_like --- xarray/indexes/interval_index.py | 50 ++++++++++++++++++++++++++------ 1 file changed, 41 insertions(+), 9 deletions(-) diff --git a/xarray/indexes/interval_index.py b/xarray/indexes/interval_index.py index f2ddfdbe430..f992f2f9d7b 100644 --- a/xarray/indexes/interval_index.py +++ 
b/xarray/indexes/interval_index.py @@ -14,6 +14,13 @@ from xarray.core.types import Self +def check_mid_in_interval(mid_index: pd.Index, bounds_index: pd.IntervalIndex): + actual_indexer = bounds_index.get_indexer(mid_index) + expected_indexer = np.arange(mid_index.size) + if not np.array_equal(actual_indexer, expected_indexer): + raise ValueError("not all central values are in their corresponding interval") + + class IntervalIndex(Index): """Xarray index of 1-dimensional intervals. @@ -43,6 +50,8 @@ def __init__( bounds_dim: str | None = None, ): assert isinstance(bounds_index.index, pd.IntervalIndex) + assert mid_index.dim == bounds_index.dim + self._mid_index = mid_index self._bounds_index = bounds_index @@ -91,7 +100,8 @@ def from_variables( if bounds_var.sizes[bounds_dim] != 2: raise ValueError( - f"invalid shape for the boundary coordinate given to IntervalIndex (expected dimension {bounds_dim!r} of size 2)" + "invalid shape for the boundary coordinate given to IntervalIndex " + f"(expected dimension {bounds_dim!r} of size 2)" ) pd_mid_index = pd.Index(mid_var.values, name=mid_name) @@ -106,13 +116,7 @@ def from_variables( pd_bounds_index, dim, coord_dtype=bounds_var.dtype ) - actual_indexer = pd_bounds_index.get_indexer(pd_mid_index) - expected_indexer = np.arange(pd_mid_index.size) - if not np.array_equal(actual_indexer, expected_indexer): - raise ValueError( - "invalid coordinates given to IntervalIndex. Not all central values are " - "in their corresponding interval" - ) + check_mid_in_interval(pd_mid_index, pd_bounds_index) elif len(variables) == 1: # TODO: allow setting the index from one variable? 
Perhaps in this fallback order: @@ -156,7 +160,7 @@ def concat( @property def dim(self) -> Hashable: - return self._bounds_index.dim + return self._mid_index.dim @property def bounds_dim(self) -> Hashable: @@ -209,6 +213,34 @@ def equals(self, other: Index) -> bool: other._bounds_index ) + def join(self, other: Self, how: str = "inner") -> Self: + joined_mid_index = self._mid_index.join(other._mid_index, how=how) + joined_bounds_index = self._bounds_index.join(other._bounds_index, how=how) + + assert isinstance(joined_bounds_index, pd.IntervalIndex) + check_mid_in_interval( + joined_mid_index.index, cast(pd.IntervalIndex, joined_bounds_index.index) + ) + + return type(self)(joined_mid_index, joined_bounds_index, self.bounds_dim) + + def reindex_like( + self, other: Self, method=None, tolerance=None + ) -> dict[Hashable, Any]: + mid_indexers = self._mid_index.reindex_like( + other._mid_index, method=method, tolerance=tolerance + ) + bounds_indexers = self._mid_index.reindex_like( + other._bounds_index, method=method, tolerance=tolerance + ) + + if not np.array_equal(mid_indexers[self.dim], bounds_indexers[self.dim]): + raise ValueError( + f"conflicting reindexing of central values and intervals along dimension {self.dim!r}" + ) + + return mid_indexers + def sel(self, labels: dict[Any, Any], **kwargs) -> IndexSelResult: return self._bounds_index.sel(labels, **kwargs) From 23fb18b7330deb514249e590df1f9d2ff53e066d Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Thu, 8 May 2025 15:48:09 +0200 Subject: [PATCH 12/23] add mid_index and bounds_index properties --- xarray/indexes/interval_index.py | 58 ++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/xarray/indexes/interval_index.py b/xarray/indexes/interval_index.py index f992f2f9d7b..e7d574bde47 100644 --- a/xarray/indexes/interval_index.py +++ b/xarray/indexes/interval_index.py @@ -139,14 +139,14 @@ def concat( positions: Iterable[Iterable[int]] | None = None, ) -> 
IntervalIndex: new_mid_index = PandasIndex.concat( - [idx._mid_index for idx in indexes], dim, positions=positions + [idx.mid_index for idx in indexes], dim, positions=positions ) new_bounds_index = PandasIndex.concat( - [idx._bounds_index for idx in indexes], dim, positions=positions + [idx.bounds_index for idx in indexes], dim, positions=positions ) if indexes: - bounds_dim = indexes[0]._bounds_dim + bounds_dim = indexes[0].bounds_dim # TODO: check whether this may actually happen or concat fails early during alignment if any(idx._bounds_dim != bounds_dim for idx in indexes): raise ValueError( @@ -156,11 +156,19 @@ def concat( else: bounds_dim = "bounds" - return cls(new_mid_index, new_bounds_index, bounds_dim) + return cls(new_mid_index, new_bounds_index, str(bounds_dim)) + + @property + def mid_index(self) -> PandasIndex: + return self._mid_index + + @property + def bounds_index(self) -> PandasIndex: + return self._bounds_index @property def dim(self) -> Hashable: - return self._mid_index.dim + return self.mid_index.dim @property def bounds_dim(self) -> Hashable: @@ -169,10 +177,10 @@ def bounds_dim(self) -> Hashable: def create_variables( self, variables: Mapping[Any, Variable] | None = None ) -> dict[Any, Variable]: - new_variables = self._mid_index.create_variables(variables) + new_variables = self.mid_index.create_variables(variables) - # boundary variable (we cannot just defer to self._bounds_index.create_variables()) - bounds_pd_index = cast(pd.IntervalIndex, self._bounds_index.index) + # boundary variable (we cannot just defer to self.bounds_index.create_variables()) + bounds_pd_index = cast(pd.IntervalIndex, self.bounds_index.index) bounds_varname = bounds_pd_index.name attrs: Mapping[Hashable, Any] | None encoding: Mapping[Hashable, Any] | None @@ -209,13 +217,13 @@ def equals(self, other: Index) -> bool: if not isinstance(other, IntervalIndex): return False - return self._mid_index.equals(other._mid_index) and self._bounds_index.equals( - 
other._bounds_index + return self.mid_index.equals(other.mid_index) and self.bounds_index.equals( + other.bounds_index ) def join(self, other: Self, how: str = "inner") -> Self: - joined_mid_index = self._mid_index.join(other._mid_index, how=how) - joined_bounds_index = self._bounds_index.join(other._bounds_index, how=how) + joined_mid_index = self.mid_index.join(other.mid_index, how=how) + joined_bounds_index = self.bounds_index.join(other.bounds_index, how=how) assert isinstance(joined_bounds_index, pd.IntervalIndex) check_mid_in_interval( @@ -227,11 +235,11 @@ def join(self, other: Self, how: str = "inner") -> Self: def reindex_like( self, other: Self, method=None, tolerance=None ) -> dict[Hashable, Any]: - mid_indexers = self._mid_index.reindex_like( - other._mid_index, method=method, tolerance=tolerance + mid_indexers = self.mid_index.reindex_like( + other.mid_index, method=method, tolerance=tolerance ) - bounds_indexers = self._mid_index.reindex_like( - other._bounds_index, method=method, tolerance=tolerance + bounds_indexers = self.mid_index.reindex_like( + other.bounds_index, method=method, tolerance=tolerance ) if not np.array_equal(mid_indexers[self.dim], bounds_indexers[self.dim]): @@ -242,13 +250,13 @@ def reindex_like( return mid_indexers def sel(self, labels: dict[Any, Any], **kwargs) -> IndexSelResult: - return self._bounds_index.sel(labels, **kwargs) + return self.bounds_index.sel(labels, **kwargs) def isel( self, indexers: Mapping[Any, int | slice | np.ndarray | Variable] ) -> Self | None: - new_mid_index = self._mid_index.isel(indexers) - new_bounds_index = self._bounds_index.isel(indexers) + new_mid_index = self.mid_index.isel(indexers) + new_bounds_index = self.bounds_index.isel(indexers) if new_mid_index is None or new_bounds_index is None: return None @@ -256,8 +264,8 @@ def isel( return type(self)(new_mid_index, new_bounds_index, str(self.bounds_dim)) def roll(self, shifts: Mapping[Any, int]) -> Self | None: - new_mid_index = 
self._mid_index.roll(shifts) - new_bounds_index = self._bounds_index.roll(shifts) + new_mid_index = self.mid_index.roll(shifts) + new_bounds_index = self.bounds_index.roll(shifts) return type(self)(new_mid_index, new_bounds_index, self._bounds_dim) @@ -266,8 +274,8 @@ def rename( name_dict: Mapping[Any, Hashable], dims_dict: Mapping[Any, Hashable], ) -> Self: - new_mid_index = self._mid_index.rename(name_dict, dims_dict) - new_bounds_index = self._bounds_index.rename(name_dict, dims_dict) + new_mid_index = self.mid_index.rename(name_dict, dims_dict) + new_bounds_index = self.bounds_index.rename(name_dict, dims_dict) bounds_dim = dims_dict.get(self.bounds_dim, self.bounds_dim) @@ -275,6 +283,6 @@ def rename( def __repr__(self) -> str: text = "IntervalIndex\n" - text += f"- central values:\n{self._mid_index!r}\n" - text += f"- boundaries:\n{self._bounds_index!r}\n" + text += f"- central values:\n{self.mid_index!r}\n" + text += f"- boundaries:\n{self.bounds_index!r}\n" return text From de4f5d891a648b0cfe82702ff91662f40c1a194e Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 9 May 2025 11:19:31 +0200 Subject: [PATCH 13/23] clean-up indexing.PandasIndexingAdapter typing --- xarray/core/indexing.py | 55 +++++------------------------------------ 1 file changed, 6 insertions(+), 49 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index c1b847202c7..d973eb12ac8 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -1846,29 +1846,13 @@ def _prepare_key(self, key: Any | tuple[Any, ...]) -> tuple[Any, ...]: return key - def _handle_result( - self, result: Any - ) -> ( - PandasIndexingAdapter - | NumpyIndexingAdapter - | np.ndarray - | np.datetime64 - | np.timedelta64 - ): + def _handle_result(self, result: Any) -> PandasIndexingAdapter | np.ndarray: if isinstance(result, pd.Index): return type(self)(result, dtype=self.dtype) else: return self._convert_scalar(result) - def _oindex_get( - self, indexer: OuterIndexer - ) -> ( - 
PandasIndexingAdapter - | NumpyIndexingAdapter - | np.ndarray - | np.datetime64 - | np.timedelta64 - ): + def _oindex_get(self, indexer: OuterIndexer) -> PandasIndexingAdapter | np.ndarray: key = self._prepare_key(indexer.tuple) if getattr(key, "ndim", 0) > 1: # Return np-array if multidimensional @@ -1881,13 +1865,7 @@ def _oindex_get( def _vindex_get( self, indexer: VectorizedIndexer - ) -> ( - PandasIndexingAdapter - | NumpyIndexingAdapter - | np.ndarray - | np.datetime64 - | np.timedelta64 - ): + ) -> PandasIndexingAdapter | np.ndarray: _assert_not_chunked_indexer(indexer.tuple) key = self._prepare_key(indexer.tuple) @@ -1901,13 +1879,7 @@ def _vindex_get( def __getitem__( self, indexer: ExplicitIndexer - ) -> ( - PandasIndexingAdapter - | NumpyIndexingAdapter - | np.ndarray - | np.datetime64 - | np.timedelta64 - ): + ) -> PandasIndexingAdapter | np.ndarray: key = self._prepare_key(indexer.tuple) if getattr(key, "ndim", 0) > 1: # Return np-array if multidimensional @@ -1987,15 +1959,7 @@ def _convert_scalar(self, item): item = item[idx] return super()._convert_scalar(item) - def _oindex_get( - self, indexer: OuterIndexer - ) -> ( - PandasIndexingAdapter - | NumpyIndexingAdapter - | np.ndarray - | np.datetime64 - | np.timedelta64 - ): + def _oindex_get(self, indexer: OuterIndexer) -> PandasIndexingAdapter | np.ndarray: result = super()._oindex_get(indexer) if isinstance(result, type(self)): result.level = self.level @@ -2003,13 +1967,7 @@ def _oindex_get( def _vindex_get( self, indexer: VectorizedIndexer - ) -> ( - PandasIndexingAdapter - | NumpyIndexingAdapter - | np.ndarray - | np.datetime64 - | np.timedelta64 - ): + ) -> PandasIndexingAdapter | np.ndarray: result = super()._vindex_get(indexer) if isinstance(result, type(self)): result.level = self.level @@ -2019,7 +1977,6 @@ def __getitem__(self, indexer: ExplicitIndexer): result = super().__getitem__(indexer) if isinstance(result, type(self)): result.level = self.level - return result def __repr__(self) -> 
str: From e1bf896278e662e3e042f00f2d7fde7508e2375d Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 9 May 2025 13:05:34 +0200 Subject: [PATCH 14/23] streamline PandasIndexingAdapter indexing logic Grouping the logic into one method will make it easier to override the behavior in subclasses (interval index) without affecting readability much. It also yields more DRY code. --- xarray/core/indexing.py | 67 ++++++++++++----------------------------- 1 file changed, 20 insertions(+), 47 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index d973eb12ac8..bb54f6b83df 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -1838,57 +1838,42 @@ def _convert_scalar(self, item) -> np.ndarray: # a NumPy array. return to_0d_array(item) - def _prepare_key(self, key: Any | tuple[Any, ...]) -> tuple[Any, ...]: - if isinstance(key, tuple) and len(key) == 1: + def _index_get( + self, indexer: ExplicitIndexer, func_name: str + ) -> PandasIndexingAdapter | np.ndarray: + key = indexer.tuple + + if len(key) == 1: # unpack key so it can index a pandas.Index object (pandas.Index # objects don't like tuples) (key,) = key - return key + # if multidimensional key, convert the index to numpy array and index the latter + if getattr(key, "ndim", 0) > 1: + indexable = NumpyIndexingAdapter(np.asarray(self)) + return getattr(indexable, func_name)(indexer) + + # otherwise index the pandas index then re-wrap or convert the result + result = self.array[key] - def _handle_result(self, result: Any) -> PandasIndexingAdapter | np.ndarray: if isinstance(result, pd.Index): return type(self)(result, dtype=self.dtype) else: return self._convert_scalar(result) def _oindex_get(self, indexer: OuterIndexer) -> PandasIndexingAdapter | np.ndarray: - key = self._prepare_key(indexer.tuple) - - if getattr(key, "ndim", 0) > 1: # Return np-array if multidimensional - indexable = NumpyIndexingAdapter(np.asarray(self)) - return indexable.oindex[indexer] - - result = 
self.array[key] - - return self._handle_result(result) + return self._index_get(indexer, "_oindex_get") def _vindex_get( self, indexer: VectorizedIndexer ) -> PandasIndexingAdapter | np.ndarray: _assert_not_chunked_indexer(indexer.tuple) - key = self._prepare_key(indexer.tuple) - - if getattr(key, "ndim", 0) > 1: # Return np-array if multidimensional - indexable = NumpyIndexingAdapter(np.asarray(self)) - return indexable.vindex[indexer] - - result = self.array[key] - - return self._handle_result(result) + return self._index_get(indexer, "_vindex_get") def __getitem__( self, indexer: ExplicitIndexer ) -> PandasIndexingAdapter | np.ndarray: - key = self._prepare_key(indexer.tuple) - - if getattr(key, "ndim", 0) > 1: # Return np-array if multidimensional - indexable = NumpyIndexingAdapter(np.asarray(self)) - return indexable[indexer] - - result = self.array[key] - - return self._handle_result(result) + return self._index_get(indexer, "__getitem__") def transpose(self, order) -> pd.Index: return self.array # self.array should be always one-dimensional @@ -1953,28 +1938,16 @@ def __array__( else: return super().__array__(dtype, copy=copy) - def _convert_scalar(self, item): + def _convert_scalar(self, item: Any): if isinstance(item, tuple) and self.level is not None: idx = tuple(self.array.names).index(self.level) item = item[idx] return super()._convert_scalar(item) - def _oindex_get(self, indexer: OuterIndexer) -> PandasIndexingAdapter | np.ndarray: - result = super()._oindex_get(indexer) - if isinstance(result, type(self)): - result.level = self.level - return result - - def _vindex_get( - self, indexer: VectorizedIndexer + def _index_get( + self, indexer: ExplicitIndexer, func_name: str ) -> PandasIndexingAdapter | np.ndarray: - result = super()._vindex_get(indexer) - if isinstance(result, type(self)): - result.level = self.level - return result - - def __getitem__(self, indexer: ExplicitIndexer): - result = super().__getitem__(indexer) + result = 
super()._index_get(indexer, func_name) if isinstance(result, type(self)): result.level = self.level return result From 06a3b92e34ca983543434789678f2dd691789cd6 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 9 May 2025 13:34:49 +0200 Subject: [PATCH 15/23] add xarray indexing adapter for pd.IntervalIndex So we can wrap pd.IntervalIndex in a boundary 2-dimensional coordinate variable. --- xarray/core/indexing.py | 89 ++++++++++++++++++++++++++++++++ xarray/indexes/interval_index.py | 7 +-- 2 files changed, 93 insertions(+), 3 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index bb54f6b83df..f403cc8a89d 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -28,6 +28,7 @@ get_valid_numpy_dtype, is_duck_array, is_duck_dask_array, + is_full_slice, is_scalar, is_valid_numpy_dtype, to_0d_array, @@ -1993,6 +1994,94 @@ def copy(self, deep: bool = True) -> Self: return type(self)(array, self._dtype, self.level) +class PandasIntervalIndexingAdapter(PandasIndexingAdapter): + """Wraps a pandas.IntervalIndex as a 2-dimensional coordinate array. + + When the array is not transposed, left and right interval boundaries are on + the 2nd axis, i.e., shape is (N, 2). 
+ + """ + + __slots__ = ("_bounds_axis", "_dtype", "array") + + array: pd.IntervalIndex + _dtype: np.dtype | pd.api.extensions.ExtensionDtype + _bounds_axis: int + + def __init__( + self, + array: pd.IntervalIndex, + dtype: DTypeLike | pd.api.extensions.ExtensionDtype | None = None, + transpose: bool = False, + ): + super().__init__(array, dtype=dtype) + + if transpose: + self._bounds_axis = 0 + else: + self._bounds_axis = -1 + + @property + def shape(self) -> _Shape: + if self._bounds_axis == 0: + return (2, len(self.array)) + else: + return (len(self.array), 2) + + def __array__( + self, + dtype: np.typing.DTypeLike | None = None, + /, + *, + copy: bool | None = None, + ) -> np.ndarray: + if dtype is None and is_valid_numpy_dtype(self.dtype): + dtype = cast(np.dtype, self.dtype) + else: + dtype = get_valid_numpy_dtype(self.array) + + return np.stack( + [self.array.left, self.array.right], axis=self._bounds_axis, dtype=dtype + ) + + def get_duck_array(self) -> np.ndarray: + return np.asarray(self) + + def _index_get( + self, indexer: ExplicitIndexer, func_name: str + ) -> PandasIndexingAdapter | np.ndarray: + key = indexer.tuple + + if len(key) == 1: + # unpack key so it can index a pandas.Index object (pandas.Index + # objects don't like tuples) + (key,) = key + elif len(key) == 2 and is_full_slice(key[self._bounds_axis]): + # OK to index the pandas.IntervalIndex and keep it wrapped + # (drop the bounds axis key) + key = key[self._bounds_axis + 1] + + # if length-2 or multidimensional key, convert the index to numpy array + # and index the latter + if (isinstance(key, tuple) and len(key) == 2) or getattr(key, "ndim", 0) > 1: + indexable = NumpyIndexingAdapter(np.asarray(self)) + return getattr(indexable, func_name)(indexer) + + # otherwise index the pandas IntervalIndex then re-wrap or convert the result + result = self.array[key] + + if isinstance(result, pd.IntervalIndex): + return type(self)(result, dtype=self.dtype) + elif isinstance(result, pd.Interval): + 
return np.array([result.left, result.right]) + else: + return self._convert_scalar(result) + + def transpose(self, order: Iterable[int]) -> Self: + transpose = tuple(order) == (1, 0) + return type(self)(self.array, dtype=self.dtype, transpose=transpose) + + class CoordinateTransformIndexingAdapter(ExplicitlyIndexedNDArrayMixin): """Wrap a CoordinateTransform as a lazy coordinate array. diff --git a/xarray/indexes/interval_index.py b/xarray/indexes/interval_index.py index e7d574bde47..78d14e59bb2 100644 --- a/xarray/indexes/interval_index.py +++ b/xarray/indexes/interval_index.py @@ -8,7 +8,7 @@ from xarray import Variable from xarray.core.indexes import Index, PandasIndex -from xarray.core.indexing import IndexSelResult +from xarray.core.indexing import IndexSelResult, PandasIntervalIndexingAdapter if TYPE_CHECKING: from xarray.core.types import Self @@ -193,10 +193,11 @@ def create_variables( attrs = None encoding = None - # TODO: wrap index data in a PandasIndexingAdapter subclass instead # TODO: do we want to preserve the original dimension order for the boundary coordinate? # (using CF-compliant order below) - data = np.stack([bounds_pd_index.left, bounds_pd_index.right], axis=-1) + data = PandasIntervalIndexingAdapter( + bounds_pd_index, dtype=self.bounds_index.coord_dtype + ) new_variables[bounds_varname] = Variable( (self.dim, self.bounds_dim), data, attrs=attrs, encoding=encoding ) From 80f496ff02323253d35832903ef5162801569d7a Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 9 May 2025 14:38:26 +0200 Subject: [PATCH 16/23] clean-up PandasIndexingAdapter dtype handling Prevent numpy.dtype conversions or castings implemented in various places, gather the logic into one method. 
--- xarray/core/indexing.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index f403cc8a89d..8e2dfd38a3d 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -1779,6 +1779,15 @@ def __init__( def dtype(self) -> np.dtype | pd.api.extensions.ExtensionDtype: # type: ignore[override] return self._dtype + def _get_numpy_dtype(self, dtype: np.typing.DTypeLike | None = None) -> np.dtype: + if dtype is None: + if is_valid_numpy_dtype(self.dtype): + return cast(np.dtype, self.dtype) + else: + return get_valid_numpy_dtype(self.array) + else: + return np.dtype(dtype) + def __array__( self, dtype: np.typing.DTypeLike | None = None, @@ -1786,11 +1795,9 @@ def __array__( *, copy: bool | None = None, ) -> np.ndarray: - if dtype is None and is_valid_numpy_dtype(self.dtype): - dtype = cast(np.dtype, self.dtype) - else: - dtype = get_valid_numpy_dtype(self.array) + dtype = self._get_numpy_dtype(dtype) array = self.array + if isinstance(array, pd.PeriodIndex): with suppress(AttributeError): # this might not be public API @@ -1830,10 +1837,8 @@ def _convert_scalar(self, item) -> np.ndarray: # numpy fails to convert pd.Timestamp to np.datetime64[ns] item = np.asarray(item.to_datetime64()) elif self.dtype != object: - dtype = self.dtype - if pd.api.types.is_extension_array_dtype(dtype): - dtype = get_valid_numpy_dtype(self.array) - item = np.asarray(item, dtype=cast(np.dtype, dtype)) + dtype = self._get_numpy_dtype() + item = np.asarray(item, dtype=dtype) # as for numpy.ndarray indexing, we always want the result to be # a NumPy array. 
@@ -1897,7 +1902,9 @@ def copy(self, deep: bool = True) -> Self: def nbytes(self) -> int: if pd.api.types.is_extension_array_dtype(self.dtype): return self.array.nbytes - return cast(np.dtype, self.dtype).itemsize * len(self.array) + + dtype = self._get_numpy_dtype() + return dtype.itemsize * len(self.array) class PandasMultiIndexingAdapter(PandasIndexingAdapter): @@ -2073,7 +2080,7 @@ def _index_get( if isinstance(result, pd.IntervalIndex): return type(self)(result, dtype=self.dtype) elif isinstance(result, pd.Interval): - return np.array([result.left, result.right]) + return np.array([result.left, result.right], dtype=self._get_numpy_dtype()) else: return self._convert_scalar(result) From 67d8f6c43fd913dc031dc0a41cc89bf4a4de02cb Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 9 May 2025 14:41:19 +0200 Subject: [PATCH 17/23] fix mypy --- xarray/core/indexing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 8e2dfd38a3d..e1a68498308 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -1881,7 +1881,7 @@ def __getitem__( ) -> PandasIndexingAdapter | np.ndarray: return self._index_get(indexer, "__getitem__") - def transpose(self, order) -> pd.Index: + def transpose(self, order) -> Self | pd.Index: return self.array # self.array should be always one-dimensional def __repr__(self) -> str: @@ -2057,7 +2057,7 @@ def get_duck_array(self) -> np.ndarray: def _index_get( self, indexer: ExplicitIndexer, func_name: str ) -> PandasIndexingAdapter | np.ndarray: - key = indexer.tuple + key: tuple | Any = indexer.tuple if len(key) == 1: # unpack key so it can index a pandas.Index object (pandas.Index From a8015aa49d3da93e9fa597743fc2c8a3497ff9fc Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 9 May 2025 14:41:28 +0200 Subject: [PATCH 18/23] IntervalIndex sel / isel: handle boundary dim & coord --- xarray/indexes/interval_index.py | 20 ++++++++++++++++++++ 1 file changed, 20 
insertions(+) diff --git a/xarray/indexes/interval_index.py b/xarray/indexes/interval_index.py index 78d14e59bb2..b76274a5b32 100644 --- a/xarray/indexes/interval_index.py +++ b/xarray/indexes/interval_index.py @@ -9,6 +9,7 @@ from xarray import Variable from xarray.core.indexes import Index, PandasIndex from xarray.core.indexing import IndexSelResult, PandasIntervalIndexingAdapter +from xarray.core.utils import is_full_slice if TYPE_CHECKING: from xarray.core.types import Self @@ -251,11 +252,30 @@ def reindex_like( return mid_indexers def sel(self, labels: dict[Any, Any], **kwargs) -> IndexSelResult: + bounds_coord_name = self.bounds_index.index.name + if bounds_coord_name in labels: + raise ValueError( + "IntervalIndex doesn't support label-based selection " + f"using the boundary coordinate {bounds_coord_name!r}" + ) + return self.bounds_index.sel(labels, **kwargs) def isel( self, indexers: Mapping[Any, int | slice | np.ndarray | Variable] ) -> Self | None: + indexers = dict(indexers) + + if self.bounds_dim in indexers: + if is_full_slice(indexers[self._bounds_dim]): + # prevent errors raised when calling isel on the underlying PandasIndex objects + indexers.pop(self.bounds_dim) + if self.dim not in indexers: + indexers[self.dim] = slice(None) + else: + # drop the index when selecting on the bounds dimension + return None + new_mid_index = self.mid_index.isel(indexers) new_bounds_index = self.bounds_index.isel(indexers) From 5b5cbeea0218def6e0033c3de08e1ff89ba64920 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Fri, 9 May 2025 14:49:56 +0200 Subject: [PATCH 19/23] more clean-up --- xarray/core/indexing.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index e1a68498308..bb750a50792 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -1937,8 +1937,8 @@ def __array__( *, copy: bool | None = None, ) -> np.ndarray: - if dtype is None: - dtype = cast(np.dtype, 
self.dtype) + dtype = self._get_numpy_dtype(dtype) + if self.level is not None: return np.asarray( self.array.get_level_values(self.level).values, dtype=dtype @@ -2042,10 +2042,7 @@ def __array__( *, copy: bool | None = None, ) -> np.ndarray: - if dtype is None and is_valid_numpy_dtype(self.dtype): - dtype = cast(np.dtype, self.dtype) - else: - dtype = get_valid_numpy_dtype(self.array) + dtype = self._get_numpy_dtype(dtype) return np.stack( [self.array.left, self.array.right], axis=self._bounds_axis, dtype=dtype @@ -2080,7 +2077,8 @@ def _index_get( if isinstance(result, pd.IntervalIndex): return type(self)(result, dtype=self.dtype) elif isinstance(result, pd.Interval): - return np.array([result.left, result.right], dtype=self._get_numpy_dtype()) + dtype = self._get_numpy_dtype() + return np.array([result.left, result.right], dtype=dtype) else: return self._convert_scalar(result) From fdc19435b844dc32eb6fa68e3c491860b90e1783 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Thu, 3 Jul 2025 14:23:11 +0200 Subject: [PATCH 20/23] rename IntervalIndex -> CFIntervalIndex --- xarray/indexes/__init__.py | 4 +-- ...interval_index.py => cf_interval_index.py} | 27 ++++++++++--------- 2 files changed, 16 insertions(+), 15 deletions(-) rename xarray/indexes/{interval_index.py => cf_interval_index.py} (93%) diff --git a/xarray/indexes/__init__.py b/xarray/indexes/__init__.py index 5430c6c5401..7be60b68cf6 100644 --- a/xarray/indexes/__init__.py +++ b/xarray/indexes/__init__.py @@ -8,7 +8,7 @@ PandasIndex, PandasMultiIndex, ) -from xarray.indexes.interval_index import IntervalIndex +from xarray.indexes.cf_interval_index import CFIntervalIndex from xarray.indexes.range_index import RangeIndex -__all__ = ["Index", "IntervalIndex", "PandasIndex", "PandasMultiIndex", "RangeIndex"] +__all__ = ["CFIntervalIndex", "Index", "PandasIndex", "PandasMultiIndex", "RangeIndex"] diff --git a/xarray/indexes/interval_index.py b/xarray/indexes/cf_interval_index.py similarity index 93% rename from 
xarray/indexes/interval_index.py rename to xarray/indexes/cf_interval_index.py index b76274a5b32..cfe6efe6b7c 100644 --- a/xarray/indexes/interval_index.py +++ b/xarray/indexes/cf_interval_index.py @@ -22,10 +22,11 @@ def check_mid_in_interval(mid_index: pd.Index, bounds_index: pd.IntervalIndex): raise ValueError("not all central values are in their corresponding interval") -class IntervalIndex(Index): - """Xarray index of 1-dimensional intervals. +class CFIntervalIndex(Index): + """Xarray index of CF-like 1-dimensional intervals. - This index is associated with two coordinate variables: + This index is associated with two coordinate variables like in the Climate + and Forecast (CF) conventions: - a 1-dimensional coordinate where each label represents an interval that is materialized by a central value (commonly the average of its left and right @@ -81,7 +82,7 @@ def from_variables( if mid_var is None or bounds_var is None: raise ValueError( - "invalid coordinates given to IntervalIndex. When two coordinates are given, " + "invalid coordinates given to CFIntervalIndex. When two coordinates are given, " "one must be 1-dimensional (central values) and the other must be " "2-dimensional (boundaries). 
Actual coordinate variables:\n" + "\n".join(str(var) for var in variables.values()) @@ -96,12 +97,12 @@ def from_variables( "dimension names mismatch between " f"the central coordinate {mid_name!r} {mid_var.dims!r} and " f"the boundary coordinate {bounds_name!r} {bounds_var.dims!r} " - "given to IntervalIndex" + "given to CFIntervalIndex" ) if bounds_var.sizes[bounds_dim] != 2: raise ValueError( - "invalid shape for the boundary coordinate given to IntervalIndex " + "invalid shape for the boundary coordinate given to CFIntervalIndex " f"(expected dimension {bounds_dim!r} of size 2)" ) @@ -125,20 +126,20 @@ def from_variables( # - look after the CF `bounds` attribute # - guess bounds like cf_xarray's add_bounds raise ValueError( - "Setting an IntervalIndex from one coordinate is not yet supported" + "Setting a CFIntervalIndex from one coordinate is not yet supported" ) else: - raise ValueError("Too many coordinate variables given to IntervalIndex") + raise ValueError("Too many coordinate variables given to CFIntervalIndex") return cls(mid_index, bounds_index, bounds_dim=str(bounds_dim)) @classmethod def concat( cls, - indexes: Sequence[IntervalIndex], + indexes: Sequence[CFIntervalIndex], dim: Hashable, positions: Iterable[Iterable[int]] | None = None, - ) -> IntervalIndex: + ) -> CFIntervalIndex: new_mid_index = PandasIndex.concat( [idx.mid_index for idx in indexes], dim, positions=positions ) @@ -216,7 +217,7 @@ def should_add_coord_to_array( return self.dim in dims def equals(self, other: Index) -> bool: - if not isinstance(other, IntervalIndex): + if not isinstance(other, CFIntervalIndex): return False return self.mid_index.equals(other.mid_index) and self.bounds_index.equals( @@ -255,7 +256,7 @@ def sel(self, labels: dict[Any, Any], **kwargs) -> IndexSelResult: bounds_coord_name = self.bounds_index.index.name if bounds_coord_name in labels: raise ValueError( - "IntervalIndex doesn't support label-based selection " + "CFIntervalIndex doesn't support label-based 
selection " f"using the boundary coordinate {bounds_coord_name!r}" ) @@ -303,7 +304,7 @@ def rename( return type(self)(new_mid_index, new_bounds_index, str(bounds_dim)) def __repr__(self) -> str: - text = "IntervalIndex\n" + text = "CFIntervalIndex\n" text += f"- central values:\n{self.mid_index!r}\n" text += f"- boundaries:\n{self.bounds_index!r}\n" return text From edfa435ebac528ddf2ef38648dfa969b6f220c6a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 3 Jul 2025 12:25:09 +0000 Subject: [PATCH 21/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/indexes/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/xarray/indexes/__init__.py b/xarray/indexes/__init__.py index 75803a47570..c9147d9dfc2 100644 --- a/xarray/indexes/__init__.py +++ b/xarray/indexes/__init__.py @@ -13,7 +13,6 @@ from xarray.indexes.cf_interval_index import CFIntervalIndex from xarray.indexes.range_index import RangeIndex - __all__ = [ "CFIntervalIndex", "CoordinateTransform", @@ -22,4 +21,4 @@ "PandasIndex", "PandasMultiIndex", "RangeIndex", -] \ No newline at end of file +] From bc20226dc5d36b02888fca8e9efb976967730cf6 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Thu, 3 Jul 2025 14:27:56 +0200 Subject: [PATCH 22/23] fix circular import --- xarray/indexes/cf_interval_index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/indexes/cf_interval_index.py b/xarray/indexes/cf_interval_index.py index cfe6efe6b7c..48c8e208b93 100644 --- a/xarray/indexes/cf_interval_index.py +++ b/xarray/indexes/cf_interval_index.py @@ -6,10 +6,10 @@ import numpy as np import pandas as pd -from xarray import Variable from xarray.core.indexes import Index, PandasIndex from xarray.core.indexing import IndexSelResult, PandasIntervalIndexingAdapter from xarray.core.utils import is_full_slice +from xarray.core.variable import Variable 
if TYPE_CHECKING: from xarray.core.types import Self From 4cabb7c3fa0e7531bea95c60f7dae6006967c9d6 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Tue, 8 Jul 2025 09:43:42 -0700 Subject: [PATCH 23/23] Fix bad merge --- xarray/core/indexing.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 5f9fe61f62a..9df8917075c 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -1963,6 +1963,14 @@ def __array__( else: return super().__array__(dtype, copy=copy) + @property + def _in_memory(self) -> bool: + # The pd.MultiIndex's data is fully in memory, but it has a different + # layout than the level and dimension coordinate arrays. Marking this + # adapter class as a "lazy" array will prevent costly conversion when, + # e.g., formatting the Xarray reprs. + return False + def _convert_scalar(self, item: Any): if isinstance(item, tuple) and self.level is not None: idx = tuple(self.array.names).index(self.level)