Skip to content

Clean-up indexing adapter classes #10355

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 27 commits into from
Jul 7, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
bc94c6d
clean-up indexing.PandasIndexingAdapter typing
benbovy May 9, 2025
17ff7e9
streamline PandasIndexingAdapter indexing logic
benbovy May 9, 2025
2b25155
clean-up PandasIndexingAdapter dtype handling
benbovy May 9, 2025
9981078
more clean-up
benbovy May 9, 2025
29098ac
repr: prevent loading lazy variables into memory
benbovy May 26, 2025
5f09354
fix array (index) subsetting
benbovy May 26, 2025
c4a853e
Merge branch 'main' into cleanup-pandas-indexing-adapter
benbovy Jul 4, 2025
0e5154c
treat multi-index and coord-transform variables as lazy
benbovy Jul 4, 2025
4efb135
update whats new
benbovy Jul 4, 2025
ef73a7e
add benchmarks for pandas and xarray RangeIndex
benbovy Jul 7, 2025
28b661a
Merge branch 'main' into cleanup-pandas-indexing-adapter
benbovy Jul 7, 2025
a2ccb7d
fix benchmark numba import error (numpy 2.3)
benbovy Jul 7, 2025
07f6cdb
benchmark: pin numpy in conf + consistent conda env
benbovy Jul 7, 2025
a953b41
pyproject: bump setuptools(-scm)
benbovy Jul 7, 2025
2be275d
ci benchmarks: try fixing package install
benbovy Jul 7, 2025
825cdb1
next try
benbovy Jul 7, 2025
c890a69
next try
benbovy Jul 7, 2025
16fe98b
next try
benbovy Jul 7, 2025
8ae12f7
benchmarks: try disabling no build isolation
benbovy Jul 7, 2025
f40f38c
Revert "benchmarks: try disabling no build isolation"
benbovy Jul 7, 2025
0ecc214
Revert "next try"
benbovy Jul 7, 2025
74e993c
Revert "next try"
benbovy Jul 7, 2025
3420fc9
Revert "next try"
benbovy Jul 7, 2025
97579f5
Revert "ci benchmarks: try fixing package install"
benbovy Jul 7, 2025
86df720
Revert "pyproject: bump setuptools(-scm)"
benbovy Jul 7, 2025
0887a8e
I'm tired of Python packaging
benbovy Jul 7, 2025
8a76b46
Let's fix all this later
benbovy Jul 7, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 6 additions & 7 deletions xarray/core/formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@
from xarray.core.duck_array_ops import array_all, array_any, array_equiv, astype, ravel
from xarray.core.extension_array import PandasExtensionArray
from xarray.core.indexing import (
CoordinateTransformIndexingAdapter,
BasicIndexer,
ExplicitlyIndexed,
MemoryCachedArray,
PandasIndexingAdapter,
)
from xarray.core.options import OPTIONS, _get_boolean_with_default
from xarray.core.treenode import group_subtrees
Expand Down Expand Up @@ -91,6 +91,8 @@ def first_n_items(array, n_desired):

if n_desired < array.size:
indexer = _get_indexer_at_least_n_items(array.shape, n_desired, from_end=False)
if isinstance(array, ExplicitlyIndexed):
indexer = BasicIndexer(indexer)
array = array[indexer]

# We pass variable objects in to handle indexing
Expand All @@ -115,6 +117,8 @@ def last_n_items(array, n_desired):

if n_desired < array.size:
indexer = _get_indexer_at_least_n_items(array.shape, n_desired, from_end=True)
if isinstance(array, ExplicitlyIndexed):
indexer = BasicIndexer(indexer)
array = array[indexer]

# We pass variable objects in to handle indexing
Expand Down Expand Up @@ -664,11 +668,6 @@ def short_data_repr(array):
"""Format "data" for DataArray and Variable."""
internal_data = getattr(array, "variable", array)._data

if isinstance(
internal_data, PandasIndexingAdapter | CoordinateTransformIndexingAdapter
):
array = internal_data._get_array_subset()

if isinstance(array, np.ndarray):
return short_array_repr(array)
elif is_duck_array(internal_data):
Expand Down
58 changes: 25 additions & 33 deletions xarray/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
from xarray.core import duck_array_ops
from xarray.core.coordinate_transform import CoordinateTransform
from xarray.core.nputils import NumpyVIndexAdapter
from xarray.core.options import OPTIONS
from xarray.core.types import T_Xarray
from xarray.core.utils import (
NDArrayMixin,
Expand Down Expand Up @@ -1774,6 +1773,12 @@ def __init__(
else:
self._dtype = np.dtype(cast(DTypeLike, dtype))

@property
def _in_memory(self) -> bool:
# Report the wrapped ``self.array`` as in-memory data unless it is a
# ``pd.RangeIndex`` (returns False only in that case).
# prevent costly conversion of a memory-saving pd.RangeIndex into a
# large numpy array.
return not isinstance(self.array, pd.RangeIndex)

@property
def dtype(self) -> np.dtype | pd.api.extensions.ExtensionDtype: # type: ignore[override]
# Return the dtype stored on the adapter as ``self._dtype``.
return self._dtype
Expand Down Expand Up @@ -1887,24 +1892,13 @@ def __getitem__(
def transpose(self, order) -> pd.Index:
# ``order`` is not used: the wrapped array is returned unchanged.
return self.array # self.array should be always one-dimensional

def _get_array_subset(self) -> np.ndarray:
# avoid converting a large pd.Index (especially pd.MultiIndex and pd.RangeIndex)
# into a numpy array for the array repr
threshold = max(100, OPTIONS["display_values_threshold"] + 2)
if self.size > threshold:
pos = threshold // 2
subset_start = (self[OuterIndexer((slice(pos),))],)
subset_end = (self[OuterIndexer((slice(-pos, None),))],)
return np.concatenate(
[np.asarray(subset_start), np.asarray(subset_end)], axis=-1
)
else:
return np.asarray(self)

def _repr_inline_(self, max_width: int) -> str:
# we want to display values in the inline repr for lazy coordinates too
# (pd.RangeIndex and pd.MultiIndex). `format_array_flat` prevents loading
# the whole array in memory.
from xarray.core.formatting import format_array_flat

return format_array_flat(self._get_array_subset(), max_width)
return format_array_flat(self, max_width)

def __repr__(self) -> str:
# Show the concrete class name together with the wrapped array and dtype.
return f"{type(self).__name__}(array={self.array!r}, dtype={self.dtype!r})"
Expand Down Expand Up @@ -1968,6 +1962,14 @@ def __array__(
else:
return super().__array__(dtype, copy=copy)

@property
def _in_memory(self) -> bool:
# Always report this adapter as not in memory (i.e. as a "lazy" array).
# The pd.MultiIndex's data is fully in memory, but it has a different
# layout than the level and dimension coordinate arrays. Marking this
# adapter class as a "lazy" array will prevent costly conversion when,
# e.g., formatting the Xarray reprs.
return False

def _convert_scalar(self, item: Any):
if isinstance(item, tuple) and self.level is not None:
idx = tuple(self.array.names).index(self.level)
Expand Down Expand Up @@ -2032,6 +2034,10 @@ def dtype(self) -> np.dtype:
def shape(self) -> tuple[int, ...]:
return tuple(self._transform.dim_size.values())

@property
def _in_memory(self) -> bool:
# Always report this adapter as not in memory.
# NOTE(review): presumably because the values are generated on demand from
# the coordinate transform rather than stored -- confirm.
return False

def get_duck_array(self) -> np.ndarray:
# Generate the transform's coordinates for ``self._dims`` and return the
# one named ``self._coord_name`` converted with ``np.asarray``.
all_coords = self._transform.generate_coords(dims=self._dims)
return np.asarray(all_coords[self._coord_name])
Expand Down Expand Up @@ -2092,23 +2098,9 @@ def transpose(self, order: Iterable[int]) -> Self:
def __repr__(self: Any) -> str:
# Show the concrete class name together with the wrapped transform.
return f"{type(self).__name__}(transform={self._transform!r})"

def _get_array_subset(self) -> np.ndarray:
threshold = max(100, OPTIONS["display_values_threshold"] + 2)
if self.size > threshold:
pos = threshold // 2
flat_indices = np.concatenate(
[np.arange(0, pos), np.arange(self.size - pos, self.size)]
)
subset = self.vindex[
VectorizedIndexer(np.unravel_index(flat_indices, self.shape))
]
else:
subset = self

return np.asarray(subset)

def _repr_inline_(self, max_width: int) -> str:
"""Good to see some labels even for a lazy coordinate."""
# we want to display values in the inline repr for this lazy coordinate
# `format_array_flat` prevents loading the whole array in memory.
from xarray.core.formatting import format_array_flat

return format_array_flat(self._get_array_subset(), max_width)
return format_array_flat(self, max_width)
10 changes: 8 additions & 2 deletions xarray/core/variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from xarray.core.extension_array import PandasExtensionArray
from xarray.core.indexing import (
BasicIndexer,
CoordinateTransformIndexingAdapter,
OuterIndexer,
PandasIndexingAdapter,
VectorizedIndexer,
Expand Down Expand Up @@ -403,10 +404,15 @@ def _new(
return cls_(dims_, data, attrs_)

@property
def _in_memory(self):
def _in_memory(self) -> bool:
if isinstance(
self._data, PandasIndexingAdapter | CoordinateTransformIndexingAdapter
):
return self._data._in_memory

return isinstance(
self._data,
np.ndarray | np.number | PandasIndexingAdapter | PandasExtensionArray,
np.ndarray | np.number | PandasExtensionArray,
) or (
isinstance(self._data, indexing.MemoryCachedArray)
and isinstance(self._data.array, indexing.NumpyIndexingAdapter)
Expand Down
11 changes: 11 additions & 0 deletions xarray/tests/test_coordinate_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,17 @@ def test_coordinate_transform_variable_repr_inline() -> None:
)


def test_coordinate_transform_variable_repr() -> None:
# The repr of a coordinate-transform variable is expected to show the
# "[4 values with dtype=float64]" placeholder instead of the values.
var = create_coords(scale=2.0, shape=(2, 2))["x"].variable

actual = repr(var)
expected = """
<xarray.Variable (y: 2, x: 2)> Size: 32B
[4 values with dtype=float64]
""".strip()
assert actual == expected


def test_coordinate_transform_variable_basic_outer_indexing() -> None:
var = create_coords(scale=2.0, shape=(4, 4))["x"].variable

Expand Down
43 changes: 43 additions & 0 deletions xarray/tests/test_formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -1189,3 +1189,46 @@ def test_array_repr_dtypes():
Dimensions without coordinates: x
""".strip()
assert actual == expected


def test_repr_pandas_range_index() -> None:
# lazy data repr but values shown in inline repr
# i.e. the data section is the "[10 values with dtype=int64]" placeholder
# while the "x" coordinate line still lists the values 0 to 9.
xidx = xr.indexes.PandasIndex(pd.RangeIndex(10), "x")
ds = xr.Dataset(coords=xr.Coordinates.from_xindex(xidx))
actual = repr(ds.x)
expected = """
<xarray.DataArray 'x' (x: 10)> Size: 80B
[10 values with dtype=int64]
Coordinates:
* x (x) int64 80B 0 1 2 3 4 5 6 7 8 9
""".strip()
assert actual == expected


def test_repr_pandas_multi_index() -> None:
# lazy data repr but values shown in inline repr
# i.e. both reprs show the "[4 values with dtype=object]" placeholder for
# the data, while the "foo" and "bar" coordinate lines still list values.
midx = pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=["foo", "bar"])
coords = xr.Coordinates.from_pandas_multiindex(midx, "x")
ds = xr.Dataset(coords=coords)

# repr of the "x" coordinate built from the multi-index
actual = repr(ds.x)
expected = """
<xarray.DataArray 'x' (x: 4)> Size: 32B
[4 values with dtype=object]
Coordinates:
* x (x) object 32B MultiIndex
* foo (x) object 32B 'a' 'a' 'b' 'b'
* bar (x) int64 32B 1 2 1 2
""".strip()
assert actual == expected

# repr of the "foo" coordinate (one of the multi-index level names)
actual = repr(ds.foo)
expected = """
<xarray.DataArray 'foo' (x: 4)> Size: 32B
[4 values with dtype=object]
Coordinates:
* x (x) object 32B MultiIndex
* foo (x) object 32B 'a' 'a' 'b' 'b'
* bar (x) int64 32B 1 2 1 2
""".strip()
assert actual == expected
Loading