Skip to content

Commit b5bc0dd

Browse files
alexeykudinkin and Dhakshin Suriakannu
authored and committed
[Data] Adding more ops to BlockColumnAccessor (ray-project#51571)
## Why are these changes needed? 1. Adding more ops to `BlockColumnAccessor` 2. Fixing circular imports in Ray Data 3. Fixing `AggregateFnV2` to be a proper ABC 4. Simplifying the `accumulate_block` op --------- Signed-off-by: Alexey Kudinkin <ak@anyscale.com> Signed-off-by: Dhakshin Suriakannu <d_suriakannu@apple.com>
1 parent 123cec3 commit b5bc0dd

File tree

8 files changed

+98
-113
lines changed

8 files changed

+98
-113
lines changed

python/ray/air/util/tensor_extensions/arrow.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,13 @@ def _convert_to_pyarrow_native_array(
204204
if len(column_values) > 0 and isinstance(column_values[0], datetime):
205205
column_values = _convert_datetime_to_np_datetime(column_values)
206206

207+
# To avoid deserialization penalty of converting Arrow arrays (`Array` and `ChunkedArray`)
208+
# to Python objects and then back to Arrow, we instead combine them into ListArray manually
209+
if len(column_values) > 0 and isinstance(
210+
column_values[0], (pa.Array, pa.ChunkedArray)
211+
):
212+
return _combine_as_list_array(column_values)
213+
207214
# NOTE: We explicitly infer PyArrow `DataType` so that
208215
# we can perform upcasting to be able to accommodate
209216
# blocks that are larger than 2Gb in size (limited
@@ -238,6 +245,27 @@ def _convert_to_pyarrow_native_array(
238245
raise ArrowConversionError(str(column_values)) from e
239246

240247

248+
def _combine_as_list_array(column_values: List[Union[pa.Array, pa.ChunkedArray]]):
249+
"""Combines list of Arrow arrays into a single `ListArray`"""
250+
251+
# First, compute respective offsets in the resulting array
252+
lens = [len(v) for v in column_values]
253+
offsets = pa.array(np.concatenate([[0], np.cumsum(lens)]), type=pa.int32())
254+
255+
# Concat all the chunks into a single contiguous array
256+
combined = pa.concat_arrays(
257+
itertools.chain(
258+
*[
259+
v.chunks if isinstance(v, pa.ChunkedArray) else [v]
260+
for v in column_values
261+
]
262+
)
263+
)
264+
265+
# TODO support null masking
266+
return pa.ListArray.from_arrays(offsets, combined, pa.list_(combined.type))
267+
268+
241269
def _coerce_np_datetime_to_pa_timestamp_precision(
242270
column_values: np.ndarray, dtype: pa.TimestampType, column_name: str
243271
):

python/ray/data/BUILD

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -268,20 +268,6 @@ py_test(
268268
],
269269
)
270270

271-
py_test(
272-
name = "test_aggregate",
273-
size = "small",
274-
srcs = ["tests/test_aggregate.py"],
275-
tags = [
276-
"exclusive",
277-
"team:data",
278-
],
279-
deps = [
280-
":conftest",
281-
"//:ray_lib",
282-
],
283-
)
284-
285271
py_test(
286272
name = "test_avro",
287273
size = "small",

python/ray/data/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from packaging.version import parse as parse_version
55

66
from ray._private.arrow_utils import get_pyarrow_version
7+
78
from ray.data._internal.compute import ActorPoolStrategy
89
from ray.data._internal.datasource.tfrecords_datasource import TFXReadOptions
910
from ray.data._internal.execution.interfaces import (

python/ray/data/_internal/arrow_block.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
BlockType,
3535
U,
3636
BlockColumnAccessor,
37+
BlockColumn,
3738
)
3839
from ray.data.context import DataContext
3940

@@ -427,5 +428,28 @@ def sum_of_squared_diffs_from_mean(
427428
)
428429
return res.as_py() if as_py else res
429430

430-
def to_pylist(self):
431+
def quantile(
432+
self, *, q: float, ignore_nulls: bool, as_py: bool = True
433+
) -> Optional[U]:
434+
import pyarrow.compute as pac
435+
436+
array = pac.quantile(self._column, q=q, skip_nulls=ignore_nulls)
437+
# NOTE: That quantile method still returns an array
438+
res = array[0]
439+
return res.as_py() if as_py else res
440+
441+
def unique(self) -> BlockColumn:
442+
import pyarrow.compute as pac
443+
444+
return pac.unique(self._column)
445+
446+
def flatten(self) -> BlockColumn:
447+
import pyarrow.compute as pac
448+
449+
return pac.list_flatten(self._column)
450+
451+
def to_pylist(self) -> List[Any]:
431452
return self._column.to_pylist()
453+
454+
def _as_arrow_compatible(self) -> Union[List[Any], "pyarrow.Array"]:
455+
return self._column

python/ray/data/_internal/pandas_block.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
BlockType,
3030
U,
3131
BlockColumnAccessor,
32+
BlockColumn,
3233
)
3334
from ray.data.context import DataContext
3435

@@ -150,6 +151,18 @@ def mean(self, *, ignore_nulls: bool, as_py: bool = True) -> Optional[U]:
150151
sum_ / self.count(ignore_nulls=ignore_nulls) if not is_null(sum_) else sum_
151152
)
152153

154+
def quantile(
155+
self, *, q: float, ignore_nulls: bool, as_py: bool = True
156+
) -> Optional[U]:
157+
return self._column.quantile(q=q)
158+
159+
def unique(self) -> BlockColumn:
160+
pd = lazy_import_pandas()
161+
return pd.Series(self._column.unique())
162+
163+
def flatten(self) -> BlockColumn:
164+
return self._column.list.flatten()
165+
153166
def sum_of_squared_diffs_from_mean(
154167
self,
155168
ignore_nulls: bool,
@@ -164,9 +177,12 @@ def sum_of_squared_diffs_from_mean(
164177

165178
return ((self._column - mean) ** 2).sum(skipna=ignore_nulls)
166179

167-
def to_pylist(self):
180+
def to_pylist(self) -> List[Any]:
168181
return self._column.to_list()
169182

183+
def _as_arrow_compatible(self) -> Union[List[Any], "pyarrow.Array"]:
184+
return self.to_pylist()
185+
170186
def _is_all_null(self):
171187
return not self._column.notna().any()
172188

python/ray/data/aggregate.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,9 @@
11
import abc
22
import math
3-
from typing import TYPE_CHECKING, Any, Callable, List, Optional
3+
from typing import TYPE_CHECKING, Callable, List, Optional, Any
44

55
import numpy as np
66

7-
from ray.data._internal.planner.exchange.sort_task_spec import SortKey
87
from ray.data._internal.util import is_null
98
from ray.data.block import AggType, Block, BlockAccessor, KeyType, T, U
109
from ray.util.annotations import PublicAPI, Deprecated
@@ -105,7 +104,7 @@ def _validate(self, schema: Optional["Schema"]) -> None:
105104

106105

107106
@PublicAPI(stability="alpha")
108-
class AggregateFnV2(AggregateFn):
107+
class AggregateFnV2(AggregateFn, abc.ABC):
109108
"""Provides an interface to implement efficient aggregations to be applied
110109
to the dataset.
111110
@@ -148,9 +147,7 @@ def __init__(
148147
name=name,
149148
init=_safe_zero_factory,
150149
merge=_safe_combine,
151-
accumulate_block=(
152-
lambda acc, block: _safe_combine(acc, _safe_aggregate(block))
153-
),
150+
accumulate_block=lambda _, block: _safe_aggregate(block),
154151
finalize=_safe_finalize,
155152
)
156153

@@ -177,6 +174,8 @@ def _finalize(self, accumulator: AggType) -> Optional[U]:
177174

178175
def _validate(self, schema: Optional["Schema"]) -> None:
179176
if self._target_col_name:
177+
from ray.data._internal.planner.exchange.sort_task_spec import SortKey
178+
180179
SortKey(self._target_col_name).validate_schema(schema)
181180

182181

@@ -540,12 +539,13 @@ class Unique(AggregateFnV2):
540539
def __init__(
541540
self,
542541
on: Optional[str] = None,
542+
ignore_nulls: bool = True,
543543
alias_name: Optional[str] = None,
544544
):
545545
super().__init__(
546546
alias_name if alias_name else f"unique({str(on)})",
547547
on=on,
548-
ignore_nulls=False,
548+
ignore_nulls=ignore_nulls,
549549
zero_factory=set,
550550
)
551551

python/ray/data/block.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -573,6 +573,21 @@ def mean(self, *, ignore_nulls: bool, as_py: bool = True) -> Optional[U]:
573573
"""Returns a mean of the values in the column"""
574574
raise NotImplementedError()
575575

576+
def quantile(
577+
self, *, q: float, ignore_nulls: bool, as_py: bool = True
578+
) -> Optional[U]:
579+
"""Returns requested quantile of the given column"""
580+
raise NotImplementedError()
581+
582+
def unique(self) -> BlockColumn:
583+
"""Returns new column holding only distinct values of the current one"""
584+
raise NotImplementedError()
585+
586+
def flatten(self) -> BlockColumn:
587+
"""Flattens nested lists merging them into top-level container"""
588+
589+
raise NotImplementedError()
590+
576591
def sum_of_squared_diffs_from_mean(
577592
self,
578593
*,
@@ -583,10 +598,14 @@ def sum_of_squared_diffs_from_mean(
583598
"""Returns a sum of diffs (from mean) squared for the column"""
584599
raise NotImplementedError()
585600

586-
def to_pylist(self):
601+
def to_pylist(self) -> List[Any]:
587602
"""Converts block column to a list of Python native objects"""
588603
raise NotImplementedError()
589604

605+
def _as_arrow_compatible(self) -> Union[List[Any], "pyarrow.Array"]:
606+
"""Converts block column into a representation compatible with Arrow"""
607+
raise NotImplementedError()
608+
590609
@staticmethod
591610
def for_column(col: BlockColumn) -> "BlockColumnAccessor":
592611
"""Create a column accessor for the given column"""

python/ray/data/tests/test_aggregate.py

Lines changed: 0 additions & 89 deletions
This file was deleted.

0 commit comments

Comments (0)