
Commit 7caf5af

[data] move parameters to transform, fit_transform func
Signed-off-by: Xingyu Long <xingyulong97@gmail.com>
1 parent: d52a9fa

File tree (3 files changed: 63 additions, 86 deletions)

python/ray/data/preprocessor.py
python/ray/data/preprocessors/tokenizer.py
python/ray/data/tests/preprocessors/test_preprocessors.py


python/ray/data/preprocessor.py

Lines changed: 50 additions & 32 deletions
@@ -11,9 +11,7 @@
     Union,
     List,
     Optional,
-    Callable,
     Literal,
-    Tuple,
 )
 
 from ray.air.util.data_batch_conversion import BatchFormat
@@ -57,25 +55,6 @@ class Preprocessor(abc.ABC):
     implemented method.
     """
 
-    def __init__(
-        self,
-        num_cpus: Optional[float] = None,
-        memory: Optional[float] = None,
-        batch_size: Union[int, None, Literal["default"]] = None,
-        concurrency: Optional[int] = None,
-    ):
-        """
-        Args:
-            num_cpus: The number of CPUs to reserve for each parallel map worker.
-            memory: The heap memory in bytes to reserve for each parallel map worker.
-            batch_size: The maximum number of rows to return.
-            concurrency: The maximum number of Ray workers to use concurrently.
-        """
-        self._num_cpus = num_cpus
-        self._memory = memory
-        self._batch_size = batch_size
-        self._concurrency = concurrency
-
     class FitStatus(str, Enum):
         """The fit status of preprocessor."""
 
@@ -147,7 +126,15 @@ def fit(self, ds: "Dataset") -> "Preprocessor":
         self._fitted = True
         return fitted_ds
 
-    def fit_transform(self, ds: "Dataset") -> "Dataset":
+    def fit_transform(
+        self,
+        ds: "Dataset",
+        *,
+        transform_num_cpus: Optional[float] = None,
+        transform_memory: Optional[float] = None,
+        transform_batch_size: Union[int, None, Literal["default"]] = None,
+        transform_concurrency: Optional[int] = None,
+    ) -> "Dataset":
         """Fit this Preprocessor to the Dataset and then transform the Dataset.
 
         Calling it more than once will overwrite all previously fitted state:
@@ -156,18 +143,40 @@ def fit_transform(self, ds: "Dataset") -> "Dataset":
 
         Args:
             ds: Input Dataset.
+            transform_num_cpus: The number of CPUs to reserve for each parallel map worker.
+            transform_memory: The heap memory in bytes to reserve for each parallel map worker.
+            transform_batch_size: The maximum number of rows to return.
+            transform_concurrency: The maximum number of Ray workers to use concurrently.
 
         Returns:
             ray.data.Dataset: The transformed Dataset.
         """
         self.fit(ds)
-        return self.transform(ds)
+        return self.transform(
+            ds,
+            num_cpus=transform_num_cpus,
+            memory=transform_memory,
+            batch_size=transform_batch_size,
+            concurrency=transform_concurrency,
+        )
 
-    def transform(self, ds: "Dataset") -> "Dataset":
+    def transform(
+        self,
+        ds: "Dataset",
+        *,
+        num_cpus: Optional[float] = None,
+        memory: Optional[float] = None,
+        batch_size: Union[int, None, Literal["default"]] = None,
+        concurrency: Optional[int] = None,
+    ) -> "Dataset":
         """Transform the given dataset.
 
         Args:
             ds: Input Dataset.
+            num_cpus: The number of CPUs to reserve for each parallel map worker.
+            memory: The heap memory in bytes to reserve for each parallel map worker.
+            batch_size: The maximum number of rows to return.
+            concurrency: The maximum number of Ray workers to use concurrently.
 
         Returns:
             ray.data.Dataset: The transformed Dataset.
@@ -184,7 +193,7 @@ def transform(self, ds: "Dataset") -> "Dataset":
                 "`fit` must be called before `transform`, "
                 "or simply use fit_transform() to run both steps"
             )
-        transformed_ds = self._transform(ds)
+        transformed_ds = self._transform(ds, num_cpus, memory, batch_size, concurrency)
         return transformed_ds
 
     def transform_batch(self, data: "DataBatchType") -> "DataBatchType":
@@ -246,18 +255,27 @@ def _determine_transform_to_use(self) -> BatchFormat:
                 "for Preprocessor transforms."
             )
 
-    def _transform(self, ds: "Dataset") -> "Dataset":
-        # TODO(matt): Expose `batch_size` or similar configurability.
-        # The default may be too small for some datasets and too large for others.
+    def _transform(
+        self,
+        ds: "Dataset",
+        num_cpus: Optional[float] = None,
+        memory: Optional[float] = None,
+        batch_size: Union[int, None, Literal["default"]] = None,
+        concurrency: Optional[int] = None,
+    ) -> "Dataset":
         transform_type = self._determine_transform_to_use()
 
         # Our user-facing batch format should only be pandas or NumPy, other
         # formats {arrow, simple} are internal.
         kwargs = self._get_transform_config()
-        kwargs["num_cpus"] = self._num_cpus
-        kwargs["memory"] = self._memory
-        kwargs["batch_size"] = self._batch_size
-        kwargs["concurrency"] = self._concurrency
+        if num_cpus is not None:
+            kwargs["num_cpus"] = num_cpus
+        if memory is not None:
+            kwargs["memory"] = memory
+        if batch_size is not None:
+            kwargs["batch_size"] = batch_size
+        if concurrency is not None:
+            kwargs["concurrency"] = concurrency
 
         if transform_type == BatchFormat.PANDAS:
             return ds.map_batches(
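
The net effect of this diff is that per-call resource and batching options replace the constructor state. A minimal usage sketch of the new call pattern; the MinMaxScaler preprocessor, dataset contents, and option values below are illustrative assumptions, not part of this commit:

# Sketch of the new per-call API (illustrative preprocessor, data, and values).
import ray
from ray.data.preprocessors import MinMaxScaler

ds = ray.data.from_items([{"value": float(i)} for i in range(100)])

# Resource options are no longer passed to the constructor.
scaler = MinMaxScaler(columns=["value"])

# fit_transform forwards the transform_* options to transform().
scaled = scaler.fit_transform(
    ds,
    transform_num_cpus=1,                # CPUs reserved per map worker
    transform_memory=512 * 1024 * 1024,  # heap memory per map worker, in bytes
    transform_batch_size=1024,           # max rows per batch
    transform_concurrency=2,             # max concurrent map workers
)

# Or fit and transform separately, using the un-prefixed names on transform().
scaler.fit(ds)
scaled = scaler.transform(ds, num_cpus=1, concurrency=2)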

python/ray/data/preprocessors/tokenizer.py

Lines changed: 0 additions & 15 deletions
@@ -59,10 +59,6 @@ class Tokenizer(Preprocessor):
             columns will be the same as the input columns. If not None, the length of
             ``output_columns`` must match the length of ``columns``, othwerwise an error
             will be raised.
-        num_cpus: The number of CPUs to reserve for each parallel map worker.
-        memory: The heap memory in bytes to reserve for each parallel map worker.
-        batch_size: The maximum number of rows to return.
-        concurrency: The maximum number of Ray workers to use concurrently.
     """
 
     _is_fittable = False
@@ -72,18 +68,7 @@ def __init__(
         columns: List[str],
         tokenization_fn: Optional[Callable[[str], List[str]]] = None,
        output_columns: Optional[List[str]] = None,
-        *,
-        num_cpus: Optional[float] = None,
-        memory: Optional[float] = None,
-        batch_size: Union[int, None, Literal["default"]] = None,
-        concurrency: Optional[int] = None,
     ):
-        super().__init__(
-            num_cpus=num_cpus,
-            memory=memory,
-            batch_size=batch_size,
-            concurrency=concurrency,
-        )
         self.columns = columns
         # TODO(matt): Add a more robust default tokenizer.
         self.tokenization_fn = tokenization_fn or simple_split_tokenizer
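
With the constructor kwargs gone, Tokenizer is built from its column arguments alone and any resource options go to transform(). A short sketch under those assumptions; the dataset contents and option values are illustrative:

# Sketch of Tokenizer usage after this commit (toy data, illustrative values).
import ray
from ray.data.preprocessors import Tokenizer

ds = ray.data.from_items([{"text": "hello ray data"}, {"text": "tokenize me"}])

tokenizer = Tokenizer(columns=["text"])          # resource kwargs removed from __init__
tokenized = tokenizer.transform(ds, num_cpus=1)  # pass them per transform() call instead
print(tokenized.take(2))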

python/ray/data/tests/preprocessors/test_preprocessors.py

Lines changed: 13 additions & 39 deletions
@@ -165,8 +165,11 @@ def test_fit_twice(mocked_warn):
     mocked_warn.assert_called_once_with(msg)
 
 
-def test_initialization_parameters():
+def test_transform_all_configs():
     batch_size = 2
+    num_cpus = 2
+    concurrency = 2
+    memory = 1024
 
     class DummyPreprocessor(Preprocessor):
         _is_fittable = False
@@ -175,56 +178,27 @@ def _get_transform_config(self):
             return {"batch_size": batch_size}
 
         def _transform_numpy(self, data):
+            assert ray.get_runtime_context().get_assigned_resources()["CPU"] == num_cpus
             assert (
-                ray.get_runtime_context().get_assigned_resources()["CPU"]
-                == self._num_cpus
+                ray.get_runtime_context().get_assigned_resources()["memory"] == memory
             )
             assert len(data["value"]) == batch_size
             return data
 
         def _determine_transform_to_use(self):
             return "numpy"
 
-    prep = DummyPreprocessor(
-        num_cpus=2,
-        concurrency=2,
-        batch_size=batch_size,
-    )
+    prep = DummyPreprocessor()
     ds = ray.data.from_pandas(pd.DataFrame({"value": list(range(10))}))
-    ds = prep.transform(ds)
-
+    ds = prep.transform(
+        ds,
+        num_cpus=num_cpus,
+        memory=memory,
+        concurrency=concurrency,
+    )
     assert [x["value"] for x in ds.take(5)] == [0, 1, 2, 3, 4]
 
 
-def test_transform_config():
-    """Tests that the transform_config of
-    the Preprocessor is respected during transform."""
-
-    batch_size = 2
-
-    class DummyPreprocessor(Preprocessor):
-        _is_fittable = False
-
-        def _transform_numpy(self, data):
-            assert len(data["value"]) == batch_size
-            return data
-
-        def _transform_pandas(self, data):
-            raise RuntimeError(
-                "Pandas transform should not be called with numpy batch format."
-            )
-
-        def _get_transform_config(self):
-            return {"batch_size": 2}
-
-        def _determine_transform_to_use(self):
-            return "numpy"
-
-    prep = DummyPreprocessor()
-    ds = ray.data.from_pandas(pd.DataFrame({"value": list(range(4))}))
-    prep.transform(ds)
-
-
 @pytest.mark.parametrize("dataset_format", ["simple", "pandas", "arrow"])
 def test_transform_all_formats(create_dummy_preprocessors, dataset_format):
     (
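
Because the new _transform only writes a key into the _get_transform_config() kwargs when the corresponding argument is not None, an explicit per-call value overrides a preprocessor's built-in default, while omitted options leave the config untouched. A hypothetical sketch of that precedence; the Upper preprocessor below is invented for illustration:

# Hypothetical subclass illustrating override precedence in _transform():
# an explicit batch_size passed to transform() replaces the value from
# _get_transform_config(); omitted options keep the configured default.
import ray
from ray.data.preprocessor import Preprocessor

class Upper(Preprocessor):
    _is_fittable = False

    def _get_transform_config(self):
        return {"batch_size": 4096}  # this preprocessor's default batching

    def _transform_pandas(self, df):
        df["text"] = df["text"].str.upper()
        return df

ds = ray.data.from_items([{"text": "a"}, {"text": "b"}, {"text": "c"}])
out = Upper().transform(ds, batch_size=2)  # 2 overrides the 4096 default
print(out.take_all())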
