Skip to content

Commit f0dcb1d

Browse files
committed
[data] update the arguments based on recent comment
Signed-off-by: Xingyu Long <xingyulong97@gmail.com>
1 parent 7caf5af commit f0dcb1d

File tree

2 files changed

+20
-22
lines changed

2 files changed

+20
-22
lines changed

python/ray/data/preprocessor.py

+19-21
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,7 @@
44
import pickle
55
import warnings
66
from enum import Enum
7-
from typing import (
8-
TYPE_CHECKING,
9-
Any,
10-
Dict,
11-
Union,
12-
List,
13-
Optional,
14-
Literal,
15-
)
7+
from typing import TYPE_CHECKING, Any, Dict, Union, List, Optional
168

179
from ray.air.util.data_batch_conversion import BatchFormat
1810
from ray.util.annotations import DeveloperAPI, PublicAPI
@@ -132,7 +124,7 @@ def fit_transform(
132124
*,
133125
transform_num_cpus: Optional[float] = None,
134126
transform_memory: Optional[float] = None,
135-
transform_batch_size: Union[int, None, Literal["default"]] = None,
127+
transform_batch_size: Optional[int] = None,
136128
transform_concurrency: Optional[int] = None,
137129
) -> "Dataset":
138130
"""Fit this Preprocessor to the Dataset and then transform the Dataset.
@@ -143,10 +135,10 @@ def fit_transform(
143135
144136
Args:
145137
ds: Input Dataset.
146-
transform_num_cpus: The number of CPUs to reserve for each parallel map worker.
147-
transform_memory: The heap memory in bytes to reserve for each parallel map worker.
148-
transform_batch_size: The maximum number of rows to return.
149-
transform_concurrency: The maximum number of Ray workers to use concurrently.
138+
transform_num_cpus: [experimental] The number of CPUs to reserve for each parallel map worker.
139+
transform_memory: [experimental] The heap memory in bytes to reserve for each parallel map worker.
140+
transform_batch_size: [experimental] Advanced configuration for adjusting input size for each worker.
141+
transform_concurrency: [experimental] The maximum number of Ray workers to use concurrently.
150142
151143
Returns:
152144
ray.data.Dataset: The transformed Dataset.
@@ -164,19 +156,19 @@ def transform(
164156
self,
165157
ds: "Dataset",
166158
*,
159+
batch_size: Optional[int] = None,
167160
num_cpus: Optional[float] = None,
168161
memory: Optional[float] = None,
169-
batch_size: Union[int, None, Literal["default"]] = None,
170162
concurrency: Optional[int] = None,
171163
) -> "Dataset":
172164
"""Transform the given dataset.
173165
174166
Args:
175167
ds: Input Dataset.
176-
num_cpus: The number of CPUs to reserve for each parallel map worker.
177-
memory: The heap memory in bytes to reserve for each parallel map worker.
178-
batch_size: The maximum number of rows to return.
179-
concurrency: The maximum number of Ray workers to use concurrently.
168+
batch_size: [experimental] Advanced configuration for adjusting input size for each worker.
169+
num_cpus: [experimental] The number of CPUs to reserve for each parallel map worker.
170+
memory: [experimental] The heap memory in bytes to reserve for each parallel map worker.
171+
concurrency: [experimental] The maximum number of Ray workers to use concurrently.
180172
181173
Returns:
182174
ray.data.Dataset: The transformed Dataset.
@@ -193,7 +185,13 @@ def transform(
193185
"`fit` must be called before `transform`, "
194186
"or simply use fit_transform() to run both steps"
195187
)
196-
transformed_ds = self._transform(ds, num_cpus, memory, batch_size, concurrency)
188+
transformed_ds = self._transform(
189+
ds,
190+
batch_size=batch_size,
191+
num_cpus=num_cpus,
192+
memory=memory,
193+
concurrency=concurrency,
194+
)
197195
return transformed_ds
198196

199197
def transform_batch(self, data: "DataBatchType") -> "DataBatchType":
@@ -258,9 +256,9 @@ def _determine_transform_to_use(self) -> BatchFormat:
258256
def _transform(
259257
self,
260258
ds: "Dataset",
259+
batch_size: Optional[int],
261260
num_cpus: Optional[float] = None,
262261
memory: Optional[float] = None,
263-
batch_size: Union[int, None, Literal["default"]] = None,
264262
concurrency: Optional[int] = None,
265263
) -> "Dataset":
266264
transform_type = self._determine_transform_to_use()

python/ray/data/preprocessors/tokenizer.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import Callable, List, Optional, Literal, Union
1+
from typing import Callable, List, Optional
22

33
import pandas as pd
44

0 commit comments

Comments
 (0)