4
4
import pickle
5
5
import warnings
6
6
from enum import Enum
7
- from typing import (
8
- TYPE_CHECKING ,
9
- Any ,
10
- Dict ,
11
- Union ,
12
- List ,
13
- Optional ,
14
- Literal ,
15
- )
7
+ from typing import TYPE_CHECKING , Any , Dict , Union , List , Optional
16
8
17
9
from ray .air .util .data_batch_conversion import BatchFormat
18
10
from ray .util .annotations import DeveloperAPI , PublicAPI
@@ -132,7 +124,7 @@ def fit_transform(
132
124
* ,
133
125
transform_num_cpus : Optional [float ] = None ,
134
126
transform_memory : Optional [float ] = None ,
135
- transform_batch_size : Union [int , None , Literal [ "default" ] ] = None ,
127
+ transform_batch_size : Optional [int ] = None ,
136
128
transform_concurrency : Optional [int ] = None ,
137
129
) -> "Dataset" :
138
130
"""Fit this Preprocessor to the Dataset and then transform the Dataset.
@@ -143,10 +135,10 @@ def fit_transform(
143
135
144
136
Args:
145
137
ds: Input Dataset.
146
- transform_num_cpus: The number of CPUs to reserve for each parallel map worker.
147
- transform_memory: The heap memory in bytes to reserve for each parallel map worker.
148
- transform_batch_size: The maximum number of rows to return.
149
- transform_concurrency: The maximum number of Ray workers to use concurrently.
138
+ transform_num_cpus: [experimental] The number of CPUs to reserve for each parallel map worker.
139
+ transform_memory: [experimental] The heap memory in bytes to reserve for each parallel map worker.
140
+ transform_batch_size: [experimental] The maximum number of rows to return.
141
+ transform_concurrency: [experimental] The maximum number of Ray workers to use concurrently.
150
142
151
143
Returns:
152
144
ray.data.Dataset: The transformed Dataset.
@@ -164,19 +156,19 @@ def transform(
164
156
self ,
165
157
ds : "Dataset" ,
166
158
* ,
159
+ batch_size : Optional [int ] = None ,
167
160
num_cpus : Optional [float ] = None ,
168
161
memory : Optional [float ] = None ,
169
- batch_size : Union [int , None , Literal ["default" ]] = None ,
170
162
concurrency : Optional [int ] = None ,
171
163
) -> "Dataset" :
172
164
"""Transform the given dataset.
173
165
174
166
Args:
175
167
ds: Input Dataset.
176
- num_cpus: The number of CPUs to reserve for each parallel map worker.
177
- memory: The heap memory in bytes to reserve for each parallel map worker.
178
- batch_size: The maximum number of rows to return .
179
- concurrency: The maximum number of Ray workers to use concurrently.
168
+ batch_size: [experimental] Advanced configuration for adjusting input size for each worker.
169
+ num_cpus: [experimental] The number of CPUs to reserve for each parallel map worker.
170
+ memory: [experimental] The heap memory in bytes to reserve for each parallel map worker .
171
+ concurrency: [experimental] The maximum number of Ray workers to use concurrently.
180
172
181
173
Returns:
182
174
ray.data.Dataset: The transformed Dataset.
@@ -193,7 +185,13 @@ def transform(
193
185
"`fit` must be called before `transform`, "
194
186
"or simply use fit_transform() to run both steps"
195
187
)
196
- transformed_ds = self ._transform (ds , num_cpus , memory , batch_size , concurrency )
188
+ transformed_ds = self ._transform (
189
+ ds ,
190
+ batch_size = batch_size ,
191
+ num_cpus = num_cpus ,
192
+ memory = memory ,
193
+ concurrency = concurrency ,
194
+ )
197
195
return transformed_ds
198
196
199
197
def transform_batch (self , data : "DataBatchType" ) -> "DataBatchType" :
@@ -258,9 +256,9 @@ def _determine_transform_to_use(self) -> BatchFormat:
258
256
def _transform (
259
257
self ,
260
258
ds : "Dataset" ,
259
+ batch_size : Optional [int ],
261
260
num_cpus : Optional [float ] = None ,
262
261
memory : Optional [float ] = None ,
263
- batch_size : Union [int , None , Literal ["default" ]] = None ,
264
262
concurrency : Optional [int ] = None ,
265
263
) -> "Dataset" :
266
264
transform_type = self ._determine_transform_to_use ()
0 commit comments