|
6 | 6 | import traceback
|
7 | 7 | import h5py
|
8 | 8 | import numpy as np
|
9 |
| -from sklearn.preprocessing import RobustScaler |
| 9 | +from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler |
10 | 10 | import warnings
|
11 | 11 |
|
12 | 12 | warnings.filterwarnings("ignore")
|
@@ -228,26 +228,76 @@ def check_and_delete_corrupt_h5_file(file_path, logger):
|
228 | 228 | logger.info(f"File does not exist '{basename}'")
|
229 | 229 |
|
230 | 230 |
|
231 |
def standardize_features(x_train, x_test, scaler=RobustScaler, scaler_params=None):
    """
    Standardize the features in the training and test sets using the specified scaler.

    The scaler is fitted on the training set only and the fitted transformation
    is then applied to both the training and the test set. The scaler class can
    be one of `StandardScaler`, `RobustScaler`, or `MinMaxScaler`, and its
    constructor arguments can be customized through `scaler_params`.

    Parameters
    ----------
    x_train : array-like of shape (n_samples, n_features)
        Training set features.
    x_test : array-like of shape (n_samples, n_features)
        Test set features.
    scaler : {StandardScaler, RobustScaler, MinMaxScaler}, optional, default=RobustScaler
        The scaling class (not an instance) to be used for standardization. Choose from:
        - StandardScaler: Standardize features by removing the mean and scaling to unit variance.
        - RobustScaler: Scale features using statistics that are robust to outliers.
        - MinMaxScaler: Scale features to a given range (usually between 0 and 1).
    scaler_params : dict or None, optional, default=None
        Keyword arguments passed to the selected scaler's constructor, e.g.
        ``{'with_mean': False}`` for `StandardScaler`. ``None`` (the default)
        and an empty dict both mean "use the scaler's own defaults".

    Returns
    -------
    x_train : array-like of shape (n_samples, n_features)
        Standardized training set features.
    x_test : array-like of shape (n_samples, n_features)
        Standardized test set features.

    Raises
    ------
    ValueError
        If the specified scaler is not one of `StandardScaler`, `RobustScaler`, or `MinMaxScaler`.

    Examples
    --------
    >>> from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
    >>> import numpy as np
    >>> x_train = np.array([[1, 2], [2, 3], [3, 4]])
    >>> x_test = np.array([[4, 5], [5, 6]])

    Using StandardScaler with a custom parameter:

    >>> scaler_params = {'with_mean': False}
    >>> x_train_scaled, x_test_scaled = standardize_features(
    ...     x_train, x_test, scaler=StandardScaler, scaler_params=scaler_params
    ... )

    Using RobustScaler (the default):

    >>> x_train_scaled, x_test_scaled = standardize_features(x_train, x_test, scaler=RobustScaler)

    Using MinMaxScaler:

    >>> x_train_scaled, x_test_scaled = standardize_features(x_train, x_test, scaler=MinMaxScaler)

    Passing an unsupported scaler raises a ValueError:

    >>> try:
    ...     x_train_scaled, x_test_scaled = standardize_features(x_train, x_test, scaler="InvalidScaler")
    ... except ValueError as e:
    ...     print(e)
    Invalid scaler specified. Choose from StandardScaler, RobustScaler, or MinMaxScaler.
    """
    supported_scalers = (StandardScaler, RobustScaler, MinMaxScaler)
    if scaler not in supported_scalers:
        raise ValueError(
            "Invalid scaler specified. Choose from StandardScaler, RobustScaler, or MinMaxScaler."
        )

    # A `{}` default would be a single dict object shared by every call;
    # use None as the sentinel and build a fresh dict per call instead.
    if scaler_params is None:
        scaler_params = {}

    # Initialize the chosen scaler with the specified parameters
    scaler_instance = scaler(**scaler_params)

    # Fit on the training data only, then apply the same fitted transformation
    # to the test data so both sets share one scale.
    x_train = scaler_instance.fit_transform(x_train)
    x_test = scaler_instance.transform(x_test)

    return x_train, x_test
0 commit comments