This module provides MinRedundancyMaxRelevance (MRMR) feature selection for classification or regression tasks.
In a classification task, the target should be of object or pandas category dtype, while in a regression task,
- the target should be of numpy categorical dtype. The predictors can be categorical or numerical without requiring encoding,
+ the target should be numeric. The predictors can be categorical or numerical without requiring encoding,
as the appropriate method (correlation, correlation ratio, or Theil's U) will be automatically selected based on the data type.

Module Structure:
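As context for the dtype-based selection described above, here is a minimal sketch of how an association measure could be dispatched on column dtypes. The function names (correlation_ratio, theils_u, association) are illustrative assumptions, not this module's API.

# Hedged sketch (not the module's code): choosing an association measure by dtype.
import numpy as np
import pandas as pd
from scipy.stats import entropy

def correlation_ratio(categories: pd.Series, values: pd.Series) -> float:
    # Correlation ratio (eta) between a categorical and a numeric column.
    group_means = values.groupby(categories).mean()
    group_counts = values.groupby(categories).count()
    ss_between = (group_counts * (group_means - values.mean()) ** 2).sum()
    ss_total = ((values - values.mean()) ** 2).sum()
    return float(np.sqrt(ss_between / ss_total)) if ss_total > 0 else 0.0

def theils_u(x: pd.Series, y: pd.Series) -> float:
    # Theil's U (uncertainty coefficient): how much knowing y reduces the entropy of x.
    h_x = entropy(x.value_counts(normalize=True))
    if h_x == 0:
        return 1.0
    h_x_given_y = sum(
        p_y * entropy(x[y == y_val].value_counts(normalize=True))
        for y_val, p_y in y.value_counts(normalize=True).items()
    )
    return (h_x - h_x_given_y) / h_x

def association(a: pd.Series, b: pd.Series) -> float:
    # Dispatch on dtype: correlation, correlation ratio, or Theil's U.
    a_cat = a.dtype == object or isinstance(a.dtype, pd.CategoricalDtype)
    b_cat = b.dtype == object or isinstance(b.dtype, pd.CategoricalDtype)
    if not a_cat and not b_cat:
        return abs(a.corr(b))            # numeric vs numeric
    if a_cat and b_cat:
        return theils_u(a, b)            # categorical vs categorical
    cat, num = (a, b) if a_cat else (b, a)
    return correlation_ratio(cat, num)   # mixed pair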
@@ -42,16 +42,16 @@ class MinRedundancyMaxRelevance(SelectorMixin, BaseEstimator):
relevance_func: callable, optional
relevance function having arguments "X", "y", "sample_weight" and returning a pd.Series
containing a score of relevance for each feature
- redundancy: callable, optional
+ redundancy_func: callable, optional
Redundancy method.
If callable, it should take "X", "sample_weight" as input and return a pandas.Series
containing a score of redundancy for each feature.
- denominator: str or callable (optional, default='mean')
+ denominator_func: str or callable (optional, default='mean')
Synthesis function to apply to the denominator of MRMR score.
If string, name of method. Supported: 'max', 'mean'.
If callable, it should take an iterable as input and return a scalar.
task: str
- either "regression" or "classifiction"
+ either "regression" or "classification"
only_same_domain: bool (optional, default=False)
If False, all the necessary correlation coefficients are computed.
If True, only features belonging to the same domain are compared.
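To illustrate the callable signatures documented in the hunk above, here is what user-supplied relevance_func and redundancy_func callables could look like; these specific functions are examples I am assuming, not defaults shipped with the module.

# Hedged sketch of user-supplied callables matching the documented signatures.
import numpy as np
import pandas as pd

def my_relevance(X: pd.DataFrame, y: pd.Series, sample_weight=None) -> pd.Series:
    # One relevance score per feature: absolute correlation with the target.
    return X.apply(lambda col: abs(col.corr(y)))

def my_redundancy(X: pd.DataFrame, sample_weight=None) -> pd.Series:
    # One redundancy score per feature: mean absolute correlation with the other features.
    corr = X.corr().abs()
    np.fill_diagonal(corr.values, np.nan)
    return corr.mean(axis=1)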
@@ -60,7 +60,7 @@ class MinRedundancyMaxRelevance(SelectorMixin, BaseEstimator):
return_scores: bool (optional, default=False)
If False, only the list of selected features is returned.
If True, a tuple containing (list of selected features, relevance, redundancy) is returned.
- n_jobs: int (optional, default=-1)
+ n_jobs: int (optional, default=1)
Maximum number of workers to use. Only used when relevance = "f" or redundancy = "corr".
If -1, use as many workers as min(cpu count, number of features).
show_progress: bool (optional, default=True)
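To make the role of denominator_func concrete, here is a minimal greedy MRMR step, assuming a precomputed relevance Series and a pairwise redundancy DataFrame; the names and data layout are illustrative, not the estimator's internals.

# Hedged sketch (not the estimator's code): greedy MRMR using a denominator function.
import numpy as np
import pandas as pd

def mrmr_select(relevance: pd.Series, redundancy: pd.DataFrame,
                n_features_to_select: int, denominator_func=np.mean) -> list:
    selected, candidates = [], list(relevance.index)
    for _ in range(n_features_to_select):
        scores = {}
        for f in candidates:
            # Redundancy of f against already selected features, summarised
            # by denominator_func ('mean' or 'max' in the docstring above).
            denom = denominator_func(redundancy.loc[f, selected].abs()) if selected else 1.0
            scores[f] = relevance[f] / max(denom, 1e-12)
        best = max(scores, key=scores.get)
        selected.append(best)
        candidates.remove(best)
    return selected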
@@ -89,10 +89,11 @@ class MinRedundancyMaxRelevance(SelectorMixin, BaseEstimator):
>>> pred_name = [f"pred_{i}" for i in range(X.shape[1])]
>>> X.columns = pred_name
>>> y.name = "target"
- >>> fs_mrmr = MinRedundancyMaxRelevance(n_features_to_select=5,
+ >>> fs_mrmr = MinRedundancyMaxRelevance(
+ >>> n_features_to_select=5,
>>> relevance_func=None,
>>> redundancy_func=None,
- >>> task= "regression",#"classification",
+ >>> task="regression", #"classification",
>>> denominator_func=np.mean,
>>> only_same_domain=False,
>>> return_scores=False,
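The docstring example in the hunk above is truncated by the diff; a complete version might look as follows. The import path for MinRedundancyMaxRelevance is an assumption and depends on the package layout.

# Hedged end-to-end sketch of the docstring example; the import path is assumed.
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
# from your_package.feature_selection import MinRedundancyMaxRelevance  # adjust to the real module path

X, y = make_regression(n_samples=200, n_features=10, random_state=0)
X = pd.DataFrame(X, columns=[f"pred_{i}" for i in range(X.shape[1])])
y = pd.Series(y, name="target")

fs_mrmr = MinRedundancyMaxRelevance(
    n_features_to_select=5,
    relevance_func=None,
    redundancy_func=None,
    task="regression",           # or "classification" with a categorical target
    denominator_func=np.mean,
    only_same_domain=False,
    return_scores=False,
    n_jobs=1,
    show_progress=True,
)
X_selected = fs_mrmr.fit(X, y).transform(X)   # DataFrame restricted to the 5 selected columns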
@@ -146,16 +147,16 @@ def fit(self, X, y, sample_weight=None):
X : pd.DataFrame, shape (n_samples, n_features)
Data from which to compute variances, where `n_samples` is
the number of samples and `n_features` is the number of features.
- y : any, default=None
- Ignored. This parameter exists only for compatibility with
- sklearn.pipeline.Pipeline.
+ y : array-like or pd.Series of shape (n_samples,)
+ Target vector. Must be numeric for regression or categorical for classification.
sample_weight : pd.Series, optional, shape (n_samples,)
weights for computing the statistics (e.g. weighted average)

Returns
-------
self : object
- Returns the instance itself.
+ If `return_scores=False`, returns self.
+ If `return_scores=True`, returns (selected_features, relevance_scores, redundancy_scores).
"""

if isinstance(X, pd.DataFrame):
@@ -212,6 +213,9 @@ def fit(self, X, y, sample_weight=None):
[x in self.selected_features for x in self.feature_names_in_]
)
self.not_selected_features_ = self.not_selected_features
+
+ if self.return_scores:
+     return self.selected_features_, self.relevance_, self.redundancy_
return self

def transform(self, X):
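As a usage note on the change above: when return_scores is True, fit returns the score tuple directly (so calls cannot be chained), while with the default it returns the fitted estimator. A hedged illustration, reusing X and y from the earlier sketch; the constructor arguments shown are only those needed for the example.

# Hedged sketch of the two fit() return modes introduced in the hunk above.
selector = MinRedundancyMaxRelevance(n_features_to_select=5, task="regression",
                                     return_scores=True)
selected, relevance, redundancy = selector.fit(X, y)   # tuple, not self

selector = MinRedundancyMaxRelevance(n_features_to_select=5, task="regression",
                                     return_scores=False)
X_sel = selector.fit(X, y).transform(X)                # sklearn-style chaining still works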
@@ -232,7 +236,7 @@ def transform(self, X):
raise TypeError("X is not a dataframe")
return X[self.selected_features_]

- def fit_transform(self, X, y, sample_weight=None):
+ def fit_transform(self, X, y, sample_weight=None, **fit_params):
"""
Fit to data, then transform it.
Fits transformer to `X` and `y` and optionally sample_weight