
Commit 4592cd9

Merge pull request bigbio#10 from enriquea/main
added fs pipeline & code clean-up
2 parents 3d3e882 + 33634c4 commit 4592cd9

File tree

14 files changed: +1336 −409 lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
```diff
@@ -132,3 +132,4 @@ dmypy.json
 local/
 testscripts/
 .idea/*
+/benchmarking/
```

docs/README.methods.md

Lines changed: 1 addition & 72 deletions
````diff
@@ -53,75 +53,4 @@ A typical workflow written using `fsspark` can be divided roughly in four major
 
 ### 5. Feature selection pipeline example
 
-Check it out [here](fsspark/pipeline/fs_corr_rf.py) a full FS pipeline example.
-
-```python
-"""
-Example of a feature selection pipeline implemented in fsspark.
-
-After data import and pre-processing, the pipeline applies univariate correlation filter,
-multivariate correlation filter and Randon Forest classification.
-
-"""
-
-from fsspark.config.context import init_spark, stop_spark_session
-from fsspark.fs.core import FSDataFrame
-from fsspark.fs.ml import cv_rf_classification, get_accuracy, get_predictions
-from fsspark.fs.multivariate import multivariate_filter
-from fsspark.fs.univariate import univariate_filter
-from fsspark.fs.utils import (remove_features_by_missingness_rate,
-                              impute_missing)
-from fsspark.utils.datasets import get_tnbc_data_path
-from fsspark.utils.io import import_table_as_psdf
-
-# Init spark
-init_spark()
-
-# Import data
-fsdf = import_table_as_psdf(get_tnbc_data_path(),
-                            n_partitions=5)
-
-fsdf = FSDataFrame(fsdf, sample_col='Sample', label_col='label')
-
-# Step 1. Data pre-processing.
-
-# a) Filter missingness rate
-fsdf = remove_features_by_missingness_rate(fsdf, threshold=0.1)
-
-# b) Impute data frame
-fsdf = impute_missing(fsdf)
-
-# c) Scale features
-fsdf = fsdf.scale_features(scaler_method='standard')
-
-# Split dataset in training/testing
-training_df, testing_df = fsdf.split_df(label_type_cat=True,
-                                        split_training_factor=0.8)
-
-# Step 2. Apply univariate correlation filter
-training_df = univariate_filter(training_df,
-                                univariate_method='u_corr',
-                                corr_threshold=0.3)
-
-# Step 3. Apply multivariate correlation filter
-training_df = multivariate_filter(training_df,
-                                  multivariate_method='m_corr',
-                                  corr_threshold=0.7
-                                  )
-
-# Step 4. ML-algorithm with cross-validation
-cv_model = cv_rf_classification(training_df,
-                                binary_classification=False)
-
-# Print out some stats
-
-# Get accuracy from training
-acc = get_accuracy(model=cv_model)
-print(f"Training accuracy: {acc}")
-
-# Get predictions from training
-pred = get_predictions(model=cv_model)
-pred.show()
-
-stop_spark_session()
-```
+[FS pipeline example](../fsspark/pipeline/fs_pipeline_example.py)
````

fsspark/config/context.py

Lines changed: 5 additions & 3 deletions
```diff
@@ -6,12 +6,14 @@
     PYARROW_SETTINGS,
     PANDAS_ON_SPARK_API_SETTINGS)
 
-
 os.environ['PYARROW_IGNORE_TIMEZONE'] = "1"
+
+
 # os.environ['JAVA_HOME'] = "/Library/Java/JavaVirtualMachines/jdk1.8.0_162.jdk/Contents/Home"
 # os.environ['SPARK_HOME'] = "/usr/local/spark-3.3.0-bin-hadoop3"
 
-def init_spark(apply_pyarrow_settings: bool = True,
+def init_spark(master: str = "local[8]",
+               apply_pyarrow_settings: bool = True,
                apply_extra_spark_settings: bool = True,
                apply_pandas_settings: bool = True) -> SparkSession:
     """
@@ -24,7 +26,7 @@ def init_spark(apply_pyarrow_settings: bool = True,
 
     # init or get spark session.
     spark = (SparkSession.builder
-             .master("local[8]")
+             .master(master)
              .appName("fs-spark")
              )
```
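Assuming the updated signature above, this is a minimal sketch of how a caller might override the new `master` parameter instead of relying on the previously hard-coded `local[8]`; the cluster URL is illustrative and not part of the commit.

```python
from fsspark.config.context import init_spark, stop_spark_session

# Default behaviour should be unchanged: a local session using 8 cores.
spark = init_spark()
stop_spark_session()

# With the new `master` parameter, the same helper can target a different
# Spark master, e.g. a standalone cluster (illustrative URL).
spark = init_spark(master="spark://example-cluster:7077")
stop_spark_session()
```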

fsspark/fs/constants.py

Lines changed: 53 additions & 0 deletions
```diff
@@ -0,0 +1,53 @@
+# Define constants for the project
+
+
+# Define univariate feature selection methods constants
+ANOVA = 'anova'
+UNIVARIATE_CORRELATION = 'u_corr'
+F_REGRESSION = 'f_regression'
+
+# Define dict with univariate feature selection methods and brief description
+UNIVARIATE_METHODS = {
+    ANOVA: 'ANOVA univariate feature selection (F-classification)',
+    UNIVARIATE_CORRELATION: 'Univariate Correlation',
+    F_REGRESSION: 'Univariate F-regression'
+}
+
+# Define multivariate feature selection methods constants
+MULTIVARIATE_CORRELATION = 'm_corr'
+MULTIVARIATE_VARIANCE = 'variance'
+
+# Define dict with multivariate feature selection methods and brief description
+MULTIVARIATE_METHODS = {
+    MULTIVARIATE_CORRELATION: 'Multivariate Correlation',
+    MULTIVARIATE_VARIANCE: 'Multivariate Variance'
+}
+
+# Define machine learning wrapper methods constants
+
+# binary classification
+RF_BINARY = 'rf_binary'
+LSVC_BINARY = 'lsvc_binary'
+FM_BINARY = 'fm_binary'  # TODO: implement this method
+
+# multilabel classification
+RF_MULTILABEL = 'rf_multilabel'
+LR_MULTILABEL = 'lg_multilabel'  # TODO: implement this method
+
+# regression
+RF_REGRESSION = 'rf_regression'
+FM_REGRESSION = 'fm_regression'  # TODO: implement this method
+
+
+# Define dict with machine learning wrapper methods and brief description
+ML_METHODS = {
+    RF_BINARY: 'Random Forest Binary Classifier',
+    LSVC_BINARY: 'Linear SVC Binary Classifier',
+    FM_BINARY: 'Factorization Machine Binary Classifier',
+
+    RF_MULTILABEL: 'Random Forest Multi-label Classifier',
+    LR_MULTILABEL: 'Logistic Regression Multi-label Classifier',
+
+    RF_REGRESSION: 'Random Forest Regression',
+    FM_REGRESSION: 'Factorization Machine Regression'
+}
```
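The new module pairs each method constant with a human-readable description. A hypothetical snippet showing how these dictionaries might be used to validate a requested method name; the `describe_method` helper below is illustrative and not part of the commit.

```python
from fsspark.fs.constants import (UNIVARIATE_METHODS,
                                  MULTIVARIATE_METHODS,
                                  ML_METHODS)


def describe_method(method: str) -> str:
    """Illustrative helper: look up the description of a known method name."""
    for registry in (UNIVARIATE_METHODS, MULTIVARIATE_METHODS, ML_METHODS):
        if method in registry:
            return registry[method]
    raise ValueError(f"Unknown method '{method}'")


print(describe_method('u_corr'))     # Univariate Correlation
print(describe_method('rf_binary'))  # Random Forest Binary Classifier
```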

fsspark/fs/core.py

Lines changed: 12 additions & 8 deletions
```diff
@@ -179,14 +179,6 @@ def get_sample_label(self) -> list:
         """
         return self.__indexed_instances.tolist()
 
-    # def get_samples(self) -> pyspark.pandas.Series:
-    #     """
-    #     Get samples identifiers from DataFrame. Coerce data type to string.
-    #
-    #     :return: Pandas Series
-    #     """
-    #     return self.__df[self.__sample_col].astype("str")
-
     def get_sdf_vector(self, output_column_vector: str = 'features') -> pyspark.sql.DataFrame:
         """
         Return a Spark dataframe with feature columns assembled into a column vector (a.k.a. Dense Vector column).
@@ -204,6 +196,18 @@ def get_sdf_vector(self, output_column_vector: str = 'features') -> pyspark.sql.DataFrame:
 
         return sdf_vector
 
+    def get_sdf_and_label(self,
+                          output_column_vector: str = 'features') -> Tuple[pyspark.sql.dataframe.DataFrame, str, str]:
+        """
+        Extracts the Spark DataFrame and label column name from FSDataFrame.
+
+        :param: output_column_vector: Name of the output column vector.
+        :return: A tuple containing the Spark DataFrame and the label column name.
+        """
+        sdf = self.get_sdf_vector(output_column_vector=output_column_vector)
+        label_col = self.get_label_col_name()
+        return sdf, label_col, output_column_vector
+
     def _collect_features_as_array(self) -> np.array:
         """
         Collect features from FSDataFrame as an array.
```
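A sketch of how the new `get_sdf_and_label` helper might feed a plain Spark ML estimator. Here `fsdf` is assumed to be an existing `FSDataFrame` with a numerically encoded label column, and the estimator is standard PySpark ML rather than one of the fsspark wrappers.

```python
from pyspark.ml.classification import RandomForestClassifier

# Assemble the feature vector and retrieve the label/feature column names in one call.
sdf, label_col, features_col = fsdf.get_sdf_and_label(output_column_vector='features')

# Any Spark ML estimator can then be fitted directly on the assembled DataFrame,
# assuming the label column is already numeric.
rf = RandomForestClassifier(featuresCol=features_col, labelCol=label_col, numTrees=100)
model = rf.fit(sdf)
```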
