KOSASIH · deepsource-autofix · Jun 4, 2024
diff --git a/blockchain_integration/pi_network/ai/fraud_detection_ai.py b/blockchain_integration/pi_network/ai/fraud_detection_ai.py
@@ -1,58 +1,63 @@
 # pi_network/ai/fraud_detection_ai.py
-import pandas as pd
-import numpy as np
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.model_selection import train_test_split
-from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
-from sklearn.preprocessing import StandardScaler
-from sklearn.decomposition import PCA
-from sklearn.manifold import TSNE
 import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
 import seaborn as sns
+from catboost import CatBoostClassifier
+from imblearn.ensemble import EasyEnsemble
+from imblearn.over_sampling import SMOTE
+from keras.callbacks import EarlyStopping, ModelCheckpoint
+from keras.layers import LSTM, Dense, Dropout
 from keras.models import Sequential
-from keras.layers import Dense, Dropout, LSTM
 from keras.utils import to_categorical
-from keras.callbacks import EarlyStopping, ModelCheckpoint
-from imblearn.over_sampling import SMOTE
-from imblearn.ensemble import EasyEnsemble
-from xgboost import XGBClassifier
-from catboost import CatBoostClassifier
 from lightgbm import LGBMClassifier
-from sklearn.ensemble import GradientBoostingClassifier
-from sklearn.ensemble import VotingClassifier
-from sklearn.ensemble import StackingClassifier
-from sklearn.ensemble import BaggingClassifier
-from sklearn.ensemble import AdaBoostClassifier
-from sklearn.svm import SVC
-from sklearn.naive_bayes import GaussianNB
+from sklearn.decomposition import PCA
 from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
-from sklearn.tree import DecisionTreeClassifier
-from sklearn.neighbors import KNeighborsClassifier
+from sklearn.ensemble import (
+    AdaBoostClassifier,
+    BaggingClassifier,
+    GradientBoostingClassifier,
+    RandomForestClassifier,
+    StackingClassifier,
+    VotingClassifier,
+)
+from sklearn.feature_selection import RFECV, SelectFromModel
 from sklearn.linear_model import LogisticRegression
-from sklearn.feature_selection import SelectFromModel
-from sklearn.feature_selection import RFECV
+from sklearn.manifold import TSNE
+from sklearn.metrics import (
+    accuracy_score,
+    average_precision_score,
+    classification_report,
+    confusion_matrix,
+    f1_score,
+    precision_recall_curve, precision_score,
+    recall_score,
+    roc_auc_score,
+)
+from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, learning_curve, train_test_split
+from sklearn.naive_bayes import GaussianNB
+from sklearn.neighbors import KNeighborsClassifier
 from sklearn.pipeline import Pipeline
-from sklearn.model_selection import GridSearchCV
-from sklearn.model_selection import RandomizedSearchCV
-from sklearn.metrics import roc_auc_score
-from sklearn.metrics import average_precision_score
-from sklearn.metrics import f1_score
-from sklearn.metrics import recall_score
-from sklearn.metrics import precision_score
+from sklearn.preprocessing import StandardScaler
+from sklearn.svm import SVC
+from sklearn.tree import DecisionTreeClassifier
+from xgboost import XGBClassifier
 
 # Load the dataset
-df = pd.read_csv('fraud_data.csv')
+df = pd.read_csv("fraud_data.csv")
 
 # Preprocess the data
-X = df.drop(['is_fraud'], axis=1)
-y = df['is_fraud']
+X = df.drop(["is_fraud"], axis=1)
+y = df["is_fraud"]
 
 # Handle imbalanced dataset
 smote = SMOTE(random_state=42)
 X_res, y_res = smote.fit_resample(X, y)
 
 # Split the data into training and testing sets
-X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)
+X_train, X_test, y_train, y_test = train_test_split(
+    X_res, y_res, test_size=0.2, random_state=42
+)
 
 # Scale the data
 scaler = StandardScaler()
@@ -71,40 +76,64 @@
 
 # Define the neural network model
 model = Sequential()
-model.add(Dense(64, activation='relu', input_shape=(X_train_pca.shape[1],)))
+model.add(Dense(64, activation="relu", input_shape=(X_train_pca.shape[1],)))
 model.add(Dropout(0.2))
-model.add(Dense(32, activation='relu'))
+model.add(Dense(32, activation="relu"))
 model.add(Dropout(0.2))
-model.add(Dense(2, activation='softmax'))
+model.add(Dense(2, activation="softmax"))
 
 # Compile the model
-model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
+model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
 
 # Define the early stopping callback
-early_stopping = EarlyStopping(monitor='val_loss', patience=5, min_delta=0.001)
+early_stopping = EarlyStopping(monitor="val_loss", patience=5, min_delta=0.001)
 
 # Define the model checkpoint callback
-model_checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True, mode='min')
+model_checkpoint = ModelCheckpoint(
+    "best_model.h5", monitor="val_loss", save_best_only=True, mode="min"
+)
 
 # Train the model
-history = model.fit(X_train_pca, to_categorical(y_train), epochs=100, batch_size=128, validation_data=(X_test_pca, to_categorical(y_test)), callbacks=[early_stopping, model_checkpoint])
+history = model.fit(
+    X_train_pca,
+    to_categorical(y_train),
+    epochs=100,
+    batch_size=128,
+    validation_split=0.2,  # validate on held-out training rows so the test set stays unseen
+    callbacks=[early_stopping, model_checkpoint],
+)
 
 # Evaluate the model
 y_pred = model.predict(X_test_pca)
 y_pred_class = np.argmax(y_pred, axis=1)
-print('Accuracy:', accuracy_score(y_test, y_pred_class))
-print('Classification Report:')
+print("Accuracy:", accuracy_score(y_test, y_pred_class))
+print("Classification Report:")
 print(classification_report(y_test, y_pred_class))
-print('Confusion Matrix:')
+print("Confusion Matrix:")
 print(confusion_matrix(y_test, y_pred_class))
 
 # Define the ensemble models
 xgb_model = XGBClassifier()
 cat_model = CatBoostClassifier()
 lgbm_model = LGBMClassifier()
 gbm_model = GradientBoostingClassifier()
-voting_model = VotingClassifier(estimators=[('xgb', xgb_model), ('cat', cat_model), ('lgbm', lgbm_model), ('gbm', gbm_model)])
-stacking_model = StackingClassifier(estimators=[('xgb', xgb_model), ('cat', cat_model), ('lgbm', lgbm_model), ('gbm', gbm_model)], final_estimator=LogisticRegression())
+voting_model = VotingClassifier(
+    estimators=[
+        ("xgb", xgb_model),
+        ("cat", cat_model),
+        ("lgbm", lgbm_model),
+        ("gbm", gbm_model),
+    ]
+)
+stacking_model = StackingClassifier(
+    estimators=[
+        ("xgb", xgb_model),
+        ("cat", cat_model),
+        ("lgbm", lgbm_model),
+        ("gbm", gbm_model),
+    ],
+    final_estimator=LogisticRegression(),
+)
 bagging_model = BaggingClassifier(base_estimator=xgb_model, n_estimators=10)
 adaboost_model = AdaBoostClassifier(base_estimator=xgb_model, n_estimators=10)
 
@@ -128,18 +157,18 @@
 y_pred_bagging = bagging_model.predict(X_test_pca)
 y_pred_adaboost = adaboost_model.predict(X_test_pca)
 
-print('XGB Accuracy:', accuracy_score(y_test, y_pred_xgb))
-print('CAT Accuracy:', accuracy_score(y_test, y_pred_cat))
-print('LGBM Accuracy:', accuracy_score(y_test, y_pred_lgbm))
-print('GBM Accuracy:', accuracy_score(y_test, y_pred_gbm))
-print('Voting Accuracy:', accuracy_score(y_test, y_pred_voting))
-print('Stacking Accuracy:', accuracy_score(y_test, y_pred_stacking))
-print('Bagging Accuracy:', accuracy_score(y_test, y_pred_bagging))
-print('AdaBoost Accuracy:', accuracy_score(y_test, y_pred_adaboost))
+print("XGB Accuracy:", accuracy_score(y_test, y_pred_xgb))
+print("CAT Accuracy:", accuracy_score(y_test, y_pred_cat))
+print("LGBM Accuracy:", accuracy_score(y_test, y_pred_lgbm))
+print("GBM Accuracy:", accuracy_score(y_test, y_pred_gbm))
+print("Voting Accuracy:", accuracy_score(y_test, y_pred_voting))
+print("Stacking Accuracy:", accuracy_score(y_test, y_pred_stacking))
+print("Bagging Accuracy:", accuracy_score(y_test, y_pred_bagging))
+print("AdaBoost Accuracy:", accuracy_score(y_test, y_pred_adaboost))
 
 # Define the feature selection models
 rf_model = RandomForestClassifier()
-svm_model = SVC(kernel='linear', probability=True)
+svm_model = SVC(kernel="linear", probability=True)
 gnb_model = GaussianNB()
 qda_model = QuadraticDiscriminantAnalysis()
 dt_model = DecisionTreeClassifier()
@@ -164,116 +193,142 @@
 y_pred_knn = knn_model.predict(X_test_pca)
 y_pred_lr = lr_model.predict(X_test_pca)
 
-print('RF Accuracy:', accuracy_score(y_test, y_pred_rf))
-print('SVM Accuracy:', accuracy_score(y_test, y_pred_svm))
-print('GNB Accuracy:', accuracy_score(y_test, y_pred_gnb))
-print('QDA Accuracy:', accuracy_score(y_test, y_pred_qda))
-print('DT Accuracy:', accuracy_score(y_test, y_pred_dt))
-print('KNN Accuracy:', accuracy_score(y_test, y_pred_knn))
-print('LR Accuracy:', accuracy_score(y_test, y_pred_lr))
+print("RF Accuracy:", accuracy_score(y_test, y_pred_rf))
+print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
+print("GNB Accuracy:", accuracy_score(y_test, y_pred_gnb))
+print("QDA Accuracy:", accuracy_score(y_test, y_pred_qda))
+print("DT Accuracy:", accuracy_score(y_test, y_pred_dt))
+print("KNN Accuracy:", accuracy_score(y_test, y_pred_knn))
+print("LR Accuracy:", accuracy_score(y_test, y_pred_lr))
 
 # Define the hyperparameter tuning models
-grid_search = GridSearchCV(xgb_model, {'max_depth': [3, 5, 7], 'learning_rate': [0.1, 0.5, 1.0]}, cv=5, scoring='accuracy')
-random_search = RandomizedSearchCV(xgb_model, {'max_depth': [3, 5, 7], 'learning_rate': [0.1, 0.5, 1.0]}, cv=5, scoring='accuracy', n_iter=10)
+grid_search = GridSearchCV(
+    xgb_model,
+    {"max_depth": [3, 5, 7], "learning_rate": [0.1, 0.5, 1.0]},
+    cv=5,
+    scoring="accuracy",
+)
+random_search = RandomizedSearchCV(
+    xgb_model,
+    {"max_depth": [3, 5, 7], "learning_rate": [0.1, 0.5, 1.0]},
+    cv=5,
+    scoring="accuracy",
+    n_iter=10,
+)
 
 # Train the hyperparameter tuning models
 grid_search.fit(X_train_pca, y_train)
 random_search.fit(X_train_pca, y_train)
 
 # Evaluate the hyperparameter tuning models
-print('Grid Search Best Parameters:', grid_search.best_params_)
-print('Grid Search Best Score:', grid_search.best_score_)
-print('Random Search Best Parameters:', random_search.best_params_)
-print('Random Search Best Score:', random_search.best_score_)
+print("Grid Search Best Parameters:", grid_search.best_params_)
+print("Grid Search Best Score:", grid_search.best_score_)
+print("Random Search Best Parameters:", random_search.best_params_)
+print("Random Search Best Score:", random_search.best_score_)
 
 # Define the feature importance models
 feature_importance = rf_model.feature_importances_
-print('Feature Importance:')
+print("Feature Importance:")
 print(feature_importance)
 
 # Define the partial dependence plots
-partial_dependence = pd.DataFrame({'Feature 1': X_test_pca[:, 0], 'Feature 2': X_test_pca[:, 1], 'Target': y_test})
-sns.lmplot(x='Feature 1', y='Target', data=partial_dependence, hue='is_fraud')
-sns.lmplot(x='Feature 2', y='Target', data=partial_dependence, hue='is_fraud')
+partial_dependence = pd.DataFrame(
+    {"Feature 1": X_test_pca[:, 0], "Feature 2": X_test_pca[:, 1], "Target": y_test}
+)
+for feature in ("Feature 1", "Feature 2"):  # hue must name a column of the frame
+    sns.lmplot(x=feature, y="Target", data=partial_dependence, hue="Target")
 
 # Define the learning curves
-train_sizes, train_scores, test_scores = learning_curve(xgb_model, X_train_pca, y_train, cv=5, scoring='accuracy')
+train_sizes, train_scores, test_scores = learning_curve(
+    xgb_model, X_train_pca, y_train, cv=5, scoring="accuracy"
+)
 train_scores_mean = np.mean(train_scores, axis=1)
 train_scores_std = np.std(train_scores, axis=1)
 test_scores_mean = np.mean(test_scores, axis=1)
 test_scores_std = np.std(test_scores, axis=1)
-plt.plot(train_sizes, train_scores_mean, label='Training Score')
-plt.plot(train_sizes, test_scores_mean, label='Test Score')
+plt.plot(train_sizes, train_scores_mean, label="Training Score")
+plt.plot(train_sizes, test_scores_mean, label="Test Score")
 plt.legend()
 plt.show()
 
 # Define the ROC-AUC curve
 y_pred_proba = model.predict(X_test_pca)
 roc_auc = roc_auc_score(y_test, y_pred_proba[:, 1])
-print('ROC-AUC Score:', roc_auc)
+print("ROC-AUC Score:", roc_auc)
 
 # Define the precision-recall curve
 precision, recall, _ = precision_recall_curve(y_test, y_pred_proba[:, 1])
 average_precision = average_precision_score(y_test, y_pred_proba[:, 1])
-print('Average Precision Score:', average_precision)
+print("Average Precision Score:", average_precision)
 
 # Define the F1 score
 f1 = f1_score(y_test, y_pred_class)
-print('F1 Score:', f1)
+print("F1 Score:", f1)
 
 # Define the recall score
 recall = recall_score(y_test, y_pred_class)
-print('Recall Score:', recall)
+print("Recall Score:", recall)
 
 # Define the precision score
 precision = precision_score(y_test, y_pred_class)
-print('Precision Score:', precision)
+print("Precision Score:", precision)
 
 # Define the confusion matrix
 cm = confusion_matrix(y_test, y_pred_class)
-print('Confusion Matrix:')
+print("Confusion Matrix:")
 print(cm)
 
 # Define the classification report
 cr = classification_report(y_test, y_pred_class)
-print('Classification Report:')
+print("Classification Report:")
 print(cr)
 
 # Define the feature selection pipeline
-feature_selection_pipeline = Pipeline([
-    ('feature_selection', SelectFromModel(rf_model)),
-    ('classifier', xgb_model)
-])
+feature_selection_pipeline = Pipeline(
+    [("feature_selection", SelectFromModel(rf_model)), ("classifier", xgb_model)]
+)
 
 # Train the feature selection pipeline
 feature_selection_pipeline.fit(X_train_pca, y_train)
 
 # Evaluate the feature selection pipeline
 y_pred_fs = feature_selection_pipeline.predict(X_test_pca)
-print('Feature Selection Accuracy:', accuracy_score(y_test, y_pred_fs))
+print("Feature Selection Accuracy:", accuracy_score(y_test, y_pred_fs))
 
 # Define the recursive feature elimination pipeline
-rfe_pipeline = Pipeline([
-    ('feature_selection', RFECV(rf_model, cv=5, scoring='accuracy')),
-    ('classifier', xgb_model)
-])
+rfe_pipeline = Pipeline(
+    [
+        ("feature_selection", RFECV(rf_model, cv=5, scoring="accuracy")),
+        ("classifier", xgb_model),
+    ]
+)
 
 # Train the recursive feature elimination pipeline
 rfe_pipeline.fit(X_train_pca, y_train)
 
 # Evaluate the recursive feature elimination pipeline
 y_pred_rfe = rfe_pipeline.predict(X_test_pca)
-print('Recursive Feature Elimination Accuracy:', accuracy_score(y_test, y_pred_rfe))
+print("Recursive Feature Elimination Accuracy:", accuracy_score(y_test, y_pred_rfe))
 
 # Define the hyperparameter tuning pipeline
-hyperparameter_tuning_pipeline = Pipeline([
-    ('feature_selection', SelectFromModel(rf_model)),
-    ('classifier', GridSearchCV(xgb_model, {'max_depth': [3, 5, 7], 'learning_rate': [0.1, 0.5, 1.0]}, cv=5, scoring='accuracy'))
-])
+hyperparameter_tuning_pipeline = Pipeline(
+    [
+        ("feature_selection", SelectFromModel(rf_model)),
+        (
+            "classifier",
+            GridSearchCV(
+                xgb_model,
+                {"max_depth": [3, 5, 7], "learning_rate": [0.1, 0.5, 1.0]},
+                cv=5,
+                scoring="accuracy",
+            ),
+        ),
+    ]
+)
 
 # Train the hyperparameter tuning pipeline
 hyperparameter_tuning_pipeline.fit(X_train_pca, y_train)
 
 # Evaluate the hyperparameter tuning pipeline
 y_pred_ht = hyperparameter_tuning_pipeline.predict(X_test_pca)
-print('Hyperparameter Tuning Accuracy:', accuracy_score(y_test, y_pred_ht))
+print("Hyperparameter Tuning Accuracy:", accuracy_score(y_test, y_pred_ht))