-
Notifications
You must be signed in to change notification settings - Fork 14
Description
# Step 1: Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Step 2: Load dataset
# Example: the SMS Spam Collection dataset (download it and save it as spam.csv)
# Read the raw CSV, keep only the label and text columns, and give them
# descriptive names. Built as one expression so no reference to the wider
# raw frame is kept around.
data = (
    pd.read_csv("spam.csv", encoding="latin-1")
    .loc[:, ["v1", "v2"]]
    .rename(columns={"v1": "label", "v2": "message"})
)
# Step 3: Encode labels (ham = 0, spam = 1)
# Encode the text labels as integers (ham = 0, spam = 1). Series.map turns
# every value missing from the mapping into NaN, so check for that and fail
# with a clear message instead of passing corrupted labels downstream.
label_codes = data["label"].map({"ham": 0, "spam": 1})
if label_codes.isna().any():
    unexpected = sorted(
        data.loc[label_codes.isna(), "label"].astype(str).unique()
    )
    raise ValueError(f"Unexpected label value(s) in dataset: {unexpected}")
data["label"] = label_codes.astype(int)
# Step 4: Split dataset
# Hold out 20% of the messages for evaluation. The split is stratified on the
# label so the train and test sets keep the same ham/spam proportions, and
# random_state pins the shuffle so repeated runs give the same split.
X_train, X_test, y_train, y_test = train_test_split(
    data["message"],
    data["label"],
    test_size=0.2,
    random_state=42,
    stratify=data["label"],
)
# Step 5: Convert text to numerical features using TF-IDF
# Build TF-IDF features, dropping scikit-learn's built-in English stop words.
# The vectorizer is fitted on the training messages only; the test messages
# are transformed with that already-fitted vectorizer, so nothing about the
# test set influences the learned features.
vectorizer = TfidfVectorizer(stop_words="english")
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
# Step 6: Train model (Naive Bayes)
# Fit a multinomial Naive Bayes classifier on the TF-IDF training features.
# fit() returns the estimator itself, so construction and training chain
# into a single assignment.
model = MultinomialNB().fit(X_train_tfidf, y_train)
# Step 7: Predict
# Predict a label (0 = ham, 1 = spam) for every held-out test message.
y_pred = model.predict(X_test_tfidf)
# Step 8: Evaluate
# Compare the predictions with the true test labels and print three views of
# the result: overall accuracy, the confusion matrix, and the per-class
# classification report.
accuracy = accuracy_score(y_test, y_pred)
print("✅ Accuracy:", accuracy)

conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", conf_matrix)

report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)