From 77fd327c729929a9056a931a6eb194b29a20dabc Mon Sep 17 00:00:00 2001 From: Deeksha1-Shet Date: Sun, 14 Jul 2024 20:42:22 +0530 Subject: [PATCH] Update Customer Churn Analysis.ipynb --- Customer Churn Analysis.ipynb | 41738 +------------------------------- 1 file changed, 1 insertion(+), 41737 deletions(-) diff --git a/Customer Churn Analysis.ipynb b/Customer Churn Analysis.ipynb index 7650a1f..f0abfbc 100644 --- a/Customer Churn Analysis.ipynb +++ b/Customer Churn Analysis.ipynb @@ -1,41737 +1 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "a9ee639c", - "metadata": {}, - "outputs": [], - "source": [ - "#importing libraries to retrieve data and read csv and to perform EDa tasks\n", - "import pandas as pd\n", - "import numpy as np\n", - "#importing for visualisation\n", - "import seaborn as sns\n", - "#sklearn for converting catagorical values to Numeric data" - ] - }, - { - "cell_type": "markdown", - "id": "902edb01", - "metadata": {}, - "source": [ - "# Real time use case\n", - "\n", - "Customer churn analysis helps businesses understand \n", - "why customers don't return for repeat business. Churn rate tells you\n", - "what portion of your customers leave over a period of time\n", - "\n", - "A company's churn rate, or employee churn rate, refers to both the attrition rate and the turnover rate. All of these terms refer to the number of employees who leave the organization during a specified period of time, generally a year. (Note that the term 'churn' used generically can also apply to customers.)" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "56002c49", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "\n", - "#import dataset\n", - "data = pd.read_csv(\"C://Users//Lenovo//Downloads//Customer_Churn-1.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "a48f5f33", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
customerIDgenderSeniorCitizenPartnerDependentstenurePhoneServiceMultipleLinesInternetServiceOnlineSecurity...DeviceProtectionTechSupportStreamingTVStreamingMoviesContractPaperlessBillingPaymentMethodMonthlyChargesTotalChargesChurn
07590-VHVEGFemale0YesNo1NoNo phone serviceDSLNo...NoNoNoNoMonth-to-monthYesElectronic check29.8529.85No
15575-GNVDEMale0NoNo34YesNoDSLYes...YesNoNoNoOne yearNoMailed check56.951889.5No
23668-QPYBKMale0NoNo2YesNoDSLYes...NoNoNoNoMonth-to-monthYesMailed check53.85108.15Yes
37795-CFOCWMale0NoNo45NoNo phone serviceDSLYes...YesYesNoNoOne yearNoBank transfer (automatic)42.301840.75No
49237-HQITUFemale0NoNo2YesNoFiber opticNo...NoNoNoNoMonth-to-monthYesElectronic check70.70151.65Yes
59305-CDSKCFemale0NoNo8YesYesFiber opticNo...YesNoYesYesMonth-to-monthYesElectronic check99.65820.5Yes
61452-KIOVKMale0NoYes22YesYesFiber opticNo...NoNoYesNoMonth-to-monthYesCredit card (automatic)89.101949.4No
76713-OKOMCFemale0NoNo10NoNo phone serviceDSLYes...NoNoNoNoMonth-to-monthNoMailed check29.75301.9No
87892-POOKPFemale0YesNo28YesYesFiber opticNo...YesYesYesYesMonth-to-monthYesElectronic check104.803046.05Yes
96388-TABGUMale0NoYes62YesNoDSLYes...NoNoNoNoOne yearNoBank transfer (automatic)56.153487.95No
\n", - "

10 rows × 21 columns

\n", - "
" - ], - "text/plain": [ - " customerID gender SeniorCitizen Partner Dependents tenure PhoneService \\\n", - "0 7590-VHVEG Female 0 Yes No 1 No \n", - "1 5575-GNVDE Male 0 No No 34 Yes \n", - "2 3668-QPYBK Male 0 No No 2 Yes \n", - "3 7795-CFOCW Male 0 No No 45 No \n", - "4 9237-HQITU Female 0 No No 2 Yes \n", - "5 9305-CDSKC Female 0 No No 8 Yes \n", - "6 1452-KIOVK Male 0 No Yes 22 Yes \n", - "7 6713-OKOMC Female 0 No No 10 No \n", - "8 7892-POOKP Female 0 Yes No 28 Yes \n", - "9 6388-TABGU Male 0 No Yes 62 Yes \n", - "\n", - " MultipleLines InternetService OnlineSecurity ... DeviceProtection \\\n", - "0 No phone service DSL No ... No \n", - "1 No DSL Yes ... Yes \n", - "2 No DSL Yes ... No \n", - "3 No phone service DSL Yes ... Yes \n", - "4 No Fiber optic No ... No \n", - "5 Yes Fiber optic No ... Yes \n", - "6 Yes Fiber optic No ... No \n", - "7 No phone service DSL Yes ... No \n", - "8 Yes Fiber optic No ... Yes \n", - "9 No DSL Yes ... No \n", - "\n", - " TechSupport StreamingTV StreamingMovies Contract PaperlessBilling \\\n", - "0 No No No Month-to-month Yes \n", - "1 No No No One year No \n", - "2 No No No Month-to-month Yes \n", - "3 Yes No No One year No \n", - "4 No No No Month-to-month Yes \n", - "5 No Yes Yes Month-to-month Yes \n", - "6 No Yes No Month-to-month Yes \n", - "7 No No No Month-to-month No \n", - "8 Yes Yes Yes Month-to-month Yes \n", - "9 No No No One year No \n", - "\n", - " PaymentMethod MonthlyCharges TotalCharges Churn \n", - "0 Electronic check 29.85 29.85 No \n", - "1 Mailed check 56.95 1889.5 No \n", - "2 Mailed check 53.85 108.15 Yes \n", - "3 Bank transfer (automatic) 42.30 1840.75 No \n", - "4 Electronic check 70.70 151.65 Yes \n", - "5 Electronic check 99.65 820.5 Yes \n", - "6 Credit card (automatic) 89.10 1949.4 No \n", - "7 Mailed check 29.75 301.9 No \n", - "8 Electronic check 104.80 3046.05 Yes \n", - "9 Bank transfer (automatic) 56.15 3487.95 No \n", - "\n", - "[10 rows x 21 columns]" - ] - }, - "execution_count": 3, - 
"metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# After importing : Immediate step is to get the friendship with the data.\n", - "\n", - "#Top 10 recordings\n", - "data.head(10)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "12cdffa9", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
countmeanstdmin25%50%75%max
SeniorCitizen7043.00.1621470.3686120.000.00.000.001.00
tenure7043.032.37114924.5594810.009.029.0055.0072.00
MonthlyCharges7043.064.76169230.09004718.2535.570.3589.85118.75
\n", - "
" - ], - "text/plain": [ - " count mean std min 25% 50% 75% \\\n", - "SeniorCitizen 7043.0 0.162147 0.368612 0.00 0.0 0.00 0.00 \n", - "tenure 7043.0 32.371149 24.559481 0.00 9.0 29.00 55.00 \n", - "MonthlyCharges 7043.0 64.761692 30.090047 18.25 35.5 70.35 89.85 \n", - "\n", - " max \n", - "SeniorCitizen 1.00 \n", - "tenure 72.00 \n", - "MonthlyCharges 118.75 " - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#descriptive stats for numeric columns\n", - "data.describe().T" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "512a42bb", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "customerID 0\n", - "gender 0\n", - "SeniorCitizen 0\n", - "Partner 0\n", - "Dependents 0\n", - "tenure 0\n", - "PhoneService 0\n", - "MultipleLines 0\n", - "InternetService 0\n", - "OnlineSecurity 0\n", - "OnlineBackup 0\n", - "DeviceProtection 0\n", - "TechSupport 0\n", - "StreamingTV 0\n", - "StreamingMovies 0\n", - "Contract 0\n", - "PaperlessBilling 0\n", - "PaymentMethod 0\n", - "MonthlyCharges 0\n", - "TotalCharges 0\n", - "Churn 0\n", - "dtype: int64" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#$checking null Values in the data\n", - "data.isnull().sum()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "3182c4a4", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "RangeIndex: 7043 entries, 0 to 7042\n", - "Data columns (total 21 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 customerID 7043 non-null object \n", - " 1 gender 7043 non-null object \n", - " 2 SeniorCitizen 7043 non-null int64 \n", - " 3 Partner 7043 non-null object \n", - " 4 Dependents 7043 non-null object \n", - " 5 tenure 7043 non-null int64 \n", - " 6 PhoneService 7043 non-null object \n", - " 7 MultipleLines 7043 
non-null object \n", - " 8 InternetService 7043 non-null object \n", - " 9 OnlineSecurity 7043 non-null object \n", - " 10 OnlineBackup 7043 non-null object \n", - " 11 DeviceProtection 7043 non-null object \n", - " 12 TechSupport 7043 non-null object \n", - " 13 StreamingTV 7043 non-null object \n", - " 14 StreamingMovies 7043 non-null object \n", - " 15 Contract 7043 non-null object \n", - " 16 PaperlessBilling 7043 non-null object \n", - " 17 PaymentMethod 7043 non-null object \n", - " 18 MonthlyCharges 7043 non-null float64\n", - " 19 TotalCharges 7043 non-null object \n", - " 20 Churn 7043 non-null object \n", - "dtypes: float64(1), int64(2), object(18)\n", - "memory usage: 1.1+ MB\n" - ] - } - ], - "source": [ - "data.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "12040083", - "metadata": {}, - "outputs": [], - "source": [ - "#changing total charges to numeric as it is continuos variable\n", - "data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors = 'coerce')\n", - "\n", - "#errors = 'coerce' : It will ignore all non-numeric values.It will replace all non-numeric values with NaN." 
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "f31e0524", - "metadata": {}, - "outputs": [], - "source": [ - "#drop nan values\n", - "data = data.dropna(how='any', axis = 0)\n", - "\n", - "# axis=0 will be removing rows from dataset.\n", - "# axis=1 will be removing columns" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "7a25184d", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Int64Index: 7032 entries, 0 to 7042\n", - "Data columns (total 21 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 customerID 7032 non-null object \n", - " 1 gender 7032 non-null object \n", - " 2 SeniorCitizen 7032 non-null int64 \n", - " 3 Partner 7032 non-null object \n", - " 4 Dependents 7032 non-null object \n", - " 5 tenure 7032 non-null int64 \n", - " 6 PhoneService 7032 non-null object \n", - " 7 MultipleLines 7032 non-null object \n", - " 8 InternetService 7032 non-null object \n", - " 9 OnlineSecurity 7032 non-null object \n", - " 10 OnlineBackup 7032 non-null object \n", - " 11 DeviceProtection 7032 non-null object \n", - " 12 TechSupport 7032 non-null object \n", - " 13 StreamingTV 7032 non-null object \n", - " 14 StreamingMovies 7032 non-null object \n", - " 15 Contract 7032 non-null object \n", - " 16 PaperlessBilling 7032 non-null object \n", - " 17 PaymentMethod 7032 non-null object \n", - " 18 MonthlyCharges 7032 non-null float64\n", - " 19 TotalCharges 7032 non-null float64\n", - " 20 Churn 7032 non-null object \n", - "dtypes: float64(2), int64(2), object(17)\n", - "memory usage: 1.2+ MB\n" - ] - } - ], - "source": [ - "data.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "7fbf471f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
indexcustomerIDgenderSeniorCitizenPartnerDependentstenurePhoneServiceMultipleLinesInternetService...DeviceProtectionTechSupportStreamingTVStreamingMoviesContractPaperlessBillingPaymentMethodMonthlyChargesTotalChargesChurn
007590-VHVEGFemale0YesNo1NoNo phone serviceDSL...NoNoNoNoMonth-to-monthYesElectronic check29.8529.85No
115575-GNVDEMale0NoNo34YesNoDSL...YesNoNoNoOne yearNoMailed check56.951889.50No
223668-QPYBKMale0NoNo2YesNoDSL...NoNoNoNoMonth-to-monthYesMailed check53.85108.15Yes
337795-CFOCWMale0NoNo45NoNo phone serviceDSL...YesYesNoNoOne yearNoBank transfer (automatic)42.301840.75No
449237-HQITUFemale0NoNo2YesNoFiber optic...NoNoNoNoMonth-to-monthYesElectronic check70.70151.65Yes
..................................................................
702770386840-RESVBMale0YesYes24YesYesDSL...YesYesYesYesOne yearYesMailed check84.801990.50No
702870392234-XADUHFemale0YesYes72YesYesFiber optic...YesNoYesYesOne yearYesCredit card (automatic)103.207362.90No
702970404801-JZAZLFemale0YesYes11NoNo phone serviceDSL...NoNoNoNoMonth-to-monthYesElectronic check29.60346.45No
703070418361-LTMKDMale1YesNo4YesYesFiber optic...NoNoNoNoMonth-to-monthYesMailed check74.40306.60Yes
703170423186-AJIEKMale0NoNo66YesNoFiber optic...YesYesYesYesTwo yearYesBank transfer (automatic)105.656844.50No
\n", - "

7032 rows × 22 columns

\n", - "
" - ], - "text/plain": [ - " index customerID gender SeniorCitizen Partner Dependents tenure \\\n", - "0 0 7590-VHVEG Female 0 Yes No 1 \n", - "1 1 5575-GNVDE Male 0 No No 34 \n", - "2 2 3668-QPYBK Male 0 No No 2 \n", - "3 3 7795-CFOCW Male 0 No No 45 \n", - "4 4 9237-HQITU Female 0 No No 2 \n", - "... ... ... ... ... ... ... ... \n", - "7027 7038 6840-RESVB Male 0 Yes Yes 24 \n", - "7028 7039 2234-XADUH Female 0 Yes Yes 72 \n", - "7029 7040 4801-JZAZL Female 0 Yes Yes 11 \n", - "7030 7041 8361-LTMKD Male 1 Yes No 4 \n", - "7031 7042 3186-AJIEK Male 0 No No 66 \n", - "\n", - " PhoneService MultipleLines InternetService ... DeviceProtection \\\n", - "0 No No phone service DSL ... No \n", - "1 Yes No DSL ... Yes \n", - "2 Yes No DSL ... No \n", - "3 No No phone service DSL ... Yes \n", - "4 Yes No Fiber optic ... No \n", - "... ... ... ... ... ... \n", - "7027 Yes Yes DSL ... Yes \n", - "7028 Yes Yes Fiber optic ... Yes \n", - "7029 No No phone service DSL ... No \n", - "7030 Yes Yes Fiber optic ... No \n", - "7031 Yes No Fiber optic ... Yes \n", - "\n", - " TechSupport StreamingTV StreamingMovies Contract PaperlessBilling \\\n", - "0 No No No Month-to-month Yes \n", - "1 No No No One year No \n", - "2 No No No Month-to-month Yes \n", - "3 Yes No No One year No \n", - "4 No No No Month-to-month Yes \n", - "... ... ... ... ... ... \n", - "7027 Yes Yes Yes One year Yes \n", - "7028 No Yes Yes One year Yes \n", - "7029 No No No Month-to-month Yes \n", - "7030 No No No Month-to-month Yes \n", - "7031 Yes Yes Yes Two year Yes \n", - "\n", - " PaymentMethod MonthlyCharges TotalCharges Churn \n", - "0 Electronic check 29.85 29.85 No \n", - "1 Mailed check 56.95 1889.50 No \n", - "2 Mailed check 53.85 108.15 Yes \n", - "3 Bank transfer (automatic) 42.30 1840.75 No \n", - "4 Electronic check 70.70 151.65 Yes \n", - "... ... ... ... ... 
\n", - "7027 Mailed check 84.80 1990.50 No \n", - "7028 Credit card (automatic) 103.20 7362.90 No \n", - "7029 Electronic check 29.60 346.45 No \n", - "7030 Mailed check 74.40 306.60 Yes \n", - "7031 Bank transfer (automatic) 105.65 6844.50 No \n", - "\n", - "[7032 rows x 22 columns]" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data = data.reset_index()\n", - "data\n", - "\n", - "#Pandas reset_index() is a method to reset index of a Data Frame. \n", - "#reset_index() method sets a list of integer ranging from 0 to length of data as index." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "8b145d67", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['customerID', 'gender', 'Partner', 'Dependents', 'PhoneService',\n", - " 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',\n", - " 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',\n", - " 'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn'],\n", - " dtype='object')" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#extratcing columns names with \"object\" Datatype\n", - "cols = data.select_dtypes(include=['object']).columns\n", - "cols\n", - "# 17 object data types are present" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "f0b567d6", - "metadata": {}, - "outputs": [], - "source": [ - "#copying datset\n", - "data2 = data.copy()" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "b6e27dc6", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "RangeIndex: 7032 entries, 0 to 7031\n", - "Data columns (total 22 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 index 7032 non-null int64 \n", - " 1 customerID 7032 non-null int32 \n", - " 2 gender 7032 non-null int32 
\n", - " 3 SeniorCitizen 7032 non-null int64 \n", - " 4 Partner 7032 non-null int32 \n", - " 5 Dependents 7032 non-null int32 \n", - " 6 tenure 7032 non-null int64 \n", - " 7 PhoneService 7032 non-null int32 \n", - " 8 MultipleLines 7032 non-null int32 \n", - " 9 InternetService 7032 non-null int32 \n", - " 10 OnlineSecurity 7032 non-null int32 \n", - " 11 OnlineBackup 7032 non-null int32 \n", - " 12 DeviceProtection 7032 non-null int32 \n", - " 13 TechSupport 7032 non-null int32 \n", - " 14 StreamingTV 7032 non-null int32 \n", - " 15 StreamingMovies 7032 non-null int32 \n", - " 16 Contract 7032 non-null int32 \n", - " 17 PaperlessBilling 7032 non-null int32 \n", - " 18 PaymentMethod 7032 non-null int32 \n", - " 19 MonthlyCharges 7032 non-null float64\n", - " 20 TotalCharges 7032 non-null float64\n", - " 21 Churn 7032 non-null int32 \n", - "dtypes: float64(2), int32(17), int64(3)\n", - "memory usage: 741.8 KB\n", - "None\n" - ] - } - ], - "source": [ - "#import the LabelEncoder class from the sklearn library,\n", - "from sklearn.preprocessing import LabelEncoder\n", - "\n", - "#making instance of labelnecoder\n", - "le = LabelEncoder()\n", - "\n", - "\n", - "#Label Encoding is a popular encoding technique for handling categorical variables.\n", - "#In this technique, each label is assigned a unique integer based on alphabetical ordering\n", - "#It can also be used to transform non-numerical labels to numerical labels.\n", - "\n", - "\n", - "for i in cols:\n", - " data2[i] = le.fit_transform(data2[i])\n", - " \n", - "#fit and transform the object data, and then replace the existing text data with the new encoded data.\n", - "\n", - "# checking the datatypes chnges or not\n", - "print (data2.info())" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "2051b0f0", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
indexcustomerIDgenderSeniorCitizenPartnerDependentstenurePhoneServiceMultipleLinesInternetService...DeviceProtectionTechSupportStreamingTVStreamingMoviesContractPaperlessBillingPaymentMethodMonthlyChargesTotalChargesChurn
00536500101010...000001229.8529.850
113953100034100...200010356.951889.500
22255810002100...000001353.85108.151
335524100045010...220010042.301840.750
44650000002101...000001270.70151.651
\n", - "

5 rows × 22 columns

\n", - "
" - ], - "text/plain": [ - " index customerID gender SeniorCitizen Partner Dependents tenure \\\n", - "0 0 5365 0 0 1 0 1 \n", - "1 1 3953 1 0 0 0 34 \n", - "2 2 2558 1 0 0 0 2 \n", - "3 3 5524 1 0 0 0 45 \n", - "4 4 6500 0 0 0 0 2 \n", - "\n", - " PhoneService MultipleLines InternetService ... DeviceProtection \\\n", - "0 0 1 0 ... 0 \n", - "1 1 0 0 ... 2 \n", - "2 1 0 0 ... 0 \n", - "3 0 1 0 ... 2 \n", - "4 1 0 1 ... 0 \n", - "\n", - " TechSupport StreamingTV StreamingMovies Contract PaperlessBilling \\\n", - "0 0 0 0 0 1 \n", - "1 0 0 0 1 0 \n", - "2 0 0 0 0 1 \n", - "3 2 0 0 1 0 \n", - "4 0 0 0 0 1 \n", - "\n", - " PaymentMethod MonthlyCharges TotalCharges Churn \n", - "0 2 29.85 29.85 0 \n", - "1 3 56.95 1889.50 0 \n", - "2 3 53.85 108.15 1 \n", - "3 0 42.30 1840.75 0 \n", - "4 2 70.70 151.65 1 \n", - "\n", - "[5 rows x 22 columns]" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data2.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "694b1ab5", - "metadata": {}, - "outputs": [], - "source": [ - "#Pandas profiling is an open source Python module with which we can quickly do an\n", - "#exploratory data analysis with just a few lines of code.\n", - "\n", - "\n", - "import pandas_profiling\n", - "\n", - "# SweetViz\n", - "# Pandas-Profiling\n", - "# Sweetviz\n", - "# Autoviz\n", - "# D-Tale" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "8f03c4a7", - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "ed2a3f58d6234b028179f208a2d34cae", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Summarize dataset: 0%| | 0/35 [00:00" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - 
"pandas_profiling.ProfileReport(data2)\n", - "\n", - "#The main disadvantage of pandas profiling is its use with large datasets.\n", - "#With the increase in the size of the data the time to generate the report also increases a lot." - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "c12ed8cf", - "metadata": {}, - "outputs": [], - "source": [ - "def var_summary(x):\n", - " uc = x.mean()+(2*x.std())\n", - " lc = x.mean()-(2*x.std())\n", - " \n", - " for i in x:\n", - " if iuc:\n", - " count = 1\n", - " else:\n", - " count = 0\n", - " outlier_flag = count\n", - " return pd.Series([x.count(), x.isnull().sum(), x.sum(), x.mean(), x.median(), x.std(), x.var(), x.min(), x.quantile(0.01), x.quantile(0.05),x.quantile(0.10),x.quantile(0.25),x.quantile(0.50),x.quantile(0.75), x.quantile(0.90),x.quantile(0.95), x.quantile(0.99),x.max() , lc , uc,outlier_flag],\n", - " index=['N', 'NMISS', 'SUM', 'MEAN','MEDIAN', 'STD', 'VAR', 'MIN', 'P1' , 'P5' ,'P10' ,'P25' ,'P50' ,'P75' ,'P90' ,'P95' ,'P99' ,'MAX','LC','UC','outlier_flag'])\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "eb5ff419", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
NNMISSSUMMEANMEDIANSTDVARMINP1P5...P25P50P75P90P95P99MAXLCUCoutlier_flag
df_index7032.00.024763625.03521.5621443521.5002032.8324484.132408e+060.0070.31351.550...1762.75003521.5005282.25006336.9006689.45006971.69007042.00-544.1027527587.2270410.0
customerID7032.00.024720996.03515.5000003515.5002030.1078794.121338e+060.0070.31351.550...1757.75003515.5005273.25006327.9006679.45006960.69007031.00-544.7157587575.7157580.0
gender7032.00.03549.00.5046931.0000.5000142.500135e-010.000.000.000...0.00001.0001.00001.0001.00001.00001.00-0.4953341.5047200.0
SeniorCitizen7032.00.01142.00.1624000.0000.3688441.360459e-010.000.000.000...0.00000.0000.00001.0001.00001.00001.00-0.5752880.9000880.0
Partner7032.00.03393.00.4825090.0000.4997292.497296e-010.000.000.000...0.00000.0001.00001.0001.00001.00001.00-0.5169501.4819680.0
Dependents7032.00.02099.00.2984930.0000.4576292.094246e-010.000.000.000...0.00000.0001.00001.0001.00001.00001.00-0.6167661.2137510.0
tenure7032.00.0227990.032.42178629.00024.5452606.024698e+021.001.001.000...9.000029.00055.000069.00072.000072.000072.00-16.66873381.5123060.0
PhoneService7032.00.06352.00.9032991.0000.2955718.736218e-020.000.000.000...1.00001.0001.00001.0001.00001.00001.000.3121571.4944410.0
MultipleLines7032.00.06614.00.9405571.0000.9486278.998938e-010.000.000.000...0.00001.0002.00002.0002.00002.00002.00-0.9566972.8378120.0
InternetService7032.00.06136.00.8725821.0000.7372715.435690e-010.000.000.000...0.00001.0001.00002.0002.00002.00002.00-0.6019602.3471250.0
OnlineSecurity7032.00.05550.00.7892491.0000.8599627.395345e-010.000.000.000...0.00001.0002.00002.0002.00002.00002.00-0.9306752.5091730.0
OnlineBackup7032.00.06370.00.9058591.0000.8803947.750930e-010.000.000.000...0.00001.0002.00002.0002.00002.00002.00-0.8549282.6666460.0
DeviceProtection7032.00.06356.00.9038681.0000.8801787.747141e-010.000.000.000...0.00001.0002.00002.0002.00002.00002.00-0.8564892.6642250.0
TechSupport7032.00.05600.00.7963591.0000.8616747.424814e-010.000.000.000...0.00001.0002.00002.0002.00002.00002.00-0.9269882.5197070.0
StreamingTV7032.00.06926.00.9849261.0000.8852857.837295e-010.000.000.000...0.00001.0002.00002.0002.00002.00002.00-0.7856442.7554960.0
StreamingMovies7032.00.06982.00.9928901.0000.8853857.839062e-010.000.000.000...0.00001.0002.00002.0002.00002.00002.00-0.7778802.7636590.0
Contract7032.00.04842.00.6885670.0000.8329346.937791e-010.000.000.000...0.00000.0001.00002.0002.00002.00002.00-0.9773012.3544350.0
PaperlessBilling7032.00.04168.00.5927191.0000.4913632.414375e-010.000.000.000...0.00001.0001.00001.0001.00001.00001.00-0.3900071.5754450.0
PaymentMethod7032.00.011063.01.5732372.0001.0675041.139565e+000.000.000.000...1.00002.0002.00003.0003.00003.00003.00-0.5617723.7082450.0
MonthlyCharges7032.00.0455661.064.79820870.35030.0859749.051658e+0218.2519.2019.650...35.587570.35089.8625102.645107.4225114.7345118.754.626260124.9701560.0
TotalCharges7032.00.016056168.72283.3004411397.4752266.7713625.138252e+0618.8019.9049.605...401.45001397.4753794.73755976.6406923.59008039.88308684.80-2250.2422836816.8431651.0
Churn7032.00.01869.00.2657850.0000.4417821.951711e-010.000.000.000...0.00000.0001.00001.0001.00001.00001.00-0.6177781.1493480.0
\n", - "

22 rows × 21 columns

\n", - "
" - ], - "text/plain": [ - " N NMISS SUM MEAN MEDIAN \\\n", - "df_index 7032.0 0.0 24763625.0 3521.562144 3521.500 \n", - "customerID 7032.0 0.0 24720996.0 3515.500000 3515.500 \n", - "gender 7032.0 0.0 3549.0 0.504693 1.000 \n", - "SeniorCitizen 7032.0 0.0 1142.0 0.162400 0.000 \n", - "Partner 7032.0 0.0 3393.0 0.482509 0.000 \n", - "Dependents 7032.0 0.0 2099.0 0.298493 0.000 \n", - "tenure 7032.0 0.0 227990.0 32.421786 29.000 \n", - "PhoneService 7032.0 0.0 6352.0 0.903299 1.000 \n", - "MultipleLines 7032.0 0.0 6614.0 0.940557 1.000 \n", - "InternetService 7032.0 0.0 6136.0 0.872582 1.000 \n", - "OnlineSecurity 7032.0 0.0 5550.0 0.789249 1.000 \n", - "OnlineBackup 7032.0 0.0 6370.0 0.905859 1.000 \n", - "DeviceProtection 7032.0 0.0 6356.0 0.903868 1.000 \n", - "TechSupport 7032.0 0.0 5600.0 0.796359 1.000 \n", - "StreamingTV 7032.0 0.0 6926.0 0.984926 1.000 \n", - "StreamingMovies 7032.0 0.0 6982.0 0.992890 1.000 \n", - "Contract 7032.0 0.0 4842.0 0.688567 0.000 \n", - "PaperlessBilling 7032.0 0.0 4168.0 0.592719 1.000 \n", - "PaymentMethod 7032.0 0.0 11063.0 1.573237 2.000 \n", - "MonthlyCharges 7032.0 0.0 455661.0 64.798208 70.350 \n", - "TotalCharges 7032.0 0.0 16056168.7 2283.300441 1397.475 \n", - "Churn 7032.0 0.0 1869.0 0.265785 0.000 \n", - "\n", - " STD VAR MIN P1 P5 ... \\\n", - "df_index 2032.832448 4.132408e+06 0.00 70.31 351.550 ... \n", - "customerID 2030.107879 4.121338e+06 0.00 70.31 351.550 ... \n", - "gender 0.500014 2.500135e-01 0.00 0.00 0.000 ... \n", - "SeniorCitizen 0.368844 1.360459e-01 0.00 0.00 0.000 ... \n", - "Partner 0.499729 2.497296e-01 0.00 0.00 0.000 ... \n", - "Dependents 0.457629 2.094246e-01 0.00 0.00 0.000 ... \n", - "tenure 24.545260 6.024698e+02 1.00 1.00 1.000 ... \n", - "PhoneService 0.295571 8.736218e-02 0.00 0.00 0.000 ... \n", - "MultipleLines 0.948627 8.998938e-01 0.00 0.00 0.000 ... \n", - "InternetService 0.737271 5.435690e-01 0.00 0.00 0.000 ... \n", - "OnlineSecurity 0.859962 7.395345e-01 0.00 0.00 0.000 ... 
\n", - "OnlineBackup 0.880394 7.750930e-01 0.00 0.00 0.000 ... \n", - "DeviceProtection 0.880178 7.747141e-01 0.00 0.00 0.000 ... \n", - "TechSupport 0.861674 7.424814e-01 0.00 0.00 0.000 ... \n", - "StreamingTV 0.885285 7.837295e-01 0.00 0.00 0.000 ... \n", - "StreamingMovies 0.885385 7.839062e-01 0.00 0.00 0.000 ... \n", - "Contract 0.832934 6.937791e-01 0.00 0.00 0.000 ... \n", - "PaperlessBilling 0.491363 2.414375e-01 0.00 0.00 0.000 ... \n", - "PaymentMethod 1.067504 1.139565e+00 0.00 0.00 0.000 ... \n", - "MonthlyCharges 30.085974 9.051658e+02 18.25 19.20 19.650 ... \n", - "TotalCharges 2266.771362 5.138252e+06 18.80 19.90 49.605 ... \n", - "Churn 0.441782 1.951711e-01 0.00 0.00 0.000 ... \n", - "\n", - " P25 P50 P75 P90 P95 \\\n", - "df_index 1762.7500 3521.500 5282.2500 6336.900 6689.4500 \n", - "customerID 1757.7500 3515.500 5273.2500 6327.900 6679.4500 \n", - "gender 0.0000 1.000 1.0000 1.000 1.0000 \n", - "SeniorCitizen 0.0000 0.000 0.0000 1.000 1.0000 \n", - "Partner 0.0000 0.000 1.0000 1.000 1.0000 \n", - "Dependents 0.0000 0.000 1.0000 1.000 1.0000 \n", - "tenure 9.0000 29.000 55.0000 69.000 72.0000 \n", - "PhoneService 1.0000 1.000 1.0000 1.000 1.0000 \n", - "MultipleLines 0.0000 1.000 2.0000 2.000 2.0000 \n", - "InternetService 0.0000 1.000 1.0000 2.000 2.0000 \n", - "OnlineSecurity 0.0000 1.000 2.0000 2.000 2.0000 \n", - "OnlineBackup 0.0000 1.000 2.0000 2.000 2.0000 \n", - "DeviceProtection 0.0000 1.000 2.0000 2.000 2.0000 \n", - "TechSupport 0.0000 1.000 2.0000 2.000 2.0000 \n", - "StreamingTV 0.0000 1.000 2.0000 2.000 2.0000 \n", - "StreamingMovies 0.0000 1.000 2.0000 2.000 2.0000 \n", - "Contract 0.0000 0.000 1.0000 2.000 2.0000 \n", - "PaperlessBilling 0.0000 1.000 1.0000 1.000 1.0000 \n", - "PaymentMethod 1.0000 2.000 2.0000 3.000 3.0000 \n", - "MonthlyCharges 35.5875 70.350 89.8625 102.645 107.4225 \n", - "TotalCharges 401.4500 1397.475 3794.7375 5976.640 6923.5900 \n", - "Churn 0.0000 0.000 1.0000 1.000 1.0000 \n", - "\n", - " P99 MAX LC UC 
outlier_flag \n", - "df_index 6971.6900 7042.00 -544.102752 7587.227041 0.0 \n", - "customerID 6960.6900 7031.00 -544.715758 7575.715758 0.0 \n", - "gender 1.0000 1.00 -0.495334 1.504720 0.0 \n", - "SeniorCitizen 1.0000 1.00 -0.575288 0.900088 0.0 \n", - "Partner 1.0000 1.00 -0.516950 1.481968 0.0 \n", - "Dependents 1.0000 1.00 -0.616766 1.213751 0.0 \n", - "tenure 72.0000 72.00 -16.668733 81.512306 0.0 \n", - "PhoneService 1.0000 1.00 0.312157 1.494441 0.0 \n", - "MultipleLines 2.0000 2.00 -0.956697 2.837812 0.0 \n", - "InternetService 2.0000 2.00 -0.601960 2.347125 0.0 \n", - "OnlineSecurity 2.0000 2.00 -0.930675 2.509173 0.0 \n", - "OnlineBackup 2.0000 2.00 -0.854928 2.666646 0.0 \n", - "DeviceProtection 2.0000 2.00 -0.856489 2.664225 0.0 \n", - "TechSupport 2.0000 2.00 -0.926988 2.519707 0.0 \n", - "StreamingTV 2.0000 2.00 -0.785644 2.755496 0.0 \n", - "StreamingMovies 2.0000 2.00 -0.777880 2.763659 0.0 \n", - "Contract 2.0000 2.00 -0.977301 2.354435 0.0 \n", - "PaperlessBilling 1.0000 1.00 -0.390007 1.575445 0.0 \n", - "PaymentMethod 3.0000 3.00 -0.561772 3.708245 0.0 \n", - "MonthlyCharges 114.7345 118.75 4.626260 124.970156 0.0 \n", - "TotalCharges 8039.8830 8684.80 -2250.242283 6816.843165 1.0 \n", - "Churn 1.0000 1.00 -0.617778 1.149348 0.0 \n", - "\n", - "[22 rows x 21 columns]" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data2.apply(lambda x: var_summary(x)).T" - ] - }, - { - "cell_type": "markdown", - "id": "ac7d7dec", - "metadata": {}, - "source": [ - "# Linear Regression" - ] - }, - { - "cell_type": "markdown", - "id": "21f26774", - "metadata": {}, - "source": [ - "\n", - "> Linear regression is an approach for modeling the relationship between a scalar dependent variable y and one or more explanatory variables (or independent variables) denoted X. The case of one explanatory variable is called simple linear regression. 
For more than one explanatory variable, the process is called multiple linear regression.
\n", - "> A simple linear regression model is given by Y = a + bX
\n", - "> where b is the slope and a is the y-intercept. Y is the dependent variable and X is the explanatory variable.
\n", - "> Very briefly and simplistically, Linear Regression is a class of techniques for fitting a straight line to a set of data points.\n", - "Linear Regression has dependent variables that have continuous values" - ] - }, - { - "cell_type": "markdown", - "id": "a6f253da", - "metadata": {}, - "source": [ - "A linear regression line has an equation of the form Y = a + bX, where X is the explanatory variable and Y is the dependent variable. The slope of the line is b, and a is the intercept (the value of y when x = 0)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7a0c5ae7", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "96840e52", - "metadata": {}, - "outputs": [], - "source": [ - "X_1 = data2.drop('TotalCharges', axis = 1)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "519b7e27", - "metadata": {}, - "outputs": [], - "source": [ - "y_1 = data2['TotalCharges']" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "5d6fde32", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
df_indexcustomerIDgenderSeniorCitizenPartnerDependentstenurePhoneServiceMultipleLinesInternetService...DeviceProtectionTechSupportStreamingTVStreamingMoviesContractPaperlessBillingPaymentMethodMonthlyChargesTotalChargesChurn
df_index1.000000-0.006309-0.0084630.0080570.000162-0.0047620.006801-0.028257-0.006518-0.007689...-0.0063530.008621-0.000675-0.0172010.0039490.0021430.021046-0.0129380.0018200.010133
customerID-0.0063091.0000000.006235-0.002368-0.026509-0.0118710.007209-0.0069870.004497-0.012335...-0.0067260.001763-0.007650-0.0172070.015949-0.0022250.011754-0.004445-0.000263-0.017858
gender-0.0084630.0062351.000000-0.001819-0.0013790.0103490.005285-0.007515-0.006908-0.002236...0.001348-0.006695-0.005624-0.0089200.000095-0.0119020.016942-0.0137790.000048-0.008545
SeniorCitizen0.008057-0.002368-0.0018191.0000000.016957-0.2105500.0156830.0083920.146287-0.032160...-0.021124-0.1510070.0310190.047088-0.1418200.156258-0.0381580.2198740.1024110.150541
Partner0.000162-0.026509-0.0013790.0169571.0000000.4522690.3819120.0183970.1427170.000513...0.1656140.1264880.1366790.1299070.294094-0.013957-0.1562320.0978250.319072-0.149982
Dependents-0.004762-0.0118710.010349-0.2105500.4522691.0000000.163386-0.001078-0.0249750.044030...0.0797230.1325300.0462140.0220880.240556-0.110131-0.041989-0.1123430.064653-0.163128
tenure0.0068010.0072090.0052850.0156830.3819120.1633861.0000000.0078770.343673-0.029835...0.3726690.3247290.2905720.2967850.6767340.004823-0.3700870.2468620.825880-0.354049
PhoneService-0.028257-0.006987-0.0075150.0083920.018397-0.0010780.0078771.000000-0.0205040.387266...0.004718-0.0181360.0563930.0430250.0030190.016696-0.0054990.2480330.1130080.011691
MultipleLines-0.0065180.004497-0.0069080.1462870.142717-0.0249750.343673-0.0205041.000000-0.108849...0.1226140.0109410.1754030.1817050.1110290.165306-0.1765980.4339050.4532020.038043
InternetService-0.007689-0.012335-0.002236-0.0321600.0005130.044030-0.0298350.387266-0.1088491.000000...0.045558-0.0256260.1081900.0979670.099579-0.1381660.084504-0.322173-0.175691-0.047097
OnlineSecurity-0.0010530.013740-0.014899-0.1279370.1506100.1511980.327283-0.0141630.007306-0.028003...0.1757890.2848750.0443990.0563130.373980-0.157723-0.096593-0.0535760.254473-0.289050
OnlineBackup0.004099-0.002960-0.011920-0.0133550.1530450.0902310.3724340.0240400.1172760.036735...0.1876460.1955810.1470850.1370830.280617-0.012697-0.1255340.1199430.375556-0.195290
DeviceProtection-0.006353-0.0067260.001348-0.0211240.1656140.0797230.3726690.0047180.1226140.045558...1.0000000.2404760.2759470.2893090.350067-0.037596-0.1364600.1639840.389066-0.177883
TechSupport0.0086210.001763-0.006695-0.1510070.1264880.1325300.324729-0.0181360.010941-0.025626...0.2404761.0000000.1611680.1625300.425072-0.113617-0.104544-0.0082370.276890-0.282232
StreamingTV-0.000675-0.007650-0.0056240.0310190.1366790.0462140.2905720.0563930.1754030.108190...0.2759470.1611681.0000000.4353540.2268260.097379-0.1047820.3371560.392472-0.036303
StreamingMovies-0.017201-0.017207-0.0089200.0470880.1299070.0220880.2967850.0430250.1817050.097967...0.2893090.1625300.4353541.0000000.2324780.083901-0.1120090.3357610.398088-0.038802
Contract0.0039490.0159490.000095-0.1418200.2940940.2405560.6767340.0030190.1110290.099579...0.3500670.4250720.2268260.2324781.000000-0.175475-0.229636-0.0727390.450306-0.396150
PaperlessBilling0.002143-0.002225-0.0119020.156258-0.013957-0.1101310.0048230.0166960.165306-0.138166...-0.037596-0.1136170.0973790.083901-0.1754751.000000-0.0613480.3519300.1578300.191454
PaymentMethod0.0210460.0117540.016942-0.038158-0.156232-0.041989-0.370087-0.005499-0.1765980.084504...-0.136460-0.104544-0.104782-0.112009-0.229636-0.0613481.000000-0.192500-0.3305940.107852
MonthlyCharges-0.012938-0.004445-0.0137790.2198740.097825-0.1123430.2468620.2480330.433905-0.322173...0.163984-0.0082370.3371560.335761-0.0727390.351930-0.1925001.0000000.6510650.192858
TotalCharges0.001820-0.0002630.0000480.1024110.3190720.0646530.8258800.1130080.453202-0.175691...0.3890660.2768900.3924720.3980880.4503060.157830-0.3305940.6510651.000000-0.199484
Churn0.010133-0.017858-0.0085450.150541-0.149982-0.163128-0.3540490.0116910.038043-0.047097...-0.177883-0.282232-0.036303-0.038802-0.3961500.1914540.1078520.192858-0.1994841.000000
\n", - "

22 rows × 22 columns

\n", - "
" - ], - "text/plain": [ - " df_index customerID gender SeniorCitizen Partner \\\n", - "df_index 1.000000 -0.006309 -0.008463 0.008057 0.000162 \n", - "customerID -0.006309 1.000000 0.006235 -0.002368 -0.026509 \n", - "gender -0.008463 0.006235 1.000000 -0.001819 -0.001379 \n", - "SeniorCitizen 0.008057 -0.002368 -0.001819 1.000000 0.016957 \n", - "Partner 0.000162 -0.026509 -0.001379 0.016957 1.000000 \n", - "Dependents -0.004762 -0.011871 0.010349 -0.210550 0.452269 \n", - "tenure 0.006801 0.007209 0.005285 0.015683 0.381912 \n", - "PhoneService -0.028257 -0.006987 -0.007515 0.008392 0.018397 \n", - "MultipleLines -0.006518 0.004497 -0.006908 0.146287 0.142717 \n", - "InternetService -0.007689 -0.012335 -0.002236 -0.032160 0.000513 \n", - "OnlineSecurity -0.001053 0.013740 -0.014899 -0.127937 0.150610 \n", - "OnlineBackup 0.004099 -0.002960 -0.011920 -0.013355 0.153045 \n", - "DeviceProtection -0.006353 -0.006726 0.001348 -0.021124 0.165614 \n", - "TechSupport 0.008621 0.001763 -0.006695 -0.151007 0.126488 \n", - "StreamingTV -0.000675 -0.007650 -0.005624 0.031019 0.136679 \n", - "StreamingMovies -0.017201 -0.017207 -0.008920 0.047088 0.129907 \n", - "Contract 0.003949 0.015949 0.000095 -0.141820 0.294094 \n", - "PaperlessBilling 0.002143 -0.002225 -0.011902 0.156258 -0.013957 \n", - "PaymentMethod 0.021046 0.011754 0.016942 -0.038158 -0.156232 \n", - "MonthlyCharges -0.012938 -0.004445 -0.013779 0.219874 0.097825 \n", - "TotalCharges 0.001820 -0.000263 0.000048 0.102411 0.319072 \n", - "Churn 0.010133 -0.017858 -0.008545 0.150541 -0.149982 \n", - "\n", - " Dependents tenure PhoneService MultipleLines \\\n", - "df_index -0.004762 0.006801 -0.028257 -0.006518 \n", - "customerID -0.011871 0.007209 -0.006987 0.004497 \n", - "gender 0.010349 0.005285 -0.007515 -0.006908 \n", - "SeniorCitizen -0.210550 0.015683 0.008392 0.146287 \n", - "Partner 0.452269 0.381912 0.018397 0.142717 \n", - "Dependents 1.000000 0.163386 -0.001078 -0.024975 \n", - "tenure 0.163386 1.000000 
0.007877 0.343673 \n", - "PhoneService -0.001078 0.007877 1.000000 -0.020504 \n", - "MultipleLines -0.024975 0.343673 -0.020504 1.000000 \n", - "InternetService 0.044030 -0.029835 0.387266 -0.108849 \n", - "OnlineSecurity 0.151198 0.327283 -0.014163 0.007306 \n", - "OnlineBackup 0.090231 0.372434 0.024040 0.117276 \n", - "DeviceProtection 0.079723 0.372669 0.004718 0.122614 \n", - "TechSupport 0.132530 0.324729 -0.018136 0.010941 \n", - "StreamingTV 0.046214 0.290572 0.056393 0.175403 \n", - "StreamingMovies 0.022088 0.296785 0.043025 0.181705 \n", - "Contract 0.240556 0.676734 0.003019 0.111029 \n", - "PaperlessBilling -0.110131 0.004823 0.016696 0.165306 \n", - "PaymentMethod -0.041989 -0.370087 -0.005499 -0.176598 \n", - "MonthlyCharges -0.112343 0.246862 0.248033 0.433905 \n", - "TotalCharges 0.064653 0.825880 0.113008 0.453202 \n", - "Churn -0.163128 -0.354049 0.011691 0.038043 \n", - "\n", - " InternetService ... DeviceProtection TechSupport \\\n", - "df_index -0.007689 ... -0.006353 0.008621 \n", - "customerID -0.012335 ... -0.006726 0.001763 \n", - "gender -0.002236 ... 0.001348 -0.006695 \n", - "SeniorCitizen -0.032160 ... -0.021124 -0.151007 \n", - "Partner 0.000513 ... 0.165614 0.126488 \n", - "Dependents 0.044030 ... 0.079723 0.132530 \n", - "tenure -0.029835 ... 0.372669 0.324729 \n", - "PhoneService 0.387266 ... 0.004718 -0.018136 \n", - "MultipleLines -0.108849 ... 0.122614 0.010941 \n", - "InternetService 1.000000 ... 0.045558 -0.025626 \n", - "OnlineSecurity -0.028003 ... 0.175789 0.284875 \n", - "OnlineBackup 0.036735 ... 0.187646 0.195581 \n", - "DeviceProtection 0.045558 ... 1.000000 0.240476 \n", - "TechSupport -0.025626 ... 0.240476 1.000000 \n", - "StreamingTV 0.108190 ... 0.275947 0.161168 \n", - "StreamingMovies 0.097967 ... 0.289309 0.162530 \n", - "Contract 0.099579 ... 0.350067 0.425072 \n", - "PaperlessBilling -0.138166 ... -0.037596 -0.113617 \n", - "PaymentMethod 0.084504 ... -0.136460 -0.104544 \n", - "MonthlyCharges -0.322173 ... 
0.163984 -0.008237 \n", - "TotalCharges -0.175691 ... 0.389066 0.276890 \n", - "Churn -0.047097 ... -0.177883 -0.282232 \n", - "\n", - " StreamingTV StreamingMovies Contract PaperlessBilling \\\n", - "df_index -0.000675 -0.017201 0.003949 0.002143 \n", - "customerID -0.007650 -0.017207 0.015949 -0.002225 \n", - "gender -0.005624 -0.008920 0.000095 -0.011902 \n", - "SeniorCitizen 0.031019 0.047088 -0.141820 0.156258 \n", - "Partner 0.136679 0.129907 0.294094 -0.013957 \n", - "Dependents 0.046214 0.022088 0.240556 -0.110131 \n", - "tenure 0.290572 0.296785 0.676734 0.004823 \n", - "PhoneService 0.056393 0.043025 0.003019 0.016696 \n", - "MultipleLines 0.175403 0.181705 0.111029 0.165306 \n", - "InternetService 0.108190 0.097967 0.099579 -0.138166 \n", - "OnlineSecurity 0.044399 0.056313 0.373980 -0.157723 \n", - "OnlineBackup 0.147085 0.137083 0.280617 -0.012697 \n", - "DeviceProtection 0.275947 0.289309 0.350067 -0.037596 \n", - "TechSupport 0.161168 0.162530 0.425072 -0.113617 \n", - "StreamingTV 1.000000 0.435354 0.226826 0.097379 \n", - "StreamingMovies 0.435354 1.000000 0.232478 0.083901 \n", - "Contract 0.226826 0.232478 1.000000 -0.175475 \n", - "PaperlessBilling 0.097379 0.083901 -0.175475 1.000000 \n", - "PaymentMethod -0.104782 -0.112009 -0.229636 -0.061348 \n", - "MonthlyCharges 0.337156 0.335761 -0.072739 0.351930 \n", - "TotalCharges 0.392472 0.398088 0.450306 0.157830 \n", - "Churn -0.036303 -0.038802 -0.396150 0.191454 \n", - "\n", - " PaymentMethod MonthlyCharges TotalCharges Churn \n", - "df_index 0.021046 -0.012938 0.001820 0.010133 \n", - "customerID 0.011754 -0.004445 -0.000263 -0.017858 \n", - "gender 0.016942 -0.013779 0.000048 -0.008545 \n", - "SeniorCitizen -0.038158 0.219874 0.102411 0.150541 \n", - "Partner -0.156232 0.097825 0.319072 -0.149982 \n", - "Dependents -0.041989 -0.112343 0.064653 -0.163128 \n", - "tenure -0.370087 0.246862 0.825880 -0.354049 \n", - "PhoneService -0.005499 0.248033 0.113008 0.011691 \n", - "MultipleLines -0.176598 
0.433905 0.453202 0.038043 \n", - "InternetService 0.084504 -0.322173 -0.175691 -0.047097 \n", - "OnlineSecurity -0.096593 -0.053576 0.254473 -0.289050 \n", - "OnlineBackup -0.125534 0.119943 0.375556 -0.195290 \n", - "DeviceProtection -0.136460 0.163984 0.389066 -0.177883 \n", - "TechSupport -0.104544 -0.008237 0.276890 -0.282232 \n", - "StreamingTV -0.104782 0.337156 0.392472 -0.036303 \n", - "StreamingMovies -0.112009 0.335761 0.398088 -0.038802 \n", - "Contract -0.229636 -0.072739 0.450306 -0.396150 \n", - "PaperlessBilling -0.061348 0.351930 0.157830 0.191454 \n", - "PaymentMethod 1.000000 -0.192500 -0.330594 0.107852 \n", - "MonthlyCharges -0.192500 1.000000 0.651065 0.192858 \n", - "TotalCharges -0.330594 0.651065 1.000000 -0.199484 \n", - "Churn 0.107852 0.192858 -0.199484 1.000000 \n", - "\n", - "[22 rows x 22 columns]" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data2.corr()" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "0cc6ad35", - "metadata": {}, - "outputs": [], - "source": [ - "# improting library for splitting the data for training and testing\n", - "from sklearn.model_selection import train_test_split\n", - "X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_1, y_1, test_size = 0.25, random_state = 1111)" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "5605adb1", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(5274, 21)\n", - "(1758, 21)\n", - "(5274,)\n", - "(1758,)\n" - ] - } - ], - "source": [ - "#checking the shape of training and testing data\n", - "print(X_train_1.shape)\n", - "print(X_test_1.shape)\n", - "print(y_train_1.shape)\n", - "print(y_test_1.shape)" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "bdd8c777", - "metadata": {}, - "outputs": [], - "source": [ - "# * Feature Scaling | Scaling the variables | standardizign the variable | Z - 
score | Mean = 0 and STD = 1\n", - "# To get all the variables on same scale [towards ZERO]\n", - "from sklearn.preprocessing import StandardScaler\n", - "sc = StandardScaler()\n", - "\n", - "\n", - "\n", - "#The fit method is calculating the mean and variance of each of the features present in our train data.\n", - "#The transform method is transforming all the features using the respective mean and variance. \n", - "\n", - "X_train_1 = sc.fit_transform(X_train_1)# you are finding the MEan and STD{with the fit()\n", - "# }on training data and aslo transforming that\n", - "\n", - "X_test_1= sc.transform(X_test_1) # Only tranforming now\n", - "\n", - "\n", - "#transform method we can use the same mean and variance as it is calculated from our training data to transform our test data." - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "76616edb", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "LinearRegression()" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sklearn.linear_model import LinearRegression\n", - "lm = LinearRegression() \n", - "lm.fit(X_train_1, y_train_1)" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "3aca06f9", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2277.7827645051198\n" - ] - } - ], - "source": [ - "print(lm.intercept_)" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "6ffbfadc", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Coefficients [ 1.67435056e+00 -1.21592576e+00 9.36384978e+00 7.47791487e-01\n", - " 8.75532709e+00 -1.64395198e+01 1.49795207e+03 -1.32435809e+01\n", - " 3.52073262e+01 -1.32013722e+01 9.14191335e+01 1.14980710e+02\n", - " 8.41858741e+01 8.09522416e+01 4.15371148e+01 4.78567718e+01\n", - " -8.67741524e+01 -8.33581430e+00 5.43558008e+01 1.04663509e+03\n", - " 
-8.03192908e+01]\n" - ] - } - ], - "source": [ - "# The coefficients\n", - "print('Coefficients', lm.coef_)" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "0e5f32ff", - "metadata": {}, - "outputs": [], - "source": [ - "#Testing\n", - "y_pred = lm.predict(X_test_1)" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "feb5edc3", - "metadata": {}, - "outputs": [], - "source": [ - "y_pred = pd.DataFrame(y_pred, columns=['Predicted'])" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "b0bd8cad", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Predicted
03305.856021
1-1200.141076
22628.853682
33062.706854
42994.841653
......
17532052.786703
17545089.259245
17551658.150121
17563140.118481
17573199.745140
\n", - "

1758 rows × 1 columns

\n", - "
" - ], - "text/plain": [ - " Predicted\n", - "0 3305.856021\n", - "1 -1200.141076\n", - "2 2628.853682\n", - "3 3062.706854\n", - "4 2994.841653\n", - "... ...\n", - "1753 2052.786703\n", - "1754 5089.259245\n", - "1755 1658.150121\n", - "1756 3140.118481\n", - "1757 3199.745140\n", - "\n", - "[1758 rows x 1 columns]" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "y_pred" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "6403ce1b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "3906 1874.30\n", - "5978 20.35\n", - "5738 2333.05\n", - "5743 1796.55\n", - "5271 2531.40\n", - " ... \n", - "1826 1381.80\n", - "2785 5551.15\n", - "1766 1201.15\n", - "5872 2727.30\n", - "1382 1643.25\n", - "Name: TotalCharges, Length: 1758, dtype: float64" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "y_test_1" - ] - }, - { - "cell_type": "markdown", - "id": "2431f6e6", - "metadata": {}, - "source": [ - "### Calculating mean square error ... RMSE\n", - "> RMSE calculate the difference between the actual value and predicted value of the response(dependant) variable
\n", - "> It is the square root of the mean of the squared errors.
\n", - "> Compared to the similar Mean Absolute Error, RMSE amplifies and severely punishes large errors.
\n", - "> The lesser the RMSE value, the better is the model." - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "1fee6c37", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Mean Absolute Error: 555.1405460192393\n", - "Mean Squared Error: 478817.97456224787\n", - "Root Mean Squared Error: 691.9667438267882\n" - ] - } - ], - "source": [ - "from sklearn import metrics \n", - "import numpy as np\n", - "\n", - "#Absolute Error is the amount of error in your measurements.\n", - "print('Mean Absolute Error:', metrics.mean_absolute_error(y_test_1, y_pred)) \n", - "print('Mean Squared Error:', metrics.mean_squared_error(y_test_1, y_pred)) \n", - "print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test_1, y_pred)))" - ] - }, - { - "cell_type": "markdown", - "id": "bf2dfe8d", - "metadata": {}, - "source": [ - "### Evaluating Model Accuracy\n", - "> R-squared is a statistical measure of how close the data are to the fitted regression line.
\n", - "> R-squared signifies the percentage of variation in the response variable that can be explained by the model.
\n", - "> R-squared = Explained variation / Total variation
\n", - "> Total variation is the variation of the response variable around its mean.
\n", - "> R-squared value varies between 0 and 100%. 0% signifies that the model explains none of the variability,
\n", - "> while 100% signifies that the model explains all the variability of the response.
\n", - "> The closer the R-squared is to 100%, the better the model.
" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "c87d1815", - "metadata": {}, - "outputs": [], - "source": [ - "#finding R-squared value\n", - "from sklearn.metrics import r2_score\n" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "33f16e28", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.9067679081449613\n" - ] - } - ], - "source": [ - "print(r2_score(y_test_1, y_pred))" - ] - }, - { - "cell_type": "markdown", - "id": "68e0476f", - "metadata": {}, - "source": [ - "# Logistic Regression\n" - ] - }, - { - "cell_type": "markdown", - "id": "5ea47ebf", - "metadata": {}, - "source": [ - "Logistic Regression is used when the dependent variable(target) is categorical. logistic regression is a predictive analysis.\n", - "Logistic regression is used to describe data and to explain the relationship between one dependent binary variable and one or more independent variables." - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "e2cad95c", - "metadata": {}, - "outputs": [], - "source": [ - "#dividing independent and dependent variables\n", - "X = data2.iloc[:,:21]\n", - "y = data2.iloc[:,21:]" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "ed78c027", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
df_indexcustomerIDgenderSeniorCitizenPartnerDependentstenurePhoneServiceMultipleLinesInternetService...OnlineBackupDeviceProtectionTechSupportStreamingTVStreamingMoviesContractPaperlessBillingPaymentMethodMonthlyChargesTotalCharges
00536500101010...2000001229.8529.85
113953100034100...0200010356.951889.50
22255810002100...2000001353.85108.15
335524100045010...0220010042.301840.75
44650000002101...0000001270.70151.65
\n", - "

5 rows × 21 columns

\n", - "
" - ], - "text/plain": [ - " df_index customerID gender SeniorCitizen Partner Dependents tenure \\\n", - "0 0 5365 0 0 1 0 1 \n", - "1 1 3953 1 0 0 0 34 \n", - "2 2 2558 1 0 0 0 2 \n", - "3 3 5524 1 0 0 0 45 \n", - "4 4 6500 0 0 0 0 2 \n", - "\n", - " PhoneService MultipleLines InternetService ... OnlineBackup \\\n", - "0 0 1 0 ... 2 \n", - "1 1 0 0 ... 0 \n", - "2 1 0 0 ... 2 \n", - "3 0 1 0 ... 0 \n", - "4 1 0 1 ... 0 \n", - "\n", - " DeviceProtection TechSupport StreamingTV StreamingMovies Contract \\\n", - "0 0 0 0 0 0 \n", - "1 2 0 0 0 1 \n", - "2 0 0 0 0 0 \n", - "3 2 2 0 0 1 \n", - "4 0 0 0 0 0 \n", - "\n", - " PaperlessBilling PaymentMethod MonthlyCharges TotalCharges \n", - "0 1 2 29.85 29.85 \n", - "1 0 3 56.95 1889.50 \n", - "2 1 3 53.85 108.15 \n", - "3 0 0 42.30 1840.75 \n", - "4 1 2 70.70 151.65 \n", - "\n", - "[5 rows x 21 columns]" - ] - }, - "execution_count": 36, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "X.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "id": "a65e377e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Churn
00
10
21
30
41
\n", - "
" - ], - "text/plain": [ - " Churn\n", - "0 0\n", - "1 0\n", - "2 1\n", - "3 0\n", - "4 1" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "y.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "id": "233ff9e9", - "metadata": {}, - "outputs": [], - "source": [ - "# improting library for splitting the data for training and testing\n", - "from sklearn.model_selection import train_test_split\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 1111)" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "id": "a850482e", - "metadata": {}, - "outputs": [], - "source": [ - "#importing lib for data to be align in standard scaling manner\n", - "from sklearn.preprocessing import StandardScaler\n", - "sc = StandardScaler()\n", - "\n", - "X_train = sc.fit_transform(X_train)\n", - "X_test = sc.transform(X_test)" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "id": "806e70da", - "metadata": {}, - "outputs": [], - "source": [ - "#importing library for logistic regression\n", - "from sklearn.linear_model import LogisticRegression " - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "id": "e95406b8", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\Lenovo\\anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. 
Please change the shape of y to (n_samples, ), for example using ravel().\n", - " return f(*args, **kwargs)\n" - ] - }, - { - "data": { - "text/plain": [ - "LogisticRegression()" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#Creating intsnce of logistic regression and then fitting the data\n", - "logistic_reg= LogisticRegression()\n", - "logistic_reg.fit(X_train,y_train)" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "id": "c58ac14e", - "metadata": {}, - "outputs": [], - "source": [ - "#predicting the test data.....\n", - "y_pred=logistic_reg.predict(X_test)" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "id": "271ab74f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([0, 0, 0, ..., 1, 0, 0])" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "y_pred" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "id": "7ae64f67", - "metadata": {}, - "outputs": [], - "source": [ - "#predcition on train data\n", - "y_pred_train = logistic_reg.predict(X_train)" - ] - }, - { - "cell_type": "markdown", - "id": "afb9a422", - "metadata": {}, - "source": [ - "# Confusion matrix\n", - "A Confusion matrix is an N x N matrix used for evaluating the performance of a classification model, where N is the number of target classes. The matrix compares the actual target values with those predicted by the machine learning model. This gives us a holistic view of how well our classification model is performing and what kinds of errors it is making." 
- ] - }, - { - "cell_type": "markdown", - "id": "ffc7557b", - "metadata": {}, - "source": [ - "Sklearn has two great functions: confusion_matrix() and classification_report().\n", - "\n", - "Sklearn confusion_matrix() returns the values of the Confusion matrix.\n", - "The output is, however, slightly different from what we have studied so far.\n", - "It takes the rows as Actual values and the columns as Predicted values. \n", - "The rest of the concept remains the same.\n", - "Sklearn classification_report() outputs precision, recall and f1-score for each target class. \n", - "In addition to this, it also has some extra values: micro avg, macro avg, and weighted avg" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "id": "f95277b8", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[1165, 138],\n", - " [ 206, 249]], dtype=int64)" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#importing library for confusion matrix\n", - "from sklearn import metrics\n", - "cnf_matrix = metrics.confusion_matrix(y_test, y_pred)\n", - "cnf_matrix" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "id": "aca39196", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Accuracy: 0.8043230944254836\n" - ] - } - ], - "source": [ - "#finding accuracy of model\n", - "log_acc = metrics.accuracy_score(y_test, y_pred)\n", - "print('Accuracy: ',log_acc)" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "id": "734ff962", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 47, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "sns.heatmap(cnf_matrix, annot=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "id": "4831e0e1", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " precision recall f1-score support\n", - "\n", - " 0 0.85 0.89 0.87 1303\n", - " 1 0.64 0.55 0.59 455\n", - "\n", - " accuracy 0.80 1758\n", - " macro avg 0.75 0.72 0.73 1758\n", - "weighted avg 0.80 0.80 0.80 1758\n", - "\n" - ] - } - ], - "source": [ - "#importing library for classification_report and finding report\n", - "from sklearn.metrics import classification_report\n", - "print(classification_report(y_test,y_pred))" - ] - }, - { - "cell_type": "markdown", - "id": "8f60be40", - "metadata": {}, - "source": [ - "# Desicion Tree" - ] - }, - { - "cell_type": "markdown", - "id": "2b6101db", - "metadata": {}, - "source": [ - "A decision tree is a tree-like graph with nodes representing the place where we pick an attribute and ask a question; edges represent the answers the to the question; and the leaves represent the actual output or class label. They are used in non-linear decision making with simple linear decision surface." 
- ] - }, - { - "cell_type": "markdown", - "id": "adac44fc", - "metadata": {}, - "source": [ - "**Advantages of decision trees:**\n", - "\n", - "- Can be used for regression or classification\n", - "- Can be displayed graphically\n", - "- Highly interpretable\n", - "- Can be specified as a series of rules, and more closely approximate human decision-making than other models\n", - "- Prediction is fast\n", - "- Features don't need scaling\n", - "- Automatically learns feature interactions\n", - "- Tends to ignore irrelevant features\n", - "- Non-parametric (will outperform linear models if relationship between features and response is highly non-linear)\n", - "- Robust to the outliers\n", - "- Impact of Missing values is Minimal" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "id": "4aa06922", - "metadata": {}, - "outputs": [], - "source": [ - "# Decision Tree Classification\n", - "\n", - "# Importing the libraries\n", - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "id": "a4136018", - "metadata": {}, - "outputs": [], - "source": [ - "# Spliting the overall data into X and Y [this is what required in ML]\n", - "\n", - "X = data2.iloc[:, :21]\n", - "\n", - "y = data2.iloc[:, 21:]" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "id": "c2b22324", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
df_indexcustomerIDgenderSeniorCitizenPartnerDependentstenurePhoneServiceMultipleLinesInternetService...OnlineBackupDeviceProtectionTechSupportStreamingTVStreamingMoviesContractPaperlessBillingPaymentMethodMonthlyChargesTotalCharges
00536500101010...2000001229.8529.85
113953100034100...0200010356.951889.50
22255810002100...2000001353.85108.15
335524100045010...0220010042.301840.75
44650000002101...0000001270.70151.65
..................................................................
702770384843101124120...0222211384.801990.50
702870391524001172121...22022111103.207362.90
702970403358001111010...0000001229.60346.45
70307041592311104121...0000001374.40306.60
703170422221100066101...02222210105.656844.50
\n", - "

7032 rows × 21 columns

\n", - "
" - ], - "text/plain": [ - " df_index customerID gender SeniorCitizen Partner Dependents \\\n", - "0 0 5365 0 0 1 0 \n", - "1 1 3953 1 0 0 0 \n", - "2 2 2558 1 0 0 0 \n", - "3 3 5524 1 0 0 0 \n", - "4 4 6500 0 0 0 0 \n", - "... ... ... ... ... ... ... \n", - "7027 7038 4843 1 0 1 1 \n", - "7028 7039 1524 0 0 1 1 \n", - "7029 7040 3358 0 0 1 1 \n", - "7030 7041 5923 1 1 1 0 \n", - "7031 7042 2221 1 0 0 0 \n", - "\n", - " tenure PhoneService MultipleLines InternetService ... OnlineBackup \\\n", - "0 1 0 1 0 ... 2 \n", - "1 34 1 0 0 ... 0 \n", - "2 2 1 0 0 ... 2 \n", - "3 45 0 1 0 ... 0 \n", - "4 2 1 0 1 ... 0 \n", - "... ... ... ... ... ... ... \n", - "7027 24 1 2 0 ... 0 \n", - "7028 72 1 2 1 ... 2 \n", - "7029 11 0 1 0 ... 0 \n", - "7030 4 1 2 1 ... 0 \n", - "7031 66 1 0 1 ... 0 \n", - "\n", - " DeviceProtection TechSupport StreamingTV StreamingMovies Contract \\\n", - "0 0 0 0 0 0 \n", - "1 2 0 0 0 1 \n", - "2 0 0 0 0 0 \n", - "3 2 2 0 0 1 \n", - "4 0 0 0 0 0 \n", - "... ... ... ... ... ... \n", - "7027 2 2 2 2 1 \n", - "7028 2 0 2 2 1 \n", - "7029 0 0 0 0 0 \n", - "7030 0 0 0 0 0 \n", - "7031 2 2 2 2 2 \n", - "\n", - " PaperlessBilling PaymentMethod MonthlyCharges TotalCharges \n", - "0 1 2 29.85 29.85 \n", - "1 0 3 56.95 1889.50 \n", - "2 1 3 53.85 108.15 \n", - "3 0 0 42.30 1840.75 \n", - "4 1 2 70.70 151.65 \n", - "... ... ... ... ... \n", - "7027 1 3 84.80 1990.50 \n", - "7028 1 1 103.20 7362.90 \n", - "7029 1 2 29.60 346.45 \n", - "7030 1 3 74.40 306.60 \n", - "7031 1 0 105.65 6844.50 \n", - "\n", - "[7032 rows x 21 columns]" - ] - }, - "execution_count": 51, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "X" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "id": "b75e77b9", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Churn
00
10
21
30
41
......
70270
70280
70290
70301
70310
\n", - "

7032 rows × 1 columns

\n", - "
" - ], - "text/plain": [ - " Churn\n", - "0 0\n", - "1 0\n", - "2 1\n", - "3 0\n", - "4 1\n", - "... ...\n", - "7027 0\n", - "7028 0\n", - "7029 0\n", - "7030 1\n", - "7031 0\n", - "\n", - "[7032 rows x 1 columns]" - ] - }, - "execution_count": 52, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "y" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "id": "801d9d1f", - "metadata": {}, - "outputs": [], - "source": [ - "# Splitting the dataset into the Training set and Test set\n", - "from sklearn.model_selection import train_test_split\n", - "\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 1111)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "id": "335e2530", - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.tree import DecisionTreeClassifier\n" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "id": "94c8f8ba", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "DecisionTreeClassifier(max_depth=2, min_samples_leaf=10, min_samples_split=20,\n", - " random_state=0)" - ] - }, - "execution_count": 55, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#importing library for Descision Tree and we have two criterion for DT 'gini' and 'Entropy'\n", - "# you can use gini when there is binary classification otherwise use entropy\n", - "\n", - "from sklearn.tree import DecisionTreeClassifier\n", - "\n", - "#max_depth - the maximum height upto which the trees inside the forest can grow(to avoid overfitting)\n", - "#min_samples_split- minimum amount of samples an internal node must hold in order to split into further nodes(default value -2.)\n", - "#min_samples_leaf - minimum amount of samples that a node must hold after getting split(default value -1.)\n", - "\n", - "#making instance of DT\n", - "classifier = DecisionTreeClassifier(criterion = 'gini', random_state = 0,\n", - " max_depth = 2, 
min_samples_leaf = 10, min_samples_split = 20\n", - " )\n", - "#fitting the training data\n", - "classifier.fit(X_train, y_train)" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "id": "6189a89b", - "metadata": {}, - "outputs": [], - "source": [ - "#prediction using 'testing' data\n", - "y_pred = classifier.predict(X_test)" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "id": "9af2cc14", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([0, 0, 0, ..., 1, 0, 0])" - ] - }, - "execution_count": 57, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "y_pred" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "id": "419ee81f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.7468714448236633" - ] - }, - "execution_count": 58, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#checking score via importing library\n", - "from sklearn.metrics import accuracy_score\n", - "DT_acc = accuracy_score(y_pred,y_test)\n", - "DT_acc" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "id": "4a5dad9a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[985, 318],\n", - " [127, 328]], dtype=int64)" - ] - }, - "execution_count": 59, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "\n", - "from sklearn.metrics import confusion_matrix\n", - "\n", - "cm = confusion_matrix(y_test, y_pred)\n", - "cm" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "id": "de8d0b01", - "metadata": {}, - "outputs": [], - "source": [ - "#checking Auc_Roc_Score via importing library\n", - "from sklearn.metrics import roc_auc_score" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "id": "c1ee3c9e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.7384134668094761" - ] - }, - "execution_count": 61, - "metadata": {}, - "output_type": "execute_result" - } 
- ], - "source": [ - "roc_auc_score(y_test,y_pred)" - ] - }, - { - "cell_type": "markdown", - "id": "fd76fbed", - "metadata": {}, - "source": [ - "# Hypertuning\n", - " \n" - ] - }, - { - "cell_type": "markdown", - "id": "710cfaff", - "metadata": {}, - "source": [ - "it is used to increases the performance of a model via providing best parameters" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "id": "aeadf4e2", - "metadata": {}, - "outputs": [], - "source": [ - "#importing library for hypertuning\n", - "from sklearn.model_selection import GridSearchCV" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "id": "f42a1258", - "metadata": {}, - "outputs": [], - "source": [ - "pGrid = {'max_depth': range(2, 10), # 8\n", - " 'min_samples_leaf': range(10, 51, 10), # 5\n", - " 'min_samples_split': range(20, 81, 20)} # 4\n", - "#intance of GScv\n", - "gscv = GridSearchCV(estimator = DecisionTreeClassifier(), param_grid = pGrid, cv = 5,\n", - " scoring = 'recall', n_jobs = -1, verbose = True)" - ] - }, - { - "cell_type": "code", - "execution_count": 64, - "id": "92c9ddce", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Fitting 5 folds for each of 160 candidates, totalling 800 fits\n" - ] - }, - { - "data": { - "text/plain": [ - "GridSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_jobs=-1,\n", - " param_grid={'max_depth': range(2, 10),\n", - " 'min_samples_leaf': range(10, 51, 10),\n", - " 'min_samples_split': range(20, 81, 20)},\n", - " scoring='recall', verbose=True)" - ] - }, - "execution_count": 64, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#fitiing the data\n", - "gscv.fit(X,y)" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "id": "da422896", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'max_depth': 2, 'min_samples_leaf': 10, 'min_samples_split': 20}" - ] - }, - "execution_count": 65, - "metadata": {}, - 
"output_type": "execute_result" - } - ], - "source": [ - "#finding best params for model\n", - "gscv.best_params_" - ] - }, - { - "cell_type": "markdown", - "id": "7d584d80", - "metadata": {}, - "source": [ - "**Disadvantages of decision trees:**\n", - " \n", - "- Performance is (generally) not competitive with the best supervised learning methods\n", - " - Use Ensembles \n", - "- Can easily overfit the training data (tuning is required / PRUNING standard concept )\n", - "\n", - "- Small variations in the data can result in a completely different tree (high variance)\n", - " - Use Ensembles to reduce the variance\n", - " \n", - "- Recursive binary splitting makes \"locally optimal\" decisions that may not result in a globally optimal tree\n", - "- Doesn't tend to work well if the classes are highly unbalanced\n", - "- Doesn't tend to work well with very small datasets" - ] - }, - { - "cell_type": "markdown", - "id": "0fc68484", - "metadata": {}, - "source": [ - "# RANDOM FOREST" - ] - }, - { - "cell_type": "markdown", - "id": "008226d8", - "metadata": {}, - "source": [ - "Random forests or random decision forests are an ensemble learning method for classification, regression and other tasks that operate by constructing a multitude of decision trees at training time and outputting the class that is the mode of the classes or mean/average prediction of the individual trees. " - ] - }, - { - "cell_type": "code", - "execution_count": 66, - "id": "cd472afa", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - ":4: DataConversionWarning: A column-vector y was passed when a 1d array was expected. 
Please change the shape of y to (n_samples,), for example using ravel().\n", - " classifier2.fit(X_train, y_train)\n" - ] - }, - { - "data": { - "text/plain": [ - "RandomForestClassifier(n_estimators=70, random_state=0)" - ] - }, - "execution_count": 66, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sklearn.ensemble import RandomForestClassifier\n", - "\n", - "classifier2 = RandomForestClassifier(n_estimators = 70, criterion = 'gini', random_state = 0)\n", - "classifier2.fit(X_train, y_train)" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "id": "576a4b1d", - "metadata": {}, - "outputs": [], - "source": [ - "y_pred = classifier2.predict(X_test)" - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "id": "74301722", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.8037542662116041" - ] - }, - "execution_count": 68, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sklearn.metrics import accuracy_score\n", - "RF_acc = accuracy_score(y_test,y_pred)\n", - "RF_acc" - ] - }, - { - "cell_type": "code", - "execution_count": 69, - "id": "c4942c23", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.7052693277558972" - ] - }, - "execution_count": 69, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sklearn.metrics import roc_auc_score\n", - "roc_auc_score(y_test,y_pred)" - ] - }, - { - "cell_type": "code", - "execution_count": 70, - "id": "b314ec0b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[1185, 118],\n", - " [ 227, 228]], dtype=int64)" - ] - }, - "execution_count": 70, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sklearn.metrics import confusion_matrix\n", - "cm = confusion_matrix(y_test, y_pred)\n", - "cm" - ] - }, - { - "cell_type": "code", - "execution_count": 71, - "id": "9db435c7", - "metadata": {}, - "outputs": [ - { - 
"name": "stdout", - "output_type": "stream", - "text": [ - "Fitting 5 folds for each of 25 candidates, totalling 125 fits\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\Lenovo\\anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_search.py:918: UserWarning: One or more of the test scores are non-finite: [0.79209177 0.79337159 0.79237576 0.7928022 0.78995826 0.79066919\n", - " 0.78313331 0.78711502 0.78967457 0.78981591 0.7807154 0.78099858\n", - " 0.78555 0.78554929 0.78839465 0.78199432 0.78270576 0.78412904\n", - " 0.78597694 0.7825627 nan nan nan nan\n", - " nan]\n", - " warnings.warn(\n", - "C:\\Users\\Lenovo\\anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_search.py:880: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n", - " self.best_estimator_.fit(X, y, **fit_params)\n" - ] - }, - { - "data": { - "text/plain": [ - "GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,\n", - " param_grid={'max_features': [5, 10, 15, 20, 25],\n", - " 'n_estimators': [70, 80, 90, 100, 120]},\n", - " verbose=True)" - ] - }, - "execution_count": 71, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#n_estimators :- int, default=100\n", - "#The number of trees in the forest- the n_estimator parameter controls the number of trees inside the classifier.\n", - "#max_features helps to find the number of features to take into account in order to make the best split\n", - "\n", - "\n", - "pargrid_rf = {'n_estimators': [70, 80, 90, 100, 120],\n", - " 'max_features': [5,10,15,20,25]}\n", - "\n", - "gscv_rf = GridSearchCV(estimator = RandomForestClassifier(), \n", - " param_grid = pargrid_rf, \n", - " cv = 5,\n", - " verbose = True, \n", - " n_jobs = -1)\n", - "\n", - "gscv_rf.fit(X, y)" - ] - }, - { - "cell_type": "code", - "execution_count": 72, - "id": "6e4b32fb", - "metadata": {}, - 
"outputs": [ - { - "data": { - "text/plain": [ - "{'max_features': 5, 'n_estimators': 80}" - ] - }, - "execution_count": 72, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "gscv_rf.best_params_" - ] - }, - { - "cell_type": "markdown", - "id": "dd76d339", - "metadata": {}, - "source": [ - "Importing libraries to draw tree structre" - ] - }, - { - "cell_type": "code", - "execution_count": 73, - "id": "ebd9cc40", - "metadata": {}, - "outputs": [], - "source": [ - "from matplotlib import pyplot as plt\n", - "from sklearn import tree" - ] - }, - { - "cell_type": "code", - "execution_count": 74, - "id": "cf12c14e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Text(418.5, 453.0, 'X[16] <= 0.5\\ngini = 0.392\\nsamples = 5274\\nvalue = [3860, 1414]'),\n", - " Text(209.25, 271.8, 'X[10] <= 0.5\\ngini = 0.49\\nsamples = 2931\\nvalue = [1669, 1262]'),\n", - " Text(104.625, 90.59999999999997, 'gini = 0.5\\nsamples = 1985\\nvalue = [970, 1015]'),\n", - " Text(313.875, 90.59999999999997, 'gini = 0.386\\nsamples = 946\\nvalue = [699, 247]'),\n", - " Text(627.75, 271.8, 'X[19] <= 99.85\\ngini = 0.121\\nsamples = 2343\\nvalue = [2191, 152]'),\n", - " Text(523.125, 90.59999999999997, 'gini = 0.077\\nsamples = 1912\\nvalue = [1835, 77]'),\n", - " Text(732.375, 90.59999999999997, 'gini = 0.287\\nsamples = 431\\nvalue = [356, 75]')]" - ] - }, - "execution_count": 74, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.figure(figsize = (15,10))\n", - "tree.plot_tree(classifier, filled = True)" - ] - }, - { - "cell_type": "markdown", - "id": "d9452db6", - "metadata": {}, - "source": [ - "# KNN" - ] - }, - { - "cell_type": "markdown", - "id": "f16bc237", - "metadata": {}, - "source": [ - "1.Supervised Learning technique.\n", - "2.K-NN algorithm can be used for Regression & Classification - mostly Classification problems.\n", - "3.K-NN is a non-parametric algoritham- which means it does not make any assumption on underlying data.\n", - "4.It is also called a lazy learner algorithm because it does not learn from the training set immediately instead it stores the dataset.\n", - "5.At the time of classification, it performs an action on the dataset.\n", - "6.Example: Suppose, we have an image of a creature that looks similar to cat and dog, but we want to know either it is a cat or dog. So for this identification, we can use the KNN algorithm," - ] - }, - { - "cell_type": "markdown", - "id": "a6f49184", - "metadata": {}, - "source": [ - "Step-1: Select the number K of the neighbors\n", - "Step-2: Calculate the Euclidean distance of K number of neighbors\n", - "Step-3: Take the K nearest neighbors as per the calculated Euclidean distance.\n", - "Step-4: Among these k neighbors, count the number of the data points in each category.\n", - "Step-5: Assign the new data points to that category for which the number of the neighbor is maximum.\n", - "Step-6: Our model is ready." - ] - }, - { - "cell_type": "code", - "execution_count": 75, - "id": "240ea111", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\Lenovo\\anaconda3\\lib\\site-packages\\sklearn\\neighbors\\_classification.py:179: DataConversionWarning: A column-vector y was passed when a 1d array was expected. 
Please change the shape of y to (n_samples,), for example using ravel().\n", - " return self._fit(X, y)\n" - ] - }, - { - "data": { - "text/plain": [ - "KNeighborsClassifier(n_neighbors=11)" - ] - }, - "execution_count": 75, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Fitting K-NN to the Training set\n", - "from sklearn.neighbors import KNeighborsClassifier\n", - "classifier = KNeighborsClassifier(n_neighbors = 11, metric = 'minkowski', p = 2)\n", - "classifier.fit(X_train, y_train)\n", - "\n", - "\n", - "\n", - "\n", - "#p : integer, optional (default = 2)\n", - "#Power parameter for the Minkowski metric. \n", - "#When p = 1, this is equivalent to using manhattan_distance (l1), \n", - "# and euclidean_distance (l2) for p = 2.\n", - "\n", - "#metric : string or callable, default ‘minkowski’\n", - "\n", - "#The Minkowski distance is a metric in a normed vector space which can be considered as a \n", - "# generalization of both the Euclidean distance and the Manhattan distance.\n", - "\n", - "#the distance metric to use for the tree. The default metric is minkowski, \n", - "#and with p=2 is equivalent to the standard Euclidean metric." 
- ] - }, - { - "cell_type": "code", - "execution_count": 76, - "id": "669eae48", - "metadata": {}, - "outputs": [], - "source": [ - "# Predicting the Test set results\n", - "y_pred = classifier.predict(X_test)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 77, - "id": "5122b9e8", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[1215, 88],\n", - " [ 392, 63]], dtype=int64)" - ] - }, - "execution_count": 77, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "\n", - "# Making the Confusion Matrix\n", - "from sklearn.metrics import confusion_matrix\n", - "cm = confusion_matrix(y_test, y_pred)\n", - "cm" - ] - }, - { - "cell_type": "code", - "execution_count": 78, - "id": "4c6f892a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.726962457337884" - ] - }, - "execution_count": 78, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sklearn.metrics import accuracy_score\n", - "KNN_acc = accuracy_score(y_pred,y_test)\n", - "KNN_acc" - ] - }, - { - "cell_type": "code", - "execution_count": 79, - "id": "d0df133e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.5866428745101111" - ] - }, - "execution_count": 79, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sklearn.metrics import roc_auc_score\n", - "roc_auc_score(y_pred,y_test)" - ] - }, - { - "cell_type": "code", - "execution_count": 80, - "id": "9dd24d51", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 80, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "import seaborn as sns\n", - "sns.heatmap(cm, annot = True)" - ] - }, - { - "cell_type": "code", - "execution_count": 81, - "id": "ebece630", - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.model_selection import GridSearchCV" - ] - }, - { - "cell_type": "code", - "execution_count": 82, - "id": "61072235", - "metadata": {}, - "outputs": [], - "source": [ - "pGrid = {'n_neighbors': range(10,200),\n", - " 'leaf_size': range(10, 51, 10), }\n", - "\n", - "\n", - "gscv = GridSearchCV(estimator = KNeighborsClassifier(), param_grid = pGrid, cv = 5,\n", - " scoring = 'recall', n_jobs = -1, verbose = True)" - ] - }, - { - "cell_type": "code", - "execution_count": 83, - "id": "509a2400", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Fitting 5 folds for each of 950 candidates, totalling 4750 fits\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\Lenovo\\anaconda3\\lib\\site-packages\\sklearn\\neighbors\\_classification.py:179: DataConversionWarning: A column-vector y was passed when a 1d array was expected. 
Please change the shape of y to (n_samples,), for example using ravel().\n", - " return self._fit(X, y)\n" - ] - }, - { - "data": { - "text/plain": [ - "GridSearchCV(cv=5, estimator=KNeighborsClassifier(), n_jobs=-1,\n", - " param_grid={'leaf_size': range(10, 51, 10),\n", - " 'n_neighbors': range(10, 200)},\n", - " scoring='recall', verbose=True)" - ] - }, - "execution_count": 83, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "gscv.fit(X,y)" - ] - }, - { - "cell_type": "code", - "execution_count": 84, - "id": "599febe0", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'leaf_size': 10, 'n_neighbors': 11}" - ] - }, - "execution_count": 84, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "gscv.best_params_" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c4066319", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.8" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} +done