-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathBike_Price_Prediction
100 lines (94 loc) · 3.01 KB
/
Bike_Price_Prediction
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import LabelEncoder
import random
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor,ExtraTreesRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
# Load the raw listings; work on a copy so `bike` keeps the untouched data.
bike = pd.read_csv('BIKE DETAILS.csv')
df = bike.copy()

# Quick structural overview. A bare expression such as `df.shape` is only
# displayed inside a notebook, so print the results explicitly when this
# file runs as a script (`df.info()` already prints on its own).
print(df.shape)
print(df.isnull().sum())
df.info()
# --- Exploratory plots -------------------------------------------------------
# NOTE: seaborn >= 0.12 accepts only `data` positionally; x and y must be
# passed by keyword (the keyword form also works on older releases).

# Number of listings per model year.
plt.figure(figsize=(10, 4))
df['year'].value_counts().plot(kind='bar')
plt.show()

# Selling price by model year.
plt.figure(figsize=(10, 4))
sns.barplot(x=df['year'], y=df['selling_price'])
plt.xticks(rotation='vertical')
plt.show()

# Seller type: frequency, then selling price per seller type.
sns.histplot(df['seller_type'])
plt.show()
sns.barplot(x=df['seller_type'], y=df['selling_price'])
plt.show()

# Ownership history: frequency, then selling price per owner category.
sns.histplot(df['owner'])
plt.show()
sns.barplot(x=df['owner'], y=df['selling_price'])
plt.show()

# Kilometres driven: distribution, then relation to selling price.
# `distplot` is deprecated; `histplot` with a KDE overlay on a density scale
# is its documented replacement.
sns.histplot(df['km_driven'], kde=True, stat='density')
plt.show()
sns.scatterplot(x=df['km_driven'], y=df['selling_price'])
plt.show()
# --- Impute missing ex-showroom prices ---------------------------------------
# Missing values are filled with values drawn at random from the observed
# prices of the same column (random-sample imputation).
missing_mask = df['ex_showroom_price'].isnull()
observed_prices = df['ex_showroom_price'].dropna()
n_missing = int(missing_mask.sum())

df['ex_imputed'] = df['ex_showroom_price']
# Use a single `.loc` assignment: chained indexing (`df[col][mask] = ...`)
# may write to a temporary copy and is a no-op under pandas copy-on-write.
# Sampling falls back to "with replacement" only when there are more missing
# rows than observed values; the fixed seed (same as the train/test split
# below) makes the imputation reproducible between runs.
df.loc[missing_mask, 'ex_imputed'] = observed_prices.sample(
    n_missing,
    replace=n_missing > len(observed_prices),
    random_state=2,
).values

# Compare the distribution before and after imputation.
# `distplot` is deprecated; `histplot` with a KDE overlay replaces it.
sns.histplot(df['ex_showroom_price'], kde=True, stat='density')
plt.show()
sns.histplot(df['ex_imputed'], kde=True, stat='density')
plt.show()

# Replace the original column with the imputed one and store it as integers.
df['ex_showroom_price'] = df['ex_imputed']
df.drop('ex_imputed', axis=1, inplace=True)
df['ex_showroom_price'] = df['ex_showroom_price'].astype('int')

# x/y must be keywords on seaborn >= 0.12.
sns.scatterplot(x=df['ex_showroom_price'], y=df['selling_price'])
plt.show()
# --- Derive the manufacturer ("Company") from the model name ----------------
# The first whitespace-separated word of `name` is taken as the brand.
a = df['name'].str.split(' ').str[0]
# "Royal Enfield" is a two-word brand, so restore the word that was cut off.
a = a.str.replace('Royal', 'Royal Enfield')

# Brands that occur fewer than twice are folded into the most frequent brand.
# Counts and mode are computed once instead of once per row, and no
# positional/label indexing into the Series is needed.
brand_frequency = a.map(a.value_counts())
a = a.where(brand_frequency >= 2, a.mode()[0])

df['Company'] = a
df.drop('name', axis=1, inplace=True)

# Listings per brand.
df['Company'].value_counts().plot(kind='bar')
plt.show()

# Selling price per brand (x/y must be keywords on seaborn >= 0.12).
sns.barplot(x=df['Company'], y=df['selling_price'])
plt.xticks(rotation='vertical')
plt.show()
# --- Target transform and correlations ---------------------------------------
# Inspect the distribution of the log-transformed price, then store the
# transformed values as the modelling target.
# `distplot` is deprecated; `histplot` with a KDE overlay replaces it.
log_price = np.log(df['selling_price'])
sns.histplot(log_price, kde=True, stat='density')
plt.show()
df['selling_price'] = log_price

# Correlate numeric columns only: pandas >= 2.0 raises on string columns
# (seller_type, owner, Company) instead of silently dropping them, and
# `select_dtypes` behaves the same on older releases.
numeric_corr = df.select_dtypes(include='number').corr()
# Bare expressions are not displayed outside a notebook, so print explicitly.
print(numeric_corr['selling_price'])
sns.heatmap(numeric_corr, annot=True, cmap='Blues')
plt.show()

# Final sanity check before modelling.
print(df.isnull().sum())
df.info()
# Separate the predictors from the target column and hold out 15% of the rows
# for evaluation; the fixed seed keeps the split reproducible.
X = df.drop(columns=['selling_price'])
y = df['selling_price']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=2
)
# A bare `df.head()` is only displayed inside a notebook; print it so the
# column layout is visible when running as a script.
print(df.head())
### Linear Regression
# One-hot encode the categorical predictors and pass the remaining columns
# through unchanged. Columns are selected by name rather than by position
# (the original used indices 1, 2 and 5 of X, i.e. positions counted after
# `selling_price` was dropped), so the step does not depend on column order.
categorical_columns = ['seller_type', 'owner', 'Company']
step1 = ColumnTransformer(
    transformers=[
        ('col_tnf', OneHotEncoder(drop='first'), categorical_columns),
    ],
    remainder='passthrough',
    # `OneHotEncoder(sparse=False)` was renamed to `sparse_output` in
    # scikit-learn 1.2 and removed in 1.4. Forcing a dense result here gives
    # the same matrix on every scikit-learn version.
    sparse_threshold=0,
)
step2 = LinearRegression()
pipe = Pipeline([
    ('step1', step1),
    ('step2', step2),
])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

# The target was log-transformed earlier in the script, so both metrics are
# reported on the log-price scale.
print('R2 score', r2_score(y_test, y_pred))
print('MAE', mean_absolute_error(y_test, y_pred))