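"""House price prediction pipeline: load the data, drop outliers, log-transform
SalePrice, engineer features, cross-validate several regressors, and average
the strongest base models."""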
import matplotlib.pyplot as plt  # MATLAB-style plotting
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.stats import norm  # for fitting a normal distribution
from sklearn.model_selection import KFold, cross_val_score

from source.featureEngineering import getImportantFeatures
from source.models import getENet, getKRR, getGBoost, getXGB, getLGB, getLasso
from source.stackingModels import AveragingModels

color = sns.color_palette()
sns.set_style('darkgrid')
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x))  # limit float output to 3 decimal places
#------------------------------DEFINING CONSTANTS-----------------------------------------------------------------------
N_FOLDS = 5
#------------------------------DEFINING METHODS-------------------------------------------------------------------------
def rmsle_cv(model, X_train, y_train):
    """Cross-validated RMSE on the log1p-transformed target (i.e. RMSLE on the original scale)."""
    # Pass the KFold object itself to cross_val_score; calling get_n_splits()
    # would return the bare integer 5 and silently drop shuffle/random_state.
    kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
    rmse = np.sqrt(-cross_val_score(model, X_train.values, y_train,
                                    scoring="neg_mean_squared_error", cv=kf))
    return rmse
#-------------------------------LOADING DATA----------------------------------------------------------------------------
train = pd.read_csv('../data/train_house.csv')
test = pd.read_csv('../data/test_house.csv')
# Drop the 'Id' column since it carries no predictive information.
train.drop("Id", axis=1, inplace=True)
test.drop("Id", axis=1, inplace=True)
#------------------Finding Outliers-----------------------------------------------------------------------------------
fig, ax = plt.subplots()
ax.scatter(x=train['GrLivArea'], y=train['SalePrice'])
plt.ylabel('SalePrice', fontsize=13)
plt.xlabel('GrLivArea', fontsize=13)
plt.show()
#Deleting outliers
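# The two dropped points are very large houses that sold well below the trend;
# the Ames data documentation recommends removing sales with GrLivArea above
# 4000 sq ft as likely partial or otherwise unusual sales.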
train = train.drop(train[(train['GrLivArea']>4000) & (train['SalePrice']<300000)].index)
# -------------------------Analysis of the price variable----------------------------------------------------------------
sns.distplot(train['SalePrice'], fit=norm)  # note: distplot is deprecated in seaborn >= 0.11 but still works here
# Fit a normal distribution to SalePrice
(mu, sigma) = norm.fit(train['SalePrice'])
# Plot the distribution against the fitted normal
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f})'.format(mu, sigma)],
           loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')
# Also get the QQ-plot
fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
plt.show()
#---------------Make the price variable more normally distributed-------------------------------------------------------
train["SalePrice"] = np.log1p(train["SalePrice"])
# Check the new distribution
sns.distplot(train['SalePrice'], fit=norm)
# Refit the normal parameters on the transformed data; reusing the old mu and
# sigma would label the plot with the pre-transform values.
(mu, sigma) = norm.fit(train['SalePrice'])
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f})'.format(mu, sigma)],
           loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')
# Also get the QQ-plot
fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
plt.show()
#---------------------Cleaning features and selecting important ones----------------------------------------------------
X_train, X_test, y_train = getImportantFeatures(train, test)
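# X_test holds the processed test features; it is used for the final
# predictions sketched at the end of the script.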
#---------------------Testing Different Models--------------------------------------------------------------------------
lasso = getLasso()
score = rmsle_cv(lasso, X_train, y_train)
print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
ENet = getENet()
score = rmsle_cv(ENet, X_train, y_train)
print("ElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
krr = getKRR()
score = rmsle_cv(krr, X_train, y_train)
print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
gboost = getGBoost()
score = rmsle_cv(gboost, X_train, y_train)
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
xgb = getXGB()
score = rmsle_cv(xgb, X_train, y_train)
print("XGBoost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
lgb = getLGB()
score = rmsle_cv(lgb, X_train, y_train)
print("LGBM score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
#---------------------Stacking Models-----------------------------------------------------------------------------------
averaged_models = AveragingModels(models=(ENet, gboost, krr))
score = rmsle_cv(averaged_models, X_train, y_train)
print("Averaged base models score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))