-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRandom Forest.py
113 lines (80 loc) · 3.32 KB
/
Random Forest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
loan = pd.read_csv("train.csv")
loan.info()
#Finding Misiing value from dataset
loan.apply(lambda x:sum(x.isnull()),axis=0)
#Filling Gender
loan['Gender'].fillna('Male',inplace=True)
#Filling Married
loan['Married'].fillna('Yes',inplace=True)
#Filling Dependent
loan['Dependents'].isnull().sum()
loan['Dependents'].value_counts()
loan['Dependents'].fillna('2',inplace=True)
#Filling Self_Employee
loan['Self_Employed'] = loan['Self_Employed'].map({"Yes":1,"No":0})
loan.loc[ (pd.isnull(loan['Self_Employed'])) & (loan['Loan_Status'] == 'Y'), 'Self_Employed'] = 0
loan.loc[ (pd.isnull(loan['Self_Employed'])) & (loan['Loan_Status'] == 'N'), 'Self_Employed'] = 1
#Filling Loan amount
impute_grps = loan.pivot_table(values=["LoanAmount"], index=["Education","Self_Employed"], aggfunc=np.mean)
print(impute_grps)
for i,row in loan.loc[loan['LoanAmount'].isnull(),:].iterrows():
ind = tuple([row['Education'],row['Self_Employed']])
loan.loc[i,'LoanAmount'] = impute_grps.loc[ind].values[0]
#Filling Loan_Amount_Term
loan['Loan_Amount_Term'].fillna(360,inplace=True)
loan['Loan_Status'] = loan['Loan_Status'].map(lambda x: 1 if x=='Y' else 0)
loan['Credit_History'].fillna(loan['Loan_Status'], inplace = True)
#Total Income
loan['TotalIncome'] = loan['ApplicantIncome'] + loan['CoapplicantIncome']
X = loan.iloc[:, [8,10,11,13]].values
y = loan.iloc[:, 12].values
#Lable Encoder for creating categorical vlaue to Numerical value
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
labelencoder_x_1 = LabelEncoder()
X[:,2] = labelencoder_x_1.fit_transform(X[:,2])
onehotencoder = OneHotEncoder(categorical_features=[2])
X = onehotencoder.fit_transform(X).toarray()
X = X[: , 1:]
labelencoder_y = LabelEncoder()
y= labelencoder_y.fit_transform(y)
'''
labelencoder_x_2 = LabelEncoder()
X[:,5] = labelencoder_x_2.fit_transform(X[:,5])
onehotencoder_1 = OneHotEncoder(categorical_features=[5])
X = onehotencoder.fit_transform(X).toarray()
loan['Dependents_1'] = pd.Categorical.from_array(loan.Dependents).codes
X
'''
#Spliting data in to training and testing dataset
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train , y_test = train_test_split(X,y,test_size = 0.25,random_state = 0)
#Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
#Fitting Random Forest to training set
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=10,criterion='entropy',random_state=0)
classifier.fit(X_train,y_train)
#Predicting the test set result
y_pred = classifier.predict(X_test)
#Making the confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test,y_pred)
#Acuuracy
import numpy as np
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)*100
#Creating Kfold for Kernel logical regression
from sklearn.model_selection import cross_val_score
accuracy = cross_val_score(estimator = classifier,X =X_train,y = y_train, cv = 10 )
#check 10 different accuracy of the model
accuracy
#Average of all the models
accuracy.mean()
#Standard Deviation
accuracy.std()