-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathensemble_model.py
77 lines (68 loc) · 2.45 KB
/
ensemble_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import os
import sklearn
import pandas as pd
import xgboost as xgb
import numpy as np
import lightgbm as lgbm
from datetime import datetime
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import f1_score
# Paths to the competition data files (relative to the working directory).
TRAIN_VALUES = 'DATA/train_values.csv'
TRAIN_LABELS = 'DATA/train_labels.csv'
TEST_VALUES = 'DATA/test_values.csv'
# Candidate base estimators for the voting ensemble.  Only a subset is
# actually enabled in the `classifiers` list below; the rest are kept here
# so they can be toggled in/out during experimentation.
clf1 = GaussianProcessClassifier()
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = GaussianNB()
clf4 = MLPClassifier()
clf5 = KNeighborsClassifier(7)
# NOTE(review): these SVCs use probability=False (the default), so they
# cannot participate in soft voting without probability=True — confirm
# before re-enabling them in the ensemble below.
clf6 = SVC(kernel = 'linear')
clf7 = SVC(kernel = 'rbf')
clf8 = SVC(kernel = 'poly')
clf9 = AdaBoostClassifier()
clf10 = xgb.XGBClassifier()
clf11 = QuadraticDiscriminantAnalysis()
clf12 = lgbm.LGBMClassifier()
# (name, estimator) pairs fed to the VotingClassifier.  Commented entries
# are deliberately disabled experiments; uncomment to add them back into
# the ensemble.
classifiers = [
#('gaus', clf1),
#('rf', clf2),
#('gnb', clf3),
#('mlp', clf4),
#('knn', clf5),
#('svc_linear', clf6),
#('svc_rbf', clf7),
#('poly', clf8),
('adaboost', clf9),
('xgb', clf10),
#('quad', clf11),
('lgbm', clf12)
]
# Soft voting averages predict_proba across estimators, so every enabled
# estimator must expose predict_proba.  n_jobs=-1 fits estimators in parallel.
eclf1 = VotingClassifier(estimators=classifiers, voting='soft', n_jobs = -1)
# --- Data loading ------------------------------------------------------------
train_x = pd.read_csv(TRAIN_VALUES)
train_y = pd.read_csv(TRAIN_LABELS)
test_x = pd.read_csv(TEST_VALUES)

# Columns 8-14 plus 'legal_ownership_status' hold string categories; encode
# each as integer codes so the boosting models can consume them.
# NOTE(review): Categorical codes are derived per-frame, so the same category
# string may map to different codes in train vs test — consider fitting one
# shared mapping on the union of both frames.
categorical_columns = list(train_x.columns[8:15]) + ['legal_ownership_status']
for col in categorical_columns:
    train_x[col] = pd.Categorical(train_x[col]).codes
    test_x[col] = pd.Categorical(test_x[col]).codes

# --- 5-fold cross-validation of the voting ensemble --------------------------
# Column 0 of both frames is the row identifier; features and the label start
# at column 1, hence the [:, 1:] / [..., 1] slicing below.
results = []
print('Time started: ' + datetime.now().strftime("%H:%M:%S"))
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for train_index, test_index in kf.split(train_x.values[:, 1:]):
    eclf1 = eclf1.fit(train_x.values[train_index, 1:], train_y.values[train_index, 1])
    # BUG FIX: the original passed eclf1.score(...) — a scalar accuracy, not
    # an array of predictions — into f1_score, and also swapped the
    # (y_true, y_pred) argument order.  Use predict() and the correct order.
    preds = eclf1.predict(train_x.values[test_index, 1:])
    # BUG FIX: f1_score defaults to average='binary', which raises on the
    # multi-class damage-grade labels; micro-averaged F1 is the standard
    # metric for this kind of multi-class target.
    f1_score_val = f1_score(train_y.values[test_index, 1], preds, average='micro')
    results.append(f1_score_val)

# Per-fold F1 scores and their mean, with wall-clock timing bookends.
print(results)
print(np.mean(results))
print('Time end: ' + datetime.now().strftime("%H:%M:%S"))