import pandas as pd
import numpy as np
from sklearn import (
    metrics,
    model_selection,
    ensemble
)
# Read the CSV files and remove duplicate rows
dataset_accidents = pd.read_csv(
'data/accidents.csv',
header=0
)
dataset_accidents = dataset_accidents.drop_duplicates()
dataset_vehicles = pd.read_csv(
'data/vehicles.csv',
header=0
)
dataset_vehicles = dataset_vehicles.drop_duplicates()
# A few accidents are geolocated off the coast of Morocco; we assume
# they are noise and discard them. They all belong to the majority
# class, so little information is lost by this drop.
dataset_accidents = dataset_accidents[
dataset_accidents['latitude'] >= 40
]
# The location columns are redundant with police_force, since the
# whole country is divided into regions, each controlled by one force.
dataset_accidents = dataset_accidents.drop(
['location_easting_osgr', 'location_northing_osgr',
'lsoa_of_accident_location', 'latitude', 'longitude'],
axis=1
)
# police_force, local_authority_highway and local_authority_district
# encode the same information, so only the most concise one is kept
dataset_accidents = dataset_accidents.drop(
['local_authority_district', 'local_authority_highway'],
axis=1
)
# Convert the temporal information into more useful features
dataset_accidents['weekday'] = pd.to_datetime(
    dataset_accidents['date']
).dt.day_name()
dataset_accidents['weekend'] = (dataset_accidents['weekday'].isin(
['Friday', 'Saturday', 'Sunday']
))*1
dataset_accidents['day_period'] = pd.to_datetime(
dataset_accidents['time']
).dt.hour
dataset_accidents['day_period'] = pd.cut(
dataset_accidents['day_period'], bins=[0, 7, 9, 13, 16, 20, 24], right=False
)
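# The bin edges above presumably delimit day periods (night, morning
# rush, midday, afternoon, evening rush, evening); if so, pd.cut can
# attach readable labels directly. A hedged sketch, since the period
# names are our assumption:
# dataset_accidents['day_period'] = pd.cut(
#     pd.to_datetime(dataset_accidents['time']).dt.hour,
#     bins=[0, 7, 9, 13, 16, 20, 24], right=False,
#     labels=['night', 'morning_rush', 'midday',
#             'afternoon', 'evening_rush', 'evening']
# )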
dataset_accidents = dataset_accidents.drop(['date', 'time', 'weekday'], axis=1)
# Drop variables with too many invalid or missing values
dataset_accidents = dataset_accidents.drop(
[
'pedestrian_crossing-human_control',
'pedestrian_crossing-physical_facilities',
'carriageway_hazards'
],
axis=1
)
# Encode urban vs. rural as a boolean variable
dataset_accidents['urban_area'] = 1*(
dataset_accidents['urban_or_rural_area'] == 'Urban'
)
dataset_accidents = dataset_accidents.drop('urban_or_rural_area', axis=1)
# The road class is more informative than the road number
dataset_accidents = dataset_accidents.drop(
['1st_road_number', '2nd_road_number'], axis=1
)
# MERGE BOTH DATASETS ON THE PRIMARY KEY TO COMBINE INFORMATION
# FROM THE TWO SOURCES
dataset = dataset_accidents.merge(dataset_vehicles, on='accident_id')
# Drop accident_id since it's irrelevant after merging
dataset = dataset.drop('accident_id', axis=1)
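# A hedged sanity check on the join: one accident can involve several
# vehicles, so the relation should be one-to-many from accidents to
# vehicles. pandas can verify that at merge time, e.g.:
# dataset_accidents.merge(dataset_vehicles, on='accident_id', validate='1:m')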
# We deliberately drop the driver's sex, preferring not to take this
# information into account
dataset = dataset.drop('Sex_of_Driver', axis=1)
# A vehicle code identifier is not useful for predicting accident severity
dataset = dataset.drop('Vehicle_Reference', axis=1)
# The percentage of serious accidents is roughly constant across IMD
# values, so these columns are discarded
dataset = dataset.drop(['Vehicle_IMD_Decile', 'Driver_IMD_Decile'], axis=1)
# Left- vs right-hand drive is not useful: almost every value is 'No'
# and the proportion of serious accidents is almost constant
dataset = dataset.drop(['Was_Vehicle_Left_Hand_Drive?'], axis=1)
# Almost every vehicle has no towing or articulation, so collapse
# into a boolean flag
dataset['Towing_and_Articulation'] = (~dataset['Towing_and_Articulation'].isin(
['-1', 'No tow/articulation']
))
# Drop this variable because almost every value is 'None'
dataset = dataset.drop('Hit_Object_in_Carriageway', axis=1)
# The meaning of this variable is unknown and cannot be recovered,
# so it is dropped as uninformative
dataset = dataset.drop(
'Driver_Home_Area_Type',
axis=1
)
# Encode as 0-1 whether the vehicle left the carriageway
dataset['Carriageway_Left'] = 1*(
    dataset['Vehicle_Leaving_Carriageway'] != 'Did not leave carriageway'
)
dataset = dataset.drop('Vehicle_Leaving_Carriageway', axis=1)
# Drop it, since it is not a very informative variable for the algorithm
dataset = dataset.drop('Journey_Purpose_of_Driver', axis=1)
# Junction_Location duplicates a column from the accidents dataset, so drop it
dataset = dataset.drop('Junction_Location', axis=1)
# Encode Skidding_and_Overturning as a boolean flag: True when any
# skidding or overturning occurred
dataset['Skidding_and_Overturning'] = ~dataset['Skidding_and_Overturning'].isin(
    ['-1', 'None']
)
# MISSING DATA HANDLING
# Missing values for categorical data (coded as '-1') are replaced with
# the mode of the variable (common point imputation); mode() returns a
# Series, so its first element is taken
dataset.loc[
    dataset['Vehicle_Type'] == '-1', 'Vehicle_Type'
] = dataset['Vehicle_Type'].mode()[0]
dataset.loc[
    dataset['Vehicle_Location-Restricted_Lane'] == '-1',
    'Vehicle_Location-Restricted_Lane'
] = dataset['Vehicle_Location-Restricted_Lane'].mode()[0]
dataset.loc[
    dataset['Hit_Object_off_Carriageway'] == '-1',
    'Hit_Object_off_Carriageway'
] = dataset['Hit_Object_off_Carriageway'].mode()[0]
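# A quick check, assuming '-1' is the only missing-value code in these
# columns (as the imputation above suggests): none should remain
for imputed_col in ['Vehicle_Type', 'Vehicle_Location-Restricted_Lane',
                    'Hit_Object_off_Carriageway']:
    assert (dataset[imputed_col] != '-1').all(), imputed_col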
# One-hot encode the remaining categorical variables
dataset = pd.get_dummies(dataset)
# road_type and weather_conditions are presumably numeric codes, so
# they are listed explicitly to be one-hot encoded as well
dataset = pd.get_dummies(
    dataset,
    columns=['road_type', 'weather_conditions']
)
# Drop the dummy columns generated for the '-1' missing-value code
dataset = dataset[dataset.columns.drop(list(dataset.filter(regex='-1')))]
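# Minimal check (our addition): after encoding, no object-typed
# columns should remain for scikit-learn to choke on
assert dataset.select_dtypes(include='object').columns.empty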
# Shuffle the dataset and reset the index so that positional fold
# indices from StratifiedKFold line up with the row labels
dataset = dataset.sample(frac=1, random_state=42).reset_index(drop=True)
x = dataset.drop('target', axis=1)
y = dataset['target']
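# Class balance check (our addition): the stratified folds and the
# balanced class weights below assume an imbalanced target, so make
# the class proportions explicit
print(y.value_counts(normalize=True))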
skf = model_selection.StratifiedKFold(n_splits=5)
classifier = ensemble.RandomForestClassifier(
n_estimators=100, max_depth=20, class_weight='balanced'
)
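# The hyperparameters above are fixed by hand; a hedged sketch of how
# they could be tuned instead (grid values are illustrative only):
# search = model_selection.GridSearchCV(
#     ensemble.RandomForestClassifier(class_weight='balanced'),
#     param_grid={'n_estimators': [100, 200], 'max_depth': [10, 20, 30]},
#     scoring='f1', cv=5
# )
# search.fit(x, y)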
split_indices = skf.split(x, y)
scores = []
f1_scores = []
confusion_matrices = []
for train_index, test_index in split_indices:
print("New split")
print("Train size: {}, test size: {}".format(
len(train_index), len(test_index)
))
    train_set, train_labels = x.iloc[train_index], y.iloc[train_index]
    test_set, test_labels = x.iloc[test_index], y.iloc[test_index]
classifier.fit(train_set, train_labels)
labels_pred = classifier.predict(test_set)
curr_score = metrics.accuracy_score(test_labels, labels_pred)
curr_f1 = metrics.f1_score(test_labels, labels_pred, average='binary')
conf_matrix = metrics.confusion_matrix(test_labels, labels_pred)
print("Acc: {}, F1: {}".format(curr_score, curr_f1))
print("Confusion matrix")
print(conf_matrix)
    scores.append(curr_score)
    f1_scores.append(curr_f1)
    confusion_matrices.append(conf_matrix)
print("Medias")
print("Acc: {}, F1: {}".format(np.mean(scores), np.mean(f1_scores)))