-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpassengers_up_preprocessing.py
235 lines (174 loc) · 8.24 KB
/
passengers_up_preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
import pickle
import pandas as pd
import numpy as np
from util import haversine
from scipy.stats import gaussian_kde
import numpy as np
import pandas as pd
def add_density(df: pd.DataFrame, kde_path: str = 'stations_density.pkl') -> pd.DataFrame:
    """
    Estimate a 2D spatial density at every station via Gaussian KDE and
    attach it as a 'density' column.

    The KDE is fitted on the unique stations only (so stations served by
    many trips do not skew the estimate); the per-station density is then
    merged back onto every row of the original frame.

    :param df: DataFrame with columns ['station_id', 'latitude', 'longitude']
    :param kde_path: where to pickle the fitted KDE model (default keeps the
                     original hard-coded path for backward compatibility)
    :return: df with an extra 'density' column
    """
    # One row per station: density is a property of the location, not the trip.
    stations = df.drop_duplicates('station_id').copy()
    # gaussian_kde expects shape (n_dims, n_points): row 0 = lon, row 1 = lat.
    coords = np.vstack([stations['longitude'], stations['latitude']])
    kde = gaussian_kde(coords)
    # Persist the fitted model so later runs can reload it instead of refitting.
    with open(kde_path, 'wb') as file:
        pickle.dump(kde, file)
    # Evaluate the density at each unique station location.
    # (Original applied .T to this 1-D result — a no-op, removed.)
    stations.loc[:, 'density'] = kde(coords)
    # Broadcast the per-station density back to every original row.
    df = df.merge(stations[['station_id', 'density']], on='station_id', how='left')
    return df
def count_lines_per_station(df: pd.DataFrame) -> pd.DataFrame:
    """
    Attach, per station, the number of distinct bus lines serving it.

    :param df: schedule frame with columns ['station_id', 'line_id']
    :return: same frame with an extra 'line_count' column
    """
    # Unique-line tally per station, kept as regular columns for the merge.
    per_station = (
        df.groupby('station_id', as_index=False)['line_id']
        .nunique()
        .rename(columns={'line_id': 'line_count'})
    )
    # Left-merge so every original row keeps its station's tally.
    return df.merge(per_station, on='station_id', how='left')
def remove_rows(df: pd.DataFrame) -> pd.DataFrame:
    """Placeholder for row-filtering logic.

    NOTE(review): not implemented — currently returns None. It is never
    called in this file; do not wire it into the pipeline until it is filled in.
    """
    pass
def calculate_time_to_next_station():
    """Placeholder — superseded by add_time_from_next_prev(), which computes
    'time_to_next_station' directly. Never called in this file."""
    pass
def load_data(file_path='train_bus_schedule.csv'):
    """Read a schedule CSV (Hebrew Windows encoding) into a DataFrame.

    :param file_path: path of the CSV file to read
    :return: raw DataFrame as parsed by pandas
    """
    return pd.read_csv(file_path, encoding="ISO-8859-8")
def calculate_nearby_stations(unique_stations):
    """
    Build a per-station counter of neighbouring stations within a 1 km radius.

    :param unique_stations: frame of unique stations with 'latitude'/'longitude'
    :return: callable mapping one station row to its neighbour count
    """
    def count_neighbours(station):
        # Distance from this station to every station in the frame.
        dists = unique_stations.apply(
            lambda other: haversine(station['latitude'], station['longitude'],
                                    other['latitude'], other['longitude']),
            axis=1,
        )
        # The station is 0 distance from itself, so drop that self-match.
        return (dists <= 1).sum() - 1
    return count_neighbours
def calculate_time_diff(arrival_time, previous_arrival_time):
    """
    Seconds elapsed between two arrival times (pandas Timedeltas).

    A negative raw difference is treated as a wrap past midnight, so one
    day is added back (e.g. 23:50 -> 00:10 yields 1200 s, not -84600 s).

    :param arrival_time: later arrival time (Timedelta since midnight)
    :param previous_arrival_time: earlier arrival time (Timedelta since midnight)
    :return: non-negative seconds as float, or NaN if either input is missing
    """
    # FIX: guard BOTH operands. The caller (add_time_from_next_prev) also
    # passes a shifted, possibly-NaT value as the FIRST argument when
    # computing time_to_next_station at the end of a trip.
    if pd.isnull(arrival_time) or pd.isnull(previous_arrival_time):
        return np.nan
    diff = arrival_time - previous_arrival_time
    if diff.total_seconds() >= 0:
        return diff.total_seconds()
    # Negative difference: the trip crossed midnight; shift forward one day.
    return (pd.Timedelta('1 days') + diff).total_seconds()
def add_time_from_next_prev(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add the seconds elapsed from the previous stop and until the next stop.

    :param df: frame with 'trip_id_unique' and 'arrival_time' (Timedelta)
    :return: frame with helper shift columns plus
             'time_from_prev_station' / 'time_to_next_station'
    """
    arrivals_by_trip = df.groupby('trip_id_unique')['arrival_time']
    # Neighbouring arrival times within the same trip (NaT at trip edges).
    df['previous_arrival_time'] = arrivals_by_trip.shift(1)
    df['next_arrival_time'] = arrivals_by_trip.shift(-1)
    # Row-wise differences in seconds; midnight wrap is handled by the helper.
    df['time_from_prev_station'] = df.apply(
        lambda r: calculate_time_diff(r['arrival_time'], r['previous_arrival_time']),
        axis=1,
    )
    df['time_to_next_station'] = df.apply(
        lambda r: calculate_time_diff(r['next_arrival_time'], r['arrival_time']),
        axis=1,
    )
    return df
def add_distance_from_prev_next(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add the haversine distance to the previous and next stop of each trip.

    Values are multiplied by 1000 (presumably util.haversine returns km and
    this converts to meters — TODO confirm the helper's units).

    :param df: frame with ['trip_id_unique', 'station_index',
                           'latitude', 'longitude']
    :return: sorted frame with helper shift columns plus
             'distance_from_prev' / 'distance_from_next' (0 at trip edges)
    """
    # Order stops within each trip so shift() pairs true neighbours.
    df = df.sort_values(by=['trip_id_unique', 'station_index'])
    df['prev_latitude'] = df.groupby('trip_id_unique')['latitude'].shift()
    df['prev_longitude'] = df.groupby('trip_id_unique')['longitude'].shift()
    df['next_latitude'] = df.groupby('trip_id_unique')['latitude'].shift(-1)
    df['next_longitude'] = df.groupby('trip_id_unique')['longitude'].shift(-1)
    df['distance_from_prev'] = df.apply(
        lambda row: haversine(row['prev_latitude'], row['prev_longitude'],
                              row['latitude'], row['longitude'])
        if pd.notnull(row['prev_latitude']) else 0, axis=1) * 1000
    # BUG FIX: the null-check must look at next_latitude (missing at the LAST
    # stop of a trip), not prev_latitude — the original gave the first stop a
    # NaN distance_from_next and fed NaN coordinates to haversine at the last.
    df['distance_from_next'] = df.apply(
        lambda row: haversine(row['latitude'], row['longitude'],
                              row['next_latitude'], row['next_longitude'])
        if pd.notnull(row['next_latitude']) else 0, axis=1) * 1000
    return df
def map_time_period(hour):
    """Map an hour of day (0-23) to a coarse period-of-day label."""
    # Half-open 4-hour buckets; anything outside 07:00-23:00 is 'Night'.
    buckets = (
        (7, 11, 'Morning'),
        (11, 15, 'Noon'),
        (15, 19, 'Afternoon'),
        (19, 23, 'Evening'),
    )
    for lower, upper, label in buckets:
        if lower <= hour < upper:
            return label
    return 'Night'
def add_time_category(df: pd.DataFrame) -> pd.DataFrame:
    """
    Derive hour features and a coarse time-of-day category per trip.

    :param df: frame with 'trip_id_unique' and 'arrival_time' (Timedelta)
    :return: frame with added 'hour', 'start_hour' and 'time_period' columns
    """
    # Whole hours since midnight for each individual arrival.
    df['hour'] = (df['arrival_time'].dt.total_seconds() / 60 / 60).astype(int)
    # Hour-of-day at which each trip began (earliest arrival per trip).
    first_arrival = df.groupby('trip_id_unique')['arrival_time'].transform('min')
    df['start_hour'] = first_arrival.dt.components['hours']
    # Bucket the start hour into Morning/Noon/Afternoon/Evening/Night.
    df['time_period'] = df['start_hour'].apply(map_time_period)
    return df
def add_total_trip_distance(df: pd.DataFrame) -> pd.DataFrame:
    """
    Attach the total travelled distance of each trip to all of its rows.

    :param df: frame with ['trip_id_unique', 'station_index', 'distance_from_prev']
    :return: sorted frame with an extra 'total_distance' column
    """
    df = df.sort_values(by=['trip_id_unique', 'station_index'])
    # Sum the leg distances per trip, then rename for the merge.
    totals = (
        df.groupby('trip_id_unique', as_index=False)['distance_from_prev']
        .sum()
        .rename(columns={'distance_from_prev': 'total_distance'})
    )
    # Inner merge on trip id (every row has a trip, so no rows are lost).
    return df.merge(totals, on='trip_id_unique')
def add_lines_dummies(df: pd.DataFrame) -> pd.DataFrame:
    """One-hot encode the 'line_id' and 'direction' columns."""
    categorical = ['line_id', 'direction']
    return pd.get_dummies(df, columns=categorical)
def add_max_line_count(df: pd.DataFrame) -> pd.DataFrame:
    """
    For every (line, direction) pair, attach the largest 'line_count'
    observed along that route.

    :param df: frame with columns ['line_id', 'direction', 'line_count']
    :return: frame with an extra 'max_line_count' column
    """
    # Per-route maximum, kept as regular columns for the merge.
    maxima = (
        df.groupby(['line_id', 'direction'], as_index=False)['line_count']
        .max()
        .rename(columns={'line_count': 'max_line_count'})
    )
    # Left-merge so every original row keeps its route's maximum.
    return df.merge(maxima, on=['line_id', 'direction'], how='left')
def save_as_csv(df: pd.DataFrame):
    """Write the preprocessed frame to disk in the Hebrew Windows encoding.

    NOTE(review): the index is written too (no index=False), so a reload
    gains an 'Unnamed: 0' column — kept as-is since downstream readers may
    expect the current file shape.
    """
    out_path = 'passengers_up_preprocessed.csv'
    df.to_csv(out_path, encoding="ISO-8859-8")
def preprocessing(df: pd.DataFrame) -> pd.DataFrame:
    """
    Run the full feature-engineering pipeline on the raw schedule frame.

    Steps: parse arrival times, add time-of-day features, per-leg distances,
    station density, line counts, per-route line-count maxima and total trip
    distance; drop raw/helper columns; strip outliers; write the result to
    'passengers_up_preprocessed.csv' as a side effect (via save_as_csv).

    :param df: raw frame as returned by load_data()
    :return: preprocessed frame
    """
    # 'HH:MM:SS' strings -> Timedelta so time arithmetic works downstream.
    df['arrival_time'] = pd.to_timedelta(df['arrival_time'])
    df = add_time_category(df)
    # add_distance_from_prev_next sorts by trip and station index internally.
    df = add_distance_from_prev_next(df)
    # Group by trip_id_unique and sum distances
    # df = add_total_trip_distance(df)
    df = add_density(df)
    df = count_lines_per_station(df)
    # dummies for lines
    df = add_max_line_count(df)
    # df = add_lines_dummies(df)
    df = add_total_trip_distance(df)
    # Drop raw coordinates and helper shift columns now that features exist.
    df = df.drop(['alternative', 'door_closing_time', 'passengers_continue_menupach',
                  'prev_latitude', 'prev_longitude', 'next_latitude', 'next_longitude',
                  'latitude', 'longitude'], axis=1)
    df = adjust_rows(df)
    save_as_csv(df)
    return df
def adjust_rows(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean value-level anomalies: flip negative passenger counts to positive
    and drop outlier rows on selected columns.

    :param df: preprocessed frame
    :return: frame with outlier rows removed
    """
    # Negative continue-counts are treated as sign errors, not removed.
    df['passengers_continue'] = df['passengers_continue'].abs()
    # distance & time from prev outliers
    # distance & time from next outliers
    # Drop rows outside 2 standard deviations on each of these columns.
    for column in ('total_distance', 'passengers_continue'):
        df = kill_outlires(df, column)
    return df
def kill_outlires(df: pd.DataFrame, field) -> pd.DataFrame:
    """
    Keep only rows whose `field` value lies within two standard deviations
    of the column mean (bounds inclusive).

    :param df: input frame
    :param field: name of the numeric column to filter on
    :return: filtered frame (original index preserved)
    """
    center = df[field].mean()
    spread = df[field].std()
    lower, upper = center - 2 * spread, center + 2 * spread
    # Series.between is inclusive on both ends, matching >= / <=.
    within_bounds = df[field].between(lower, upper)
    return df[within_bounds]
def create_db():
    """Load the raw schedule, preprocess it and persist the result.

    NOTE(review): preprocessing() already calls save_as_csv internally, so
    the output file is written twice — kept to preserve current behavior.
    """
    raw = load_data()
    processed = preprocessing(raw)
    save_as_csv(processed)
if __name__ == '__main__':
    # Build the preprocessed CSV, then reload it to sanity-check the output.
    create_db()
    df = load_data('passengers_up_preprocessed.csv')
    print(df.head(10))