-
Notifications
You must be signed in to change notification settings - Fork 0
/
fidas-parser.py
executable file
·233 lines (199 loc) · 8.69 KB
/
fidas-parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Fidas Frog dataparser.
Code for stripping the metadata from Fidas Frog files, converting them to plain CSV files
and merging them with GPS files.
Example:
$ python fidas-parser.py -i <inputpath> [-m <mergeheader>] [-g <gpsfile>] [-o <outputpath>]
Copyright (c) 2017, Open Lab Newcastle University, UK.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
"""
# Module-level metadata describing authorship, licensing and release state.
__author__ = "Aare Puussaar"
__copyright__ = "Copyright (c) 2017, Newcastle University, UK."
# NOTE(review): the module docstring carries BSD-style redistribution terms
# while this field says "MIT" - confirm which license actually applies.
__license__ = "MIT"
__maintainer__ = "Aare Puussaar"
__email__ = "[email protected]"
__version__ = "0.6.1"
__status__ = "Development"
import sys, getopt, datetime, os, glob
import pandas as pd
from datetime import datetime, timedelta
def addGPS(readings, gps, gheader, tformat="%Y-%m-%d %H:%M:%S"):
    """Attach the nearest-in-time GPS fix to every reading.

    Args:
        readings: pandas dataframe of readings; its ``timestamp`` column is
            cast to ``datetime64[s]``.
        gps: pandas dataframe of GPS fixes.
        gheader: name of the column in ``gps`` that holds the fix time.
        tformat: ``strptime``-style format of the values in ``gps[gheader]``.

    Returns:
        A new dataframe holding the readings, a ``nearest`` column with the
        matched GPS time, and the columns of ``gps`` left-joined on that time.
        Neither input dataframe is modified.
    """
    # Work on copies so the caller's frames are left untouched.
    readings = readings.copy()
    gps = gps.copy()

    readings["timestamp"] = readings["timestamp"].astype("datetime64[s]")

    # Parse the GPS times and truncate them to whole seconds.
    gps_times = pd.to_datetime(gps[gheader], format=tformat)
    gps[gheader] = gps_times.values.astype("datetime64[s]")

    # The nearest-neighbour lookup below needs a unique, monotonic index:
    # drop unparseable times, keep the last fix of every second, sort by time.
    gps = gps.dropna(subset=[gheader])
    gps = gps.drop_duplicates(gheader, keep="last")
    gps = gps.sort_values(gheader)

    # Map every GPS time onto itself, then look up the closest one per reading.
    gps_dt = pd.Series(gps[gheader].values, index=gps[gheader])
    readings["nearest"] = gps_dt.reindex(
        readings["timestamp"], method="nearest"
    ).values

    return pd.merge(
        readings, gps, how="left", left_on=["nearest"], right_on=[gheader]
    )
def getStarts(source):
    """Read the metadata preamble of a sensor file.

    The file is scanned line by line until the first line that starts with
    ``timestamp`` (the header row of the data table).

    Args:
        source: path of the input file (read as ISO-8859-1).

    Returns:
        starttime: start of the readings in seconds since the epoch, parsed
            from the ``Start at:`` line (``%d/%m/%Y - %H:%M:%S``).
        loc: number of lines that precede the ``timestamp`` header row; if no
            such row exists this is the total number of lines in the file.
        deviceid: text following ``Operator:``, used as the device id.

    Raises:
        ValueError: if the preamble has no ``Start at:`` or ``Operator:`` line.
    """
    start_line = None
    operator_line = None
    loc = 0
    with open(source, encoding="ISO-8859-1") as f:
        for line in f:
            if line.startswith("Start at:"):
                start_line = line
            if line.startswith("Operator:"):
                operator_line = line
            if line.startswith("timestamp"):
                break
            loc += 1

    # Fail with a clear message instead of an unbound-variable error.
    if start_line is None:
        raise ValueError("No 'Start at:' line found in " + str(source))
    if operator_line is None:
        raise ValueError("No 'Operator:' line found in " + str(source))

    # Split on the first colon only: the time itself contains colons.
    start_text = start_line.split(":", 1)[1].strip()
    starttime = pd.to_datetime(start_text, format="%d/%m/%Y - %H:%M:%S")
    deviceid = operator_line.split(":", 1)[1].strip()

    # Timestamp.value is in nanoseconds; convert to seconds.
    starttime = starttime.value / 10**9
    return starttime, loc, deviceid
def convertTime(data, starttime, human=True, version=2):
    """Turn the ``timestamp`` column of the readings into absolute time.

    Args:
        data: pandas dataframe object of readings; modified in place.
        starttime: start time of the readings, added to every timestamp when
            ``version`` is below 2.
        human: when true, cast the timestamps to ``datetime64[s]``.
        version: file format version; only versions below 2 get ``starttime``
            added to their timestamps.

    Returns:
        data: the same dataframe object with its ``timestamp`` column updated.
    """
    needs_offset = version < 2
    if needs_offset:
        shifted = data.timestamp + starttime
        data.timestamp = shifted.astype(int)

    if not human:
        return data

    data.timestamp = data.timestamp.astype("datetime64[s]")
    return data
def privacyZone(data, minutes):
    """Cut off the start and the end of a dataset.

    Rows are kept only when their timestamp lies strictly between
    ``earliest + minutes`` and ``latest - minutes``.

    Args:
        data: pandas dataframe object of readings with a datetime
            ``timestamp`` column.
        minutes: minutes to cut off at each end.

    Returns:
        data: the filtered rows of ``data``; the original index labels are
            preserved.
    """
    stamps = data["timestamp"]
    margin = timedelta(minutes=minutes)
    tmin = stamps.min() + margin
    tmax = stamps.max() - margin
    # Vectorised comparison; both bounds are exclusive.
    return data[(stamps > tmin) & (stamps < tmax)]
def processFile(filepath, gpsfile, gpsheader, outputpath):
    """Convert one sensor file to a plain CSV file, optionally merged with GPS.

    The output file is named ``<input basename>_id-<deviceid>.csv`` and is
    written to ``outputpath``, or to the current directory when ``outputpath``
    is None.

    Args:
        filepath: filepath to the sensor file.
        gpsfile: filepath to the GPS file; required when ``gpsheader`` is set.
        gpsheader: header column of the GPS data file to index by, or None to
            skip the GPS merge.
        outputpath: filepath of the output directory, or None.

    Raises:
        ValueError: if ``gpsheader`` is given without ``gpsfile``.
    """
    print('Working on ' + filepath)
    start, loc, deviceid = getStarts(filepath)
    # Skip the metadata preamble; the first remaining row is the table header.
    data = pd.read_csv(filepath, skiprows=loc, index_col=False, sep='\t',
                       header=0, encoding='ISO-8859-1')
    data = convertTime(data, start)
    # Add the device id.
    data["device_id"] = deviceid

    if gpsheader is not None:
        if gpsfile is None:
            raise ValueError(
                "A GPS file (-g) is required when a merge header (-m) is given.")
        gps = pd.read_csv(gpsfile, sep=',', header=0)
        data = addGPS(data, gps, gpsheader)

    # Clean up column names: lower case, no colons, underscores for spaces.
    data.columns = (
        data.columns.str.lower()
        .str.replace(":", "", regex=False)
        .str.strip()
        .str.replace(" ", "_", regex=False)
    )

    # Custom header. The coordinates only exist after a GPS merge, so they are
    # exported only when present; the sensor columns are always required.
    header = ["timestamp", "pm_1", "pm_2.5", "pm_4", "pm_10", "pm_tot.", "dcn"]
    header += [name for name in ("latitude", "longitude")
               if name in data.columns]

    outname = os.path.basename(filepath) + '_id-' + deviceid + '.csv'
    if outputpath is None:
        outputpath = outname
    else:
        outputpath = os.path.join(outputpath, outname)

    data.to_csv(outputpath, index=False, columns=header, encoding='utf-8',
                date_format='%Y-%m-%d %H:%M:%S')
    print("Successfully written", outputpath)
def main(argv):
    """Command-line entry point.

    Options:
        -h            print the usage line and exit.
        -i, --ipath   sensor file, or a folder holding ``.txt`` sensor files.
        -m, --mform   header column of the GPS file to merge on.
        -g, --gfile   GPS file.
        -o, --ofile   output directory.

    When the input is a folder, every ``.txt`` file in it is processed and all
    CSV files found in the output directory (the current directory when no
    output directory is given) are then concatenated into ``combined.csv``.

    Args:
        argv: command-line arguments without the program name.
    """
    usage = ('usage: fidas-parser.py -i <inputpath> [-m <mergeformat>] '
             '[-g <gpsfile>] [-o <outputpath>]')
    inputpath = None
    outputpath = None
    gpsheader = None
    gpsfile = None

    if len(argv) < 1:
        print(usage)
        sys.exit(2)
    try:
        opts, args = getopt.getopt(
            argv, "hi:m:g:o:", ["ipath=", "mform=", "gfile=", "ofile="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)

    # getopt reports long options as "--name", without the trailing "=".
    for opt, arg in opts:
        if opt == '-h':
            print('python fidas-parser.py -i <inputpath> [-m <mergeformat>] [-g <gpsfile>] [-o <outputpath>]')
            sys.exit()
        elif opt in ("-i", "--ipath"):
            inputpath = arg
        elif opt in ("-o", "--ofile"):
            outputpath = arg
        elif opt in ("-m", "--mform"):
            gpsheader = arg
        elif opt in ("-g", "--gfile"):
            gpsfile = arg

    # The input path is mandatory.
    if inputpath is None:
        print(usage)
        sys.exit(2)

    if os.path.isdir(inputpath):
        for filename in os.listdir(inputpath):
            if filename.endswith(".txt"):
                filepath = os.path.join(inputpath, filename)
                processFile(filepath, gpsfile, gpsheader, outputpath)

        # Merge files. A combined.csv left over from an earlier run is
        # excluded so that its rows are not merged a second time.
        outdir = "" if outputpath is None else outputpath
        pattern = os.path.join(glob.escape(outdir), "*.csv")
        csvfiles = sorted(
            path for path in glob.glob(pattern)
            if os.path.basename(path) != "combined.csv"
        )
        if not csvfiles:
            print("No CSV files to merge.")
            return
        frames = [pd.read_csv(path, index_col=None, header=0)
                  for path in csvfiles]
        frame = pd.concat(frames)
        frame.to_csv(os.path.join(outdir, "combined.csv"), index=False)
    elif os.path.isfile(inputpath):
        processFile(inputpath, gpsfile, gpsheader, outputpath)
    else:
        print("Please provide correct path to a file or folder containing readings.")
        print(usage)
# Run the command-line entry point only when this file is executed as a
# script, passing every argument after the program name.
if __name__ == "__main__":
    main(sys.argv[1:])