#!/usr/bin/env python
# Paul Sladen <[email protected]> - 2014-05-27
# This programme is hereby placed in the public domain, and may be
# distributed and modified without limit, for any purpose.
#
# ./pdf-to-csv.py > inmarsat-su-log-redacted.csv
#
# Tool to deterministically extract log records from the redacted
# Signalling Unit logfile distributed in PDF format on 23 May 2014
# and published by Inmarsat and the Malaysia DCA on 27 May 2014:
published_url = 'http://www.dca.gov.my/mainpage/MH370%20Data%20Communication%20Logs.pdf'
sha1_checksum = '7b49c5fdec65055ba0a866ccbe468fd38df0d67d'
file_datetime = '2014-03-27T03:04:37Z'
# Sanity check (from `pdftotext *.pdf - | grep -c '/03/2014'`)
# The minus one is the duplicated example on the front page
total_records = 1116 - 1
local_filename = 'MH370 Data Communication Logs.pdf'
# The aircraft
callsign = "9M-MRO"
icao_hex = 0x75008F
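# NB: the logfile stores the AES ID in octal, hence the int(..., 8) and
# oct(...) conversions further down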
# Logfile time-stamp format
timestamp_format = "%d/%m/%Y %H:%M:%S.%f"
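# i.e. day/month/year, then a time with fractional seconds; e.g. the
# string '7/03/2014 16:42:04.408' parses under this format (the date
# here is illustrative, the time is the unredacted example from page 1)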
# Other partial sources:
# http://www.cnn.com/video/data/2.0/video/world/2014/05/27/pkg-quest-inmarsat-data.cnn.html
# has a number of brief shots of the original textual .csv (~5 pages, A4, word-wrapped);
# some numbers can probably be cross-referenced and filled in from this
import subprocess
import hashlib
import re
import time
import urllib2
import os
import sys
import datetime
def sanity_check(pdf_contents, pdf_checksum, pdftotext):
    # Check that the PDF input file you have matches the one used originally
    hasher = hashlib.sha1()
    hasher.update(pdf_contents)
    assert pdf_checksum == hasher.hexdigest()
    # Check that the version of the 'pdftotext' command-line tool matches
    # (other versions of pdftotext will work)
    assert pdftotext == subprocess.check_output(
        ['pdftotext', '-v'],
        stderr=subprocess.STDOUT).splitlines()[0].split(' ', 1)[1]
def timestamp(record):
    # Parse the logfile's timestamp format, for when we need it
    return datetime.datetime.strptime(record.split(',', 1)[0], timestamp_format)
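# A usage sketch (the record prefix here is illustrative, not a line
# from the real log):
#   timestamp('7/03/2014 16:42:04.408,...')
#     -> datetime.datetime(2014, 3, 7, 16, 42, 4, 408000)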
def main():
    global local_filename
    # Use the name given on the command line, if any
    if len(sys.argv) > 1:
        local_filename = sys.argv[1]
    # Do we already have a copy of the Inmarsat PDF?
    if not os.path.isfile(local_filename):
        # If not, fetch the PDF and cache it locally
        connection = urllib2.urlopen(published_url)
        open(local_filename, 'wb').write(connection.read())
    # (Try to) open the local PDF file
    pdf = open(local_filename, 'rb').read()
    # Be deterministic, and check everyone has the same starting point:
    # same input file, and same tools
    sanity_check(pdf, sha1_checksum, pdftotext='version 0.18.4')
    # Convert the "logfile" PDF to a textfile, to start to work from
    # (Note that we could work from the raw PDF stream, or from HTML instead)
    text = subprocess.check_output(['pdftotext', '-layout', local_filename, '-'])
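    # ('-layout' asks pdftotext to preserve the physical page layout,
    # which the whitespace-based column-splitting below depends upon)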
    pages = text.split('\f')
    # Start collecting!
    redacted_records = []
    # Page 1 is the introduction, with the real logfile headings;
    # it also carries one full (unredacted) example record
    for p in pages[0:1]:
        line = p.splitlines()
        # Recover the column headings from within the preamble blurb.
        # The headings span three lines, and 'Frame Number' is split
        # across the first two, so a ' ' space needs retaining there
        field_headings = (line[14].lstrip() + ' ' + line[15].lstrip() + line[16].lstrip()).replace(', ', ',')
        # Cache the only (complete) unredacted log record, from 16:42:04.408
        full_164204_408 = ''.join(map(str.lstrip, line[20:22]))
        assert icao_hex == int(full_164204_408.split(',')[1], 8)
    # Page 2 is a page trying to explain doppler correction based on
    # IMU-derived location, while failing to mention the existence of
    # the IMU/IMS.
    # Pages 3 onwards are the redacted log, with varying degrees of
    # reformatting.
    for p in pages[2:]:
        lines = p.splitlines()
        # (Unused alternative: locate the BFO/BTO columns from their
        # headings, for use when padding out short records below)
        #for l in lines:
        #    try:
        #        bfo_column = l.index(' BFO') - 2
        #        break
        #    except ValueError: continue
        #for l in lines:
        #    try:
        #        bto_column = l.index(' BTO') - 2
        #        break
        #    except ValueError: continue
        for i, l in enumerate(lines):
            if l.lstrip()[1:].startswith('/03/2014'):
                commas = re.sub(r'\s{2,}', ',', l.lstrip())
                # Cope with the lack of a double-space in the PDF before 'IOR' in some records
                commas = commas.replace(' IOR', ',IOR')
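                # Illustration of the two substitutions above (the field
                # values here are hypothetical):
                #   '7/03/2014 16:42:04.408   <Channel Name>   IOR ...'
                # has now become
                #   '7/03/2014 16:42:04.408,<Channel Name>,IOR,...'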
                # Alternatively, try to look for missing numbers and add commas:
                #if (len(l) < bfo_column) or l.find('0x20 - Access Request') > 0:
                #    commas += ','
                #if (len(l) < bto_column) or l.find('0x20 - Access Request') > 0:
                #    commas += ','
                # Cope with vertical line-wrap across three lines in the PDF
                # (vertical centring within spreadsheet cells)
                if lines[i-1].find('0x10 - Log-on Request') >= 0:
                    separated = commas.split(',')
                    # Half of the message appears on the /previous/ text line,
                    # and half on the /following/ line
                    prev_line = lines[i-1].strip()
                    next_line = lines[i+1].strip()
                    # Insert where the sixth field would have been
                    separated.insert(6, prev_line + ' ' + next_line)
                    # Reassemble the repaired line
                    commas = ','.join(separated)
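                    # (Illustration of the repair above: the PDF vertically
                    # centres the wrapped 'SU Type' cell of a log-on record,
                    # so its text spills onto the text lines above and below
                    # the record itself; the two halves are rejoined and
                    # re-inserted here as the sixth field)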
                # Add trailing commas to fill out the empty (redacted)
                # BFO/BTO fields to nine columns, checking that it is
                # only those columns which are missing
                missing_fields = 9 - len(commas.split(','))
                assert missing_fields <= 3
                commas += ',' * missing_fields
                redacted_records.append(commas)
    # For debugging: dump the redacted data only.
    # This was used for diffing the output against 'davetaz's extraction, with:
    #   diff --strip-trailing-cr -du
    if False:
        print '\n'.join(sorted(redacted_records, key=timestamp))
        sys.exit()
    # Space the redacted data out to match the full logfile field layout.
    # There are several columns for which we don't have the data;
    # the only one we can fill out is the ICAO 24-bit AES ID.
    full_records = []
    # Save the SITA Owner for filling the missing field
    owner = full_164204_408.split(',')[2]
    for r in redacted_records:
        field = r.split(',')
        reordered_record = ','.join(
            field[0:1] +          # timestamp
            [oct(icao_hex)[1:]] + # 24-bit octal ICAO AES ID, no leading zero - insert
            [owner] +             # Owner - insert
            field[1:4] +          # Channel Name, Ocean Region, GES ID
            1 * [''] +            # Bearer
            field[4:6] +          # Channel ID, Channel Type
            4 * [''] +            # Framing
            field[6:7] +          # SU Type
            11 * [''] +           # (redacted fields: content and RSST)
            field[7:8] +          # BFO
            1 * [''] +            # Estimated BER
            field[8:])            # BTO
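        # (The reordered record should now have 28 comma-separated fields,
        # 1+1+1+3+1+2+4+1+11+1+1+1 from the insertions above, matching the
        # layout of the unredacted example record)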
        # Patch in the full-form log entry from page 1 at the correct place
        if field[0] == full_164204_408.split(',')[0]:
            reordered_record = full_164204_408
        full_records.append(reordered_record)
    # Re-sort into logfile time order, to cope with the circuit-switched
    # calls being split out into an appendix on the last pages.
    sorted_full_records = sorted(full_records, key=timestamp)
    # Output a sorted, usable .csv logfile,
    # in approximately the original logfile format
    assert len(sorted_full_records) == total_records
    sorted_full_records.insert(0, field_headings)
    print '\n'.join(sorted_full_records)
if __name__ == '__main__':
    main()