Skip to content

Commit

Permalink
Added script for processing census data
Browse files Browse the repository at this point in the history
Handles XLSX spreadsheets, outputs JSON into files based on the area
code.
  • Loading branch information
jtsymon committed Jul 30, 2016
1 parent 3b04793 commit 1a5fb98
Showing 1 changed file with 97 additions and 0 deletions.
97 changes: 97 additions & 0 deletions census.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#!/usr/bin/env python

from openpyxl import load_workbook
import re
import json
import sys
import os

# Census year whose columns are kept when the spreadsheet headers are parsed.
YEAR = '2013'

# Exactly one command-line argument (the dataset name) is required.
if len(sys.argv) != 2:
    # Report the usage error on stderr and leave through sys.exit(): the bare
    # exit() helper is injected by the site module and is not guaranteed to be
    # available (for example under "python -S").
    print(
        "Usage: census.py <dataset>\n (where dataset can be found in input-datasets)",
        file=sys.stderr,
    )
    sys.exit(1)

# Dataset name; used to build both the input workbook path and the output
# directory.
NAME = sys.argv[1]

# Open the requested dataset in read-only mode and select the sheet that holds
# the per-area-unit table.
wb = load_workbook(filename='input-datasets/%s.xlsx' % NAME, read_only=True)
ws = wb["2 Area unit"]

# A single shared iterator over the sheet rows: the header parsing and the data
# loop further down both continue from wherever this iterator currently is.
rows = iter(ws.rows)

# Discard the eight rows that sit above the header rows.
for _ in range(8):
    next(rows)

# Parse the two header rows into `headers`, a mapping of column index -> key
# path. A key path is a list of one or two strings: the category taken from
# the first header row and, when it differs, the column title from the second.
# Also locate the column holding the area unit code (`area_unit_column`).
headers = {}

area_unit_column = None
area_unit_matcher = re.compile(r'area unit code', re.I)

census_year_matcher = re.compile(r'^[\d]+ Census', re.I)

h1 = next(rows)
h2 = next(rows)
header_prefix = h1[0].value
for i, suffix_cell in enumerate(h2):
    # Read the headers for this column.
    header_suffix = suffix_cell.value
    next_prefix = h1[i].value

    # An empty cell in the first header row is treated as the continuation of
    # a merged cell, so the previous prefix is carried forward. A column that
    # is empty in both header rows marks the end of the table.
    if next_prefix is not None:
        header_prefix = next_prefix
    elif header_suffix is None:
        break

    # Skip columns that belong to a census year other than YEAR.
    if census_year_matcher.match(header_prefix) and not header_prefix.startswith(YEAR):
        continue

    # Headers generally have categories; keep both levels unless they would
    # just repeat each other. The comparison must be by value (!=): the two
    # cells hold distinct string objects, so an identity test ("is not")
    # reports equal texts as different and yields a duplicated two-level path.
    if header_suffix is None:
        header = [header_prefix]
    elif header_prefix != header_suffix:
        header = [header_prefix, header_suffix]
    else:
        header = [header_suffix]

    headers[i] = header

    # Exactly one column may carry the area unit code.
    if area_unit_matcher.match(header[0]):
        if area_unit_column is not None:
            raise Exception("Detected multiple conflicting area unit columns!")
        area_unit_column = i

if area_unit_column is None:
    raise Exception("Failed to detect area unit column!")

# Write one JSON file per data row into processed-datasets/<NAME>/, using the
# row's area unit code as the file name.
output = 'processed-datasets/%s' % NAME
os.makedirs(output, exist_ok=True)
os.chdir(output)
for row in rows:
    area_unit = row[area_unit_column].value
    # Skip empty rows (should just be the final/total row at the end).
    if area_unit is None:
        continue

    # Create the (possibly nested) data structure from the row: every key of a
    # header path except the last names a nested dict, and the last key
    # receives the cell value.
    obj = {}
    for column, path in headers.items():
        pos = obj
        for key in path[:-1]:
            pos = pos.setdefault(key, {})
        pos[path[-1]] = row[column].value

    # Write the data to a file. The code is converted to str first: if the
    # cell holds a number, open() would otherwise interpret the integer as an
    # OS-level file descriptor instead of a file name.
    # NOTE(review): a float-typed code would be written as e.g. "1234.0" -
    # confirm how the codes are stored in the spreadsheets.
    with open(str(area_unit), 'w') as fh:
        json.dump(obj, fh)

0 comments on commit 1a5fb98

Please sign in to comment.