-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added script for processing census data
Handles XLSX spreadsheets, outputs JSON into files based on the area code.
- Loading branch information
Showing
1 changed file
with
97 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
#!/usr/bin/env python | ||
|
||
from openpyxl import load_workbook | ||
import re | ||
import json | ||
import sys | ||
import os | ||
|
||
# Census year whose columns are kept; columns headed by any other
# "<year> Census" prefix are dropped while parsing the headers.
YEAR = '2013'

# Number of rows above the two header rows that carry no tabular data.
PREAMBLE_ROWS = 8

AREA_UNIT_MATCHER = re.compile(r'area unit code', re.I)
CENSUS_YEAR_MATCHER = re.compile(r'^[\d]+ Census', re.I)


def parse_headers(prefixes, suffixes, year=YEAR):
    """Combine the two header rows into per-column key paths.

    ``prefixes`` holds the cell values of the first header row (category
    names; empty cells inherit the last non-empty value to their left, which
    is how merged cells appear) and ``suffixes`` holds the cell values of the
    second header row.

    Returns a ``(headers, area_unit_column)`` tuple where ``headers`` maps a
    column index to a list of one or two keys, and ``area_unit_column`` is the
    index of the column whose first key starts with "area unit code"
    (case-insensitive).

    Raises ``ValueError`` when no area unit column, or more than one, is
    found.
    """
    headers = {}
    area_unit_column = None
    current_prefix = None

    for index, suffix in enumerate(suffixes):
        # The prefix row may be shorter than the suffix row.
        next_prefix = prefixes[index] if index < len(prefixes) else None

        if next_prefix is not None:
            current_prefix = next_prefix
        elif suffix is None:
            # Both header cells empty: we have run past the last column.
            break

        # Skip columns that belong to a census year other than `year`.
        # Only strings can be matched; other values are never census labels.
        if (isinstance(current_prefix, str)
                and CENSUS_YEAR_MATCHER.match(current_prefix)
                and not current_prefix.startswith(year)):
            continue

        if current_prefix is None:
            header = [suffix]
        elif suffix is None or current_prefix == suffix:
            # Compare by value: equal text in both rows is a single key.
            header = [current_prefix]
        else:
            header = [current_prefix, suffix]
        headers[index] = header

        if isinstance(header[0], str) and AREA_UNIT_MATCHER.match(header[0]):
            if area_unit_column is not None:
                raise ValueError(
                    "Detected multiple conflicting area unit columns!")
            area_unit_column = index

    if area_unit_column is None:
        raise ValueError("Failed to detect area unit column!")

    return headers, area_unit_column


def build_record(headers, values):
    """Build the (possibly nested) dict for one data row.

    ``headers`` is the mapping produced by ``parse_headers`` and ``values``
    is the list of cell values of the row. Columns missing from a short row
    are stored as ``None``.
    """
    record = {}
    for column, path in headers.items():
        node = record
        # Descend through (and create) the intermediate category dicts.
        for key in path[:-1]:
            node = node.setdefault(key, {})
        node[path[-1]] = values[column] if column < len(values) else None
    return record


def output_filename(area_unit):
    """Return the file name used for an area unit code.

    The value is always converted to ``str``: passing an integer straight to
    ``open()`` would be interpreted as a file descriptor, not a name.
    Integral floats are written without the trailing ``.0``.
    """
    if isinstance(area_unit, float) and area_unit.is_integer():
        area_unit = int(area_unit)
    return str(area_unit)


def main(argv=None):
    """Convert one census spreadsheet into one JSON file per area unit.

    ``argv`` defaults to ``sys.argv`` and must contain exactly one dataset
    name after the program name. Reads ``input-datasets/<dataset>.xlsx``
    (sheet "2 Area unit") and writes ``processed-datasets/<dataset>/<code>``.

    Returns the process exit status: 0 on success, 1 on a usage error.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) != 2:
        print(
            "Usage: census.py <dataset>\n (where dataset can be found in input-datasets)",
            file=sys.stderr,
        )
        return 1
    name = argv[1]

    workbook = load_workbook(
        filename='input-datasets/%s.xlsx' % name,
        read_only=True,
    )
    try:
        rows = iter(workbook["2 Area unit"].rows)

        # Skip the preamble at the top of the table.
        for _ in range(PREAMBLE_ROWS):
            next(rows)

        prefixes = [cell.value for cell in next(rows)]
        suffixes = [cell.value for cell in next(rows)]
        headers, area_unit_column = parse_headers(prefixes, suffixes)

        output = 'processed-datasets/%s' % name
        os.makedirs(output, exist_ok=True)

        for row in rows:
            values = [cell.value for cell in row]
            area_unit = (values[area_unit_column]
                         if area_unit_column < len(values) else None)
            # Skip rows without an area unit code (should just be the
            # final/total row at the end).
            if area_unit is None:
                continue
            path = os.path.join(output, output_filename(area_unit))
            with open(path, 'w') as fh:
                json.dump(build_record(headers, values), fh)
    finally:
        # Read-only workbooks keep the source file open until closed.
        workbook.close()

    return 0


if __name__ == '__main__':
    sys.exit(main())