#!/usr/bin/env python
"""Process a census XLSX spreadsheet into one JSON file per area unit.

Handles XLSX spreadsheets, outputs JSON into files based on the area code.

Running ``census.py NAME`` reads ``input-datasets/NAME.xlsx`` (worksheet
"2 Area unit") and writes one JSON document per data row into
``processed-datasets/NAME/``, named after the row's area unit code.
"""

# Provenance: reconstructed from the patch "Added script for processing
# census data" (Jeremy Symon, 2016-07-30, commit 1a5fb98).

import re
import json
import sys
import os

# Census year to keep; columns headed by any other "<year> Census" are dropped.
YEAR = '2013'

# Worksheet holding the per-area-unit table.
SHEET_NAME = "2 Area unit"

# Number of non-table rows above the two header rows.
PREAMBLE_ROWS = 8

AREA_UNIT_MATCHER = re.compile(r'area unit code', re.I)
CENSUS_YEAR_MATCHER = re.compile(r'^[\d]+ Census', re.I)


def _cell_value(cells, index):
    """Return ``cells[index].value``, or None when the row is too short."""
    if index < len(cells):
        return cells[index].value
    return None


def _is_other_census_year(prefix, year):
    """Tell whether *prefix* is a "<year> Census" heading for a year != *year*."""
    return (
        isinstance(prefix, str)
        and CENSUS_YEAR_MATCHER.match(prefix) is not None
        and not prefix.startswith(year)
    )


def parse_headers(h1, h2, year=YEAR):
    """Build the column -> header-path mapping from the two header rows.

    Args:
        h1: cells of the upper header row (category headings; a merged
            heading only has a value in its first column).
        h2: cells of the lower header row (per-column headings).
        year: census year whose columns are kept.

    Returns:
        A ``(headers, area_unit_column)`` tuple. ``headers`` maps a column
        index to a list of one or two keys (category, then column heading);
        ``area_unit_column`` is the index of the area unit code column.

    Raises:
        Exception: if no area unit column, or more than one, is found.
    """
    headers = {}
    area_unit_column = None
    header_prefix = _cell_value(h1, 0)

    for i, suffix_cell in enumerate(h2):
        header_suffix = suffix_cell.value
        next_prefix = _cell_value(h1, i)

        # An empty upper cell means the previous (merged) category still
        # applies; a column empty in both rows marks the end of the table.
        if next_prefix is not None:
            header_prefix = next_prefix
        elif header_suffix is None:
            break

        # Skip data belonging to other census years.
        if _is_other_census_year(header_prefix, year):
            continue

        # Compare by value: equal strings read from two different cells are
        # not guaranteed to be the same object.
        if header_suffix is None or header_prefix == header_suffix:
            header = [header_prefix]
        elif header_prefix is None:
            header = [header_suffix]
        else:
            header = [header_prefix, header_suffix]

        headers[i] = header

        if isinstance(header[0], str) and AREA_UNIT_MATCHER.match(header[0]):
            if area_unit_column is not None:
                raise Exception(
                    "Detected multiple conflicting area unit columns!")
            area_unit_column = i

    if area_unit_column is None:
        raise Exception("Failed to detect area unit column!")

    return headers, area_unit_column


def build_record(headers, row):
    """Turn one data row into a nested dict keyed by the header paths.

    Args:
        headers: mapping produced by :func:`parse_headers`.
        row: cells of a data row; cells missing from a short row read as None.

    Returns:
        A dict in which each header path leads to that column's cell value.

    Raises:
        Exception: if a header path needs to nest under a key that already
            holds a plain value.
    """
    record = {}
    for column, path in headers.items():
        node = record
        for key in path[:-1]:
            node = node.setdefault(key, {})
            if not isinstance(node, dict):
                raise Exception(
                    "Header %r conflicts with an existing value" % (path,))
        node[path[-1]] = _cell_value(row, column)
    return record


def area_unit_filename(area_unit):
    """Return the output file name for an area unit code cell value.

    Numeric codes are rendered as text (whole floats without the trailing
    ".0") so that they are never handed to ``open()`` as a number, which
    ``open()`` would interpret as a file descriptor.
    """
    if isinstance(area_unit, float) and area_unit.is_integer():
        area_unit = int(area_unit)
    return str(area_unit)


def main(argv=None):
    """Run the conversion; return the process exit status.

    Args:
        argv: argument vector including the program name; defaults to
            ``sys.argv``.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) != 2:
        print("Usage: census.py \n (where dataset can be found in input-datasets)")
        return 1
    name = argv[1]

    # Imported here so the usage message and the pure helpers above work
    # without the third-party dependency installed.
    from openpyxl import load_workbook

    wb = load_workbook(
        filename='input-datasets/%s.xlsx' % name,
        read_only=True
    )
    try:
        rows = iter(wb[SHEET_NAME].rows)

        try:
            # Skip the non-table rows at the top of the sheet.
            for _ in range(PREAMBLE_ROWS):
                next(rows)
            h1 = next(rows)
            h2 = next(rows)
        except StopIteration:
            raise Exception("Sheet ended before the header rows were reached!")

        headers, area_unit_column = parse_headers(h1, h2)

        output = 'processed-datasets/%s' % name
        os.makedirs(output, exist_ok=True)

        for row in rows:
            area_unit = _cell_value(row, area_unit_column)
            # Skip empty rows (should just be the final/total row at the end)
            if area_unit is None:
                continue
            path = os.path.join(output, area_unit_filename(area_unit))
            with open(path, 'w') as fh:
                json.dump(build_record(headers, row), fh)
    finally:
        # Read-only workbooks keep the file open until explicitly closed.
        wb.close()
    return 0


if __name__ == '__main__':
    sys.exit(main())