Added script for processing census data

Reads an XLSX spreadsheet and writes one JSON file per area unit, named after its area unit code.
Arcton · Jul 30, 2016 · 1a5fb98 · 1a5fb98
1 parent 3b04793
commit 1a5fb98
Showing 1 changed file with 97 additions and 0 deletions.
diff --git a/census.py b/census.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python
+
+from openpyxl import load_workbook
+import re
+import json
+import sys
+import os
+
+YEAR = '2013'
+if len(sys.argv) != 2:
+    print("Usage: census.py <dataset>\n (where dataset can be found in input-datasets)")
+    exit(1)
+NAME = sys.argv[1]
+
+wb = load_workbook(
+    filename='input-datasets/%s.xlsx' % NAME,
+    read_only=True
+)
+
+ws = wb["2 Area unit"]
+
+rows = iter(ws.rows)
+
+# skip crap at top of table
+for _ in range(0, 8):
+    next(rows)
+
+# parse headers
+headers = {}
+
+area_unit_column = None
+area_unit_matcher = re.compile(r'area unit code', re.I)
+
+census_year_matcher = re.compile(r'^[\d]+ Census', re.I)
+
+h1 = next(rows)
+h2 = next(rows)
+header_prefix = h1[0].value
+for i in range(0, len(h2)):
+    # read the headers for this column
+    header_suffix = h2[i].value
+    next_prefix = h1[i].value
+
+    # handle empty (merged) cells
+    if next_prefix is not None:
+        header_prefix = next_prefix
+    elif header_suffix is None:
+        break
+
+    # skip old data
+    if census_year_matcher.match(header_prefix) and not header_prefix.startswith(YEAR):
+        continue
+
+    # headers generally have categories, we need those
+    if header_suffix is None:
+        header = [ header_prefix ]
+    elif header_prefix is not header_suffix:
+        header = [ header_prefix, header_suffix ]
+    else:
+        header = [ header_suffix ]
+
+    headers[i] = header
+
+    if area_unit_matcher.match(header[0]):
+        if area_unit_column is not None:
+            raise Exception("Detected multiple conflicting area unit columns!")
+        area_unit_column = i
+
+if area_unit_column is None:
+    raise Exception("Failed to detect area unit column!")
+
+output = 'processed-datasets/%s' % NAME
+os.makedirs(output, exist_ok=True)
+os.chdir(output)
+for row in rows:
+    area_unit = row[area_unit_column].value
+    # Skip empty rows (should just be the final/total row at the end)
+    if area_unit is None:
+        continue
+    obj = {}
+    # Create the data structure from the row
+    for column in headers.keys():
+        pos = obj
+        last_key = None
+        for key in headers[column]:
+            if last_key is not None:
+                pos = pos[last_key]
+            if key not in pos:
+                pos[key] = {}
+            last_key = key
+        pos[last_key] = row[column].value
+    # Write the data to a file
+    fh = open(area_unit, 'w')
+    try:
+        json.dump(obj, fh)
+    finally:
+        fh.close()