Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Detect when Parliament data hasn't arrived and warn #74

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion pyscraper/process_hansard.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,13 @@
import os
import datetime
import re
from os.path import join
import json
from os.path import join, exists
from miscfuncs import toppath

from new_hansard import ParseDay

recess_file = join(toppath, 'recessdates.json')

today = datetime.date.today()
yesterday = today - datetime.timedelta(1)
Expand All @@ -33,6 +35,22 @@
if m and os.path.isdir(fn) and ARGS.date_from <= m.group(2) <= ARGS.date_to:
dirs.append(fn)

if exists(recess_file):
with open(recess_file) as f:
recess_dates = json.load(f)
else:
recess_dates = {'commons': {'recesses':[]}}

# if it's Tuesday to Saturday, we are looking for yesterday's files and we didn't find any
# check to see if it was a recess otherwise complain about missing files
if 2 <= today.isoweekday() < 7 and len(dirs) == 0 and ARGS.date_from == yesterday.isoformat() and ARGS.date_to == today.isoformat():
is_recess = False
for date in recess_dates['commons']['recesses']:
if date['start'] < yesterday.isoformat() < date['end']:
is_recess = True
if not is_recess:
print "Yesterday (%s) was not a recess but we didn't fetch any files for Parliament" % yesterday.isoformat()

# process the directories in date order so we do any revisions in the correct
# order
dirs.sort(key=lambda x: re.match('.*/%s' % dir_match, x).group(1))
Expand Down
47 changes: 47 additions & 0 deletions pyscraper/scrape_recess_dates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/usr/bin/env python
# vim:sw=4:ts=4:et:nowrap

import urllib
import mx.DateTime
import json
from os.path import join
from bs4 import BeautifulSoup

from miscfuncs import toppath

recess_file = join(toppath, 'recessdates.json')

def get_recess_dates(url):
    """Fetch a parliament.uk FAQ page and scrape its recess-dates table.

    Returns a dict of the form
        {'last_update': <today's date string>,
         'recesses': [{'name': ..., 'start': ..., 'end': ...}, ...]}
    where start/end are the mx.DateTime `.date` string forms of the
    table's second and third columns.
    """
    response = urllib.urlopen(url)
    html = response.read()
    response.close()

    soup = BeautifulSoup(html, 'html.parser')

    # The dates table sits inside this ASP.NET-generated wrapper div.
    container = soup.find(id='ctl00_ctl00_FormContent_SiteSpecificPlaceholder_PageContent_ctlMainBody_wrapperDiv')

    today = mx.DateTime.today().date
    recesses = []
    for row in container.find_all('tr'):
        cells = row.find_all('td')
        # Only rows with exactly three cells (name, start, end) hold data;
        # header/spacer rows are skipped.
        if len(cells) != 3:
            continue
        recesses.append({
            'name': cells[0].text,
            'start': mx.DateTime.DateFrom(cells[1].text).date,
            'end': mx.DateTime.DateFrom(cells[2].text).date,
        })

    return {'last_update': today, 'recesses': recesses}

# Source pages on parliament.uk listing recess dates for each house.
urls = {
    'lords': 'http://www.parliament.uk/about/faqs/house-of-lords-faqs/lords-recess-dates/',
    'commons': "http://www.parliament.uk/about/faqs/house-of-commons-faqs/business-faq-page/recess-dates/"
}

# Scrape each house's page, then persist the combined result as JSON for
# process_hansard.py to consult.
data = dict((house, get_recess_dates(url)) for house, url in urls.items())

with open(recess_file, 'w') as f:
    json.dump(data, f, indent=4, sort_keys=True)