Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Detect when Parliament data hasn't arrived and warn #74

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion pyscraper/process_hansard.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,13 @@
import os
import datetime
import re
from os.path import join
import json
from os.path import join, exists
from miscfuncs import toppath

from new_hansard import ParseDay

recess_file = join(toppath, 'recessdates.json')

today = datetime.date.today()
yesterday = today - datetime.timedelta(1)
Expand All @@ -33,6 +35,22 @@
if m and os.path.isdir(fn) and ARGS.date_from <= m.group(2) <= ARGS.date_to:
dirs.append(fn)

if exists(recess_file):
with open(recess_file) as f:
recess_dates = json.load(f)
else:
recess_dates = {'commons': {'recesses':[]}}

# if it's Tuesday to Saturday, we are looking for yesterday's files and we didn't find any
# check to see if it was a recess otherwise complain about missing files
if 2 <= today.isoweekday() < 7 and len(dirs) == 0 and ARGS.date_from == yesterday.isoformat() and ARGS.date_to == today.isoformat():
is_recess = False
for date in recess_dates['commons']['recesses']:
if date['start'] < yesterday.isoformat() < date['end']:
is_recess = True
if not is_recess:
print "Yesterday (%s) was not a recess but we didn't fetch any files for Parliament" % yesterday.isoformat()

# process the directories in date order so we do any revisions in the correct
# order
dirs.sort(key=lambda x: re.match('.*/%s' % dir_match, x).group(1))
Expand Down
47 changes: 47 additions & 0 deletions pyscraper/scrape_recess_dates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/usr/bin/env python
# vim:sw=4:ts=4:et:nowrap

import urllib
import mx.DateTime
import json
from os.path import join
from bs4 import BeautifulSoup

from miscfuncs import toppath

recess_file = join(toppath, 'recessdates.json')

def get_recess_dates(url):
    """Fetch a parliament.uk FAQ page and scrape its recess-dates table.

    Returns a dict of the form
        {'last_update': <today's date string>,
         'recesses': [{'name': ..., 'start': ..., 'end': ...}, ...]}
    where start/end are the mx.DateTime `.date` string forms of the
    table's second and third columns.
    """
    response = urllib.urlopen(url)
    html = response.read()
    response.close()

    soup = BeautifulSoup(html, 'html.parser')

    # The dates table sits inside this ASP.NET-generated wrapper div.
    container = soup.find(id='ctl00_ctl00_FormContent_SiteSpecificPlaceholder_PageContent_ctlMainBody_wrapperDiv')

    today = mx.DateTime.today().date
    recesses = []
    for row in container.find_all('tr'):
        cells = row.find_all('td')
        # Only rows with exactly three cells (name, start, end) hold data;
        # header/spacer rows are skipped.
        if len(cells) != 3:
            continue
        recesses.append({
            'name': cells[0].text,
            'start': mx.DateTime.DateFrom(cells[1].text).date,
            'end': mx.DateTime.DateFrom(cells[2].text).date,
        })

    return {'last_update': today, 'recesses': recesses}

# Source pages on parliament.uk listing recess dates for each house.
urls = {
    'lords': 'http://www.parliament.uk/about/faqs/house-of-lords-faqs/lords-recess-dates/',
    'commons': "http://www.parliament.uk/about/faqs/house-of-commons-faqs/business-faq-page/recess-dates/"
}

# Scrape each house's page, then persist the combined result as JSON for
# process_hansard.py to consult.
data = dict((house, get_recess_dates(url)) for house, url in urls.items())

with open(recess_file, 'w') as f:
    json.dump(data, f, indent=4, sort_keys=True)