-
Notifications
You must be signed in to change notification settings - Fork 22
/
Copy pathget-historic-person-urls
108 lines (94 loc) · 3.74 KB
/
get-historic-person-urls
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#!/usr/bin/env python
import json
import re
import string
import requests
import requests_cache
from bs4 import BeautifulSoup
# Install a process-wide cache for `requests` calls. Responses with status
# 200 and 404 are both allowed into the cache -- presumably so the
# 'Page not found' probes made further down are not refetched on re-runs.
requests_cache.install_cache(allowable_codes=(200, 404))
def parse_date(d, typ):
    """Reduce a date string to its year.

    Args:
        d: Date text. May be empty/None, '?', a plain integer string, or
            free text that contains a four-digit year.
        typ: Which end of a range `d` describes; callers in this file pass
            'start' or 'end'. It is only consulted for one hard-coded
            'end' override (see below).

    Returns:
        '' when `d` is empty or '?', otherwise the year as an int.

    Raises:
        ValueError: if `d` is not an integer and contains no four-digit
            run of digits.
    """
    if not d or d == '?':
        return ''
    try:
        return int(d)
    except ValueError:
        # Not a bare number; fall through to pull the year out of the text.
        pass
    if typ == 'end' and d == 'January 6, 1835':
        # Hard-coded correction: this particular end date is treated as
        # falling in the previous year.
        d = 'December 29, 1834'
    # Raw string so that \d is a regex escape, not an (invalid) string escape.
    m = re.search(r'\d{4}', d)
    if m is None:
        raise ValueError('no four-digit year found in %r' % (d,))
    return int(m.group(0))
# Path of the people/memberships JSON file. It is read here and written back
# to the same path at the end of the script.
JSON = 'members/people.json'
# Load via a context manager so the file handle is closed as soon as the
# document has been parsed, instead of being left open.
with open(JSON) as json_file:
    data = json.load(json_file)
# Index of people keyed by "Family, Given (Honorific)", built from each
# person's 'Main' entry in other_names. Several people can share one key,
# so every key maps to a list of person records.
people = {}
for entry in data['persons']:
    for alt_name in entry.get('other_names', []):
        # Only 'Main' names that carry a family name can form a lookup key.
        if alt_name['note'] == 'Main' and 'family_name' in alt_name:
            key_parts = (
                alt_name['family_name'],
                alt_name['given_name'],
                alt_name.get('honorific_prefix', ''),
            )
            lookup = '%s, %s (%s)' % key_parts
            people.setdefault(lookup, []).append(entry)
# For every person id, the earliest start year ('min') and latest end year
# ('max') over all of their memberships. A membership without an end_date
# counts as ending in 9999.
presence = {}
for membership in data['memberships']:
    if 'person_id' not in membership:
        continue
    span = presence.setdefault(membership['person_id'], {})
    start_year = int(membership['start_date'][:4])
    span['min'] = min(start_year, span.get('min', 9999))
    end_year = int(membership.get('end_date', '9999-12-31')[:4])
    span['max'] = max(end_year, span.get('max', 0))
# Walk the Historic Hansard A-Z people index, work out each MP's first/last
# year from their constituency list, and attach the Historic Hansard URL slug
# to the matching person record in `data` (matched on name plus year range).
# Any entry that cannot be matched unambiguously aborts the script.

# Compiled once, outside the loops. CONSTITUENCY_RE captures the start and
# end date text of one constituency line; NAME_RE is the expected
# "Family, Given (Honorific)" shape of a link's text (raw string so the
# backslashes are regex escapes).
CONSTITUENCY_RE = re.compile("<li class='constituency'><a.*?</a> (.*?) - ?(.*)</li>")
NAME_RE = re.compile(r'(.+), (.*) \((.*)\)$')

# person id -> URL slug already assigned, used to detect double matches.
result = {}
for letter in string.ascii_lowercase:
    if letter == 'x':
        # Skipped unconditionally; presumably there is no index page for
        # this letter -- TODO confirm.
        continue
    r = requests.get('https://api.parliament.uk/historic-hansard/people/%s' % letter)
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, "html.parser")
    for li in soup.find('ol').find_all('li'):
        a = li.find('a')
        url = a['href']
        # Overwrite the current terminal line with the slug being processed.
        print('\r\x1b[K' + url, end=' ')
        html = requests.get('https://api.parliament.uk/historic-hansard/people/%s/index.html' % url).text
        if html == 'Page not found':
            # Fall back to the URL without the index.html suffix.
            html = requests.get('https://api.parliament.uk/historic-hansard/people/%s' % url).text
        if '<h2>Constituencies</h2>' not in html:
            continue  # Not an MP

        # fr: year of the first constituency line with a usable start date.
        # to: the largest end year seen across all constituency lines.
        # NOTE(review): parse_date returns '' for empty or '?' dates; if such
        # a value reaches the int comparisons below, Python 3 raises
        # TypeError -- confirm the site never yields that for MPs.
        fr = to = ''
        for start_text, end_text in CONSTITUENCY_RE.findall(html):
            if not fr:
                fr = parse_date(start_text, 'start')
            t = parse_date(end_text, 'end')
            if not to or t > to:
                to = t
        if fr > 1935 or to < 1918:
            continue  # Do not care

        name = a.string
        # The lookup key format is "Family, Given (Honorific)"; fail loudly
        # if the link text does not have that shape.
        if NAME_RE.match(name) is None:
            raise Exception('Unexpected name format for %s: %r' % (url, name))
        if name == 'Power, Patrick (Mr)':
            to = 1913  # Historic Hansard site wrong for this (and many other Irish entries)
        if name == 'Millar, James (Sir)':
            fr = 1911  # This actually is 1910, but TWFY appears to be missing some 1910-only entries

        # A fallback lookup of '<family>, <given> ()' would only be needed
        # for modern MPs lacking honorifics, so it is not attempted here.
        candidates = people.get(name, [])
        matches = len(candidates)
        if not matches:
            print('\n', matches, url)
            raise Exception('No person named %r for %s' % (name, url))

        found = False
        for candidate in candidates:
            pid = candidate['id']
            if fr == presence[pid]['min'] and to == presence[pid]['max']:
                if pid in result:
                    raise Exception('%s already matched to %s, now also %s' % (pid, result[pid], url))
                candidate['identifiers'].append({'identifier': url, 'scheme': 'historichansard_url'})
                result[pid] = url
                # Remove the matched person so a later namesake cannot
                # claim them again. This rebinds people[name]; the list
                # being iterated is left untouched.
                people[name] = [mm for mm in people[name] if mm['id'] != pid]
                found = True
        if not found:
            # Show the last candidate's year range next to the scraped one.
            print('\n', url, presence[pid]['min'], presence[pid]['max'], fr, to)
            raise Exception('No candidate for %s (%r) has year range %s-%s' % (url, name, fr, to))
# Write the augmented data back over the input file. The context manager
# guarantees the output is flushed and the handle closed, rather than relying
# on garbage collection of an anonymous file object.
with open(JSON, 'w') as out_file:
    json.dump(data, out_file, indent=2, sort_keys=True)