From 681f2288758888bd334c662967ef7e243747fba8 Mon Sep 17 00:00:00 2001 From: Struan Donald Date: Mon, 6 Mar 2017 17:28:08 +0000 Subject: [PATCH 01/17] handle multiple para tags in a debate question Fix for the parser failing to pick up all the text if there is more than one hs_Para element inside a Question tag --- pyscraper/new_hansard.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pyscraper/new_hansard.py b/pyscraper/new_hansard.py index 056d0bf08..a101e4fb5 100755 --- a/pyscraper/new_hansard.py +++ b/pyscraper/new_hansard.py @@ -617,6 +617,15 @@ def parse_question(self, question): p.text = re.sub('\n', ' ', text) tag.append(p) + + if len(para) > 1: + for p in para: + text = self.get_single_line_text_from_element(p) + if text != '': + p = etree.Element('p') + p.text = re.sub('\n', ' ', text) + tag.append(p) + self.root.append(tag) def parse_indent(self, tag): From 12eef35aeef26dcf49c9ffce4faecb311383a4f7 Mon Sep 17 00:00:00 2001 From: Struan Donald Date: Wed, 15 Mar 2017 14:51:43 +0000 Subject: [PATCH 02/17] Keep track of all the tags we've processed Store the UID and HRSContentId of handled tags so we can later compare to a list of all IDs in the document --- pyscraper/new_hansard.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/pyscraper/new_hansard.py b/pyscraper/new_hansard.py index a101e4fb5..b7e30c11e 100755 --- a/pyscraper/new_hansard.py +++ b/pyscraper/new_hansard.py @@ -72,6 +72,7 @@ def match_by_pims(self, pims_id): class BaseParseDayXML(object): input_root = None resolver = PimsList() + seen_elements = set() type_to_xpath = { 'debate': ( @@ -210,6 +211,25 @@ def is_pre_new_parser(self): return is_pre + def mark_seen(self, tag): + if tag.get('UID'): + if tag.get('UID') == '17020192000008': + print('17020192000010') + self.seen_elements.add(tag.get('UID')) + + if tag.get('HRSContentId'): + self.seen_elements.add(tag.get('HRSContentId')) + + def mark_xpath_seen(self, tag, xpath): + inner = 
tag.xpath(xpath, namespaces=self.ns_map) + if len(inner) > 0: + self.mark_seen(inner[0]) + + def mark_xpath_all_seen(self, tag, xpath): + inner = tag.xpath(xpath, namespaces=self.ns_map) + for t in inner: + self.mark_seen(t) + def get_tag_name_no_ns(self, tag): # remove annoying namespace for brevities sake tag_name = str(tag.tag) @@ -959,6 +979,9 @@ def handle_tag(self, tag_name, tag): else: handled = False + if handled: + self.mark_seen(tag) + return handled def parse_day(self, xml_file, out): @@ -1345,6 +1368,9 @@ def handle_tag(self, tag_name, tag): else: handled = super(PBCParseDayXML, self).handle_tag(tag_name, tag) + if handled: + self.mark_seen(tag) + return handled def get_sitting(self, xml_file): @@ -1627,6 +1653,9 @@ def handle_tag(self, tag_name, tag): else: handled = super(LordsParseDayXML, self).handle_tag(tag_name, tag) + if handled: + self.mark_seen(tag) + return handled From 391caef20206f14e5ab79a4750a9ff30a3bc94e9 Mon Sep 17 00:00:00 2001 From: Struan Donald Date: Wed, 15 Mar 2017 14:58:53 +0000 Subject: [PATCH 03/17] throw an exception if it looks like we missed a tag Get a list of all tag IDs in the document and compare to the list we've processed and throw an exception if they don't match. --- pyscraper/new_hansard.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pyscraper/new_hansard.py b/pyscraper/new_hansard.py index b7e30c11e..52c031f9f 100755 --- a/pyscraper/new_hansard.py +++ b/pyscraper/new_hansard.py @@ -1029,6 +1029,16 @@ def parse_day(self, xml_file, out): # make sure we add any outstanding speech. 
self.clear_current_speech() + all_IDS = set( + self.input_root[0].xpath( + './/@UID|.//@HRSContentId' + ) + ) + diff = all_IDS.difference(self.seen_elements) + if len(diff) > 0: + raise Exception( + 'missed some elements', diff + ) return True def get_date(self, xml_file): From b7b303a78c91bace8bd6543e0332fc11f751b11b Mon Sep 17 00:00:00 2001 From: Struan Donald Date: Fri, 17 Mar 2017 09:56:23 +0000 Subject: [PATCH 04/17] mark as seen tags not always accessed via handle_tag Copes with tags that are mostly processed from inside another tag --- pyscraper/new_hansard.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pyscraper/new_hansard.py b/pyscraper/new_hansard.py index 52c031f9f..bbfa0571f 100755 --- a/pyscraper/new_hansard.py +++ b/pyscraper/new_hansard.py @@ -339,6 +339,7 @@ def parse_member(self, tag): member_tag = tag if member_tag is not None: + self.mark_seen(member_tag) if member_tag.get('PimsId') == '-1': return self.handle_minus_member(member_tag) if member_tag.get('PimsId') == '0': @@ -530,6 +531,7 @@ def parse_debated_motion(self, motion): ) text = '' if len(following) == 1: + self.mark_seen(following[0]) text = u' - '.join([ self.get_single_line_text_from_element(motion), self.get_single_line_text_from_element(following[0]) @@ -558,6 +560,7 @@ def parse_WHDebate(self, debate): namespaces=self.ns_map ) if len(chair) == 1: + self.mark_seen(chair[0]) chair_text = self.get_single_line_text_from_element(chair[0]) text = u'\n{0} — {1}\n'.format(text, chair_text) @@ -581,6 +584,7 @@ def parse_question(self, question): tag.set('id', self.get_speech_id()) member = question.xpath('.//ns:Member', namespaces=self.ns_map)[0] + self.mark_seen(member) member = self.parse_member(member) if member is not None: tag.set('person_id', member['person_id']) @@ -682,6 +686,7 @@ def parse_para_with_member(self, para, member, **kwargs): if member is not None: self.new_speech(member, para.get('url')) elif members: + self.mark_seen(members[0]) m_name = None 
bs = members[0].xpath('./ns:B', namespaces=self.ns_map) if len(bs) == 1: @@ -716,6 +721,7 @@ def parse_para_with_member(self, para, member, **kwargs): if 'pwmotiontext' in kwargs: tag.set('pwmotiontext', kwargs['pwmotiontext']) + self.mark_seen(para) self.current_speech.append(tag) # TODO: this needs to parse out the various things that filtersentence @@ -723,8 +729,10 @@ def parse_para_with_member(self, para, member, **kwargs): # it will need to be a port of that to create proper XML elements # using etree def parse_para(self, para): + self.mark_seen(para) member = None for tag in para: + self.mark_seen(tag) tag_name = self.get_tag_name_no_ns(tag) if tag_name == 'B' or tag_name == 'Member': member = self.parse_member(tag) @@ -736,6 +744,7 @@ def parse_brev(self, brev): def parse_votelist(self, votes, direction, vote_list, is_teller=False): for vote in votes: + self.mark_seen(vote) tag = etree.Element('mpname') member = self.parse_member(vote) tag.set('person_id', member['person_id']) @@ -877,6 +886,7 @@ def parse_division(self, division): self.parse_para(para) def parse_time(self, tag): + self.mark_seen(tag) time_txt = u''.join(tag.xpath('.//text()')) if time_txt == '': return @@ -902,6 +912,7 @@ def parse_time(self, tag): def parse_procedure(self, procedure): tag = etree.Element('p') text = self.get_single_line_text_from_element(procedure) + self.mark_seen(procedure) if len(text) == 0: return @@ -1323,9 +1334,11 @@ def parse_brev(self, brev): self.parse_para_with_member(brev, None, css_class="indent") def parse_para(self, para): + self.mark_seen(para) has_i = False has_witness = False for tag in para.iter(): + self.mark_seen(tag) tag_name = self.get_tag_name_no_ns(tag) if tag_name == 'Witness': has_witness = True @@ -1472,6 +1485,7 @@ def parse_member(self, member): # In cases where there are unattributes exclamations then PimsId # is set to 0. 
Often the name will be "Noble Lords" or the like if member.get('PimsId') == 0: + self.mark_seen(member) found_member = { 'person_id': 'unknown', 'name': u''.join(member.xpath('.//text()')) @@ -1480,6 +1494,7 @@ def parse_member(self, member): return found_member def parse_newdebate(self, tag): + self.mark_seen(tag) time = tag.xpath('.//ns:hs_time', namespaces=self.ns_map) if len(time): self.parse_time(time[0]) @@ -1549,6 +1564,7 @@ def parse_clause_heading(self, heading): if self.current_speech is None: self.new_speech(None, heading.get('url')) self.current_speech.append(tag) + self.mark_seen(heading) def parse_division(self, division): ayes_count = \ @@ -1618,6 +1634,7 @@ def parse_division(self, division): def parse_votelist(self, votes, direction, vote_list): for vote in votes: + self.mark_seen(vote) tag = etree.Element('lord') member_name = self.get_single_line_text_from_element(vote) is_teller = False From 4b90d6462ac6a61232695afdb0684f1d2da80460 Mon Sep 17 00:00:00 2001 From: Struan Donald Date: Fri, 17 Mar 2017 09:58:00 +0000 Subject: [PATCH 05/17] parse english and welsh only division counts Fixes #63 --- pyscraper/new_hansard.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pyscraper/new_hansard.py b/pyscraper/new_hansard.py index bbfa0571f..229405064 100755 --- a/pyscraper/new_hansard.py +++ b/pyscraper/new_hansard.py @@ -885,6 +885,21 @@ def parse_division(self, division): continue self.parse_para(para) + # FIXME - we should actually store the numbers + england_tags = division.xpath('./ns:EnglandWales/ns:hs_Para/* | ./ns:England/ns:hs_Para/*', namespaces=self.ns_map) + if len(england_tags): + self.mark_xpath_all_seen(division, './ns:EnglandWales | ./ns:England') + self.mark_xpath_all_seen(division, './ns:EnglandWales/ns:hs_Para | ./ns:England/ns:hs_Para') + details = etree.Element('p') + text = '' + for england_tag in england_tags: + self.mark_seen(england_tag) + content = tag.text + if content: + text += content + details.text = 
text + self.current_speech.append(details) + def parse_time(self, tag): self.mark_seen(tag) time_txt = u''.join(tag.xpath('.//text()')) From e7cc27ec7357dbf96d99351693e6cb89181fb004 Mon Sep 17 00:00:00 2001 From: Struan Donald Date: Fri, 17 Mar 2017 09:59:27 +0000 Subject: [PATCH 06/17] mark all the paragraph and time tags in a division as seen --- pyscraper/new_hansard.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pyscraper/new_hansard.py b/pyscraper/new_hansard.py index 229405064..c1b56c0ca 100755 --- a/pyscraper/new_hansard.py +++ b/pyscraper/new_hansard.py @@ -882,6 +882,14 @@ def parse_division(self, division): if re.search(r'House\s*divided', text) or \ re.search(r'Committee\s*divided', text) or \ re.search(r'Division\s*No', text): + self.mark_seen(para) + for tag in para: + tag_name = self.get_tag_name_no_ns(tag) + if tag_name == 'Right': + times = tag.xpath('.//ns:Time', namespaces=self.ns_map) + if len(times) > 0: + self.parse_time(times[0]) + self.mark_seen(tag) continue self.parse_para(para) From 519f3c374bc916e39484345c8e46890f6abac058 Mon Sep 17 00:00:00 2001 From: Struan Donald Date: Fri, 17 Mar 2017 10:14:11 +0000 Subject: [PATCH 07/17] mark various bits of structure as seen There's lots of tags that we don't directly parse as we're interested in sub tags or they are parsed as part of the parent. Mark these as seen. 
--- pyscraper/new_hansard.py | 55 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/pyscraper/new_hansard.py b/pyscraper/new_hansard.py index c1b56c0ca..0dd1866fd 100755 --- a/pyscraper/new_hansard.py +++ b/pyscraper/new_hansard.py @@ -426,6 +426,12 @@ def parse_oral_heading(self, heading): def parse_major(self, heading, **kwargs): text = self.get_text_from_element(heading) + + # housekeeping for making sure we see all the elements + departments = heading.xpath('.//ns:DepartmentName', namespaces=self.ns_map) + for department in departments: + self.mark_seen(department) + if text.strip() == 'Prayers': return self.clear_current_speech() @@ -594,6 +600,7 @@ def parse_question(self, question): question.xpath('.//ns:Number/text()', namespaces=self.ns_map) ) if number != '': + self.mark_xpath_seen(question, './/ns:Number') tag.set('oral-qnum', number) tag.set('colnum', self.current_col) @@ -793,6 +800,8 @@ def get_division_tag(self, division, yes_text, no_text): div_number = \ division.xpath('.//ns:Number/text()', namespaces=self.ns_map) + self.mark_xpath_seen(division, './/ns:Number') + tag.set('divnumber', u''.join(div_number)) tag.set('colnum', self.current_col) tag.set('time', self.current_time) @@ -822,6 +831,9 @@ def parse_division(self, division): noes_count = \ division.xpath('./ns:hs_Para/ns:NoesNumber/text()', namespaces=self.ns_map) + self.mark_xpath_seen(division, './ns:hs_Para/ns:NoesNumber') + self.mark_xpath_seen(division, './ns:hs_Para/ns:AyesNumber') + ayes_count_text = u''.join(ayes_count) noes_count_text = u''.join(noes_count) @@ -862,6 +874,22 @@ def parse_division(self, division): './/ns:TellerNamesNoes//ns:Member', namespaces=self.ns_map ) + # count the various bits of structure as seen for later + structure = division.xpath( + './/*[ns:NamesAyes or ns:NamesNoes or ns:TellerNamesNoes or ns:TellerNamesAyes]//ns:hs_Para', + namespaces=self.ns_map + ) + columns = division.xpath('.//ns:TwoColumn', 
namespaces=self.ns_map) + structure.extend(columns) + wrappers = division.xpath( + './/ns:NamesAyes | .//ns:NamesNoes | .//ns:TellerNamesNoes | .//ns:TellerNamesAyes', + namespaces=self.ns_map + ) + structure.extend(wrappers) + + for s in structure: + self.mark_seen(s) + aye_list = etree.Element('mplist') aye_list.set('vote', 'aye') aye_list = self.parse_votelist(ayes, 'aye', aye_list) @@ -1325,6 +1353,8 @@ def get_division_tag(self, division, yes_text, no_text): div_number = \ division.xpath('.//ns:Number/text()', namespaces=self.ns_map) + self.mark_xpath_seen(division, './/ns:Number') + tag.set('id', self.get_speech_id()) tag.set('divnumber', u''.join(div_number)) tag.set('ayes', yes_text) @@ -1524,6 +1554,18 @@ def parse_newdebate(self, tag): heading = tag.xpath('.//ns:hs_DebateHeading', namespaces=self.ns_map) debate_type = tag.xpath('.//ns:hs_DebateType', namespaces=self.ns_map) + self.mark_xpath_seen(tag, './/ns:hs_DebateHeading') + # TODO: sometimes there is a link to the bill + self.mark_xpath_seen(tag, './/ns:hs_DebateHeading/ns:a') + self.mark_xpath_seen(tag, './/ns:hs_DebateType') + + # This seems to happen occasionally and it's just filler to throw away + amendment = tag.xpath('.//ns:hs_AmendmentHeading', namespaces=self.ns_map) + if len(amendment) > 0: + text = self.get_single_line_text_from_element(amendment[0]) + if text == 'Motion': + self.mark_seen(amendment[0]) + if len(heading): if len(debate_type): text = self.get_single_line_text_from_element(debate_type[0]) @@ -1545,8 +1587,10 @@ def parse_newdebate(self, tag): if len(member_tags): if want_member: member = self.parse_member(member_tags[0]) + self.mark_xpath_seen(tag, './/ns:hs_TabledBy') else: tabledby_tags = tag.xpath('.//ns:hs_TabledBy', namespaces=self.ns_map) + self.mark_seen(tabledby_tags[0]) self.parse_para_with_member(tabledby_tags[0], None, css_class='italic', strip_member=False) questions = tag.xpath('.//ns:hs_Question', namespaces=self.ns_map) @@ -1595,6 +1639,9 @@ def 
parse_division(self, division): noes_count = \ division.xpath('.//ns:NotContentsNumber/text()', namespaces=self.ns_map) + self.mark_xpath_seen(division, './/ns:ContentsNumber') + self.mark_xpath_seen(division, './/ns:NotContentsNumber') + ayes_count_text = u''.join(ayes_count) noes_count_text = u''.join(noes_count) @@ -1618,6 +1665,9 @@ def parse_division(self, division): div_number = \ division.xpath('.//ns:DivisionNumber/text()', namespaces=self.ns_map) + self.mark_xpath_seen(division, './/ns:DivisionNumber') + self.mark_xpath_seen(division, './/ns:hs_DivNo') + tag.set('divnumber', u''.join(div_number)) tag.set('colnum', self.current_col) tag.set('time', self.current_time) @@ -1634,6 +1684,11 @@ def parse_division(self, division): noes = division.xpath( './/ns:NamesNotContents//ns:hs_DivListNames', namespaces=self.ns_map ) + self.mark_xpath_all_seen(division, './/ns:hs_DivListHead') + self.mark_xpath_seen(division, './/ns:NamesNotContents') + self.mark_xpath_seen(division, './/ns:NamesContents') + self.mark_xpath_seen(division, './/ns:NamesNotContents//ns:hs_DivListNames') + self.mark_xpath_seen(division, './/ns:NamesContents//ns:hs_DivListNames') aye_list = etree.Element('lordlist') aye_list.set('vote', 'content') From 810f5b1379d1c589da22ff53b2d286007fb033e2 Mon Sep 17 00:00:00 2001 From: Struan Donald Date: Wed, 15 Mar 2017 16:32:38 +0000 Subject: [PATCH 08/17] correctly parse tables We didn't use namespaces before so they weren't being parsed properly. Correct this and track the tags. 
--- pyscraper/gidmatching.py | 2 +- pyscraper/new_hansard.py | 15 ++++++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/pyscraper/gidmatching.py b/pyscraper/gidmatching.py index 4f44ce70e..de03bedd2 100644 --- a/pyscraper/gidmatching.py +++ b/pyscraper/gidmatching.py @@ -65,7 +65,7 @@ def PrepareXMLForDiff(scrapeversion): if m: para = m.group(1) else: - assert re.match("\s*]*>|&\w+;|[^<>\s]+", para)) diff --git a/pyscraper/new_hansard.py b/pyscraper/new_hansard.py index 0dd1866fd..a7409ad25 100755 --- a/pyscraper/new_hansard.py +++ b/pyscraper/new_hansard.py @@ -766,18 +766,23 @@ def parse_votelist(self, votes, direction, vote_list, is_teller=False): return vote_list def parse_table(self, wrapper): - rows = wrapper.xpath('.//row') + rows = wrapper.xpath('.//ns:row', namespaces=self.ns_map) tag = etree.Element('table') body = etree.Element('tbody') url = None for row in rows: + self.mark_seen(row) row_tag = etree.Element('tr') row_tag.set('pid', self.get_pid()) - for entry in row.xpath('(.//hs_brev|.//hs_Para)'): - if url is None: - url = entry.get('url') - row_tag.append(list(entry)) + for entry in row.xpath('.//ns:entry', namespaces=self.ns_map): + cell_tag = etree.Element('td') + cell_tag.text = self.get_single_line_text_from_element(entry) + row_tag.append(cell_tag) + for para in entry.xpath('.//ns:hs_Para | .//ns:hs_para | .//ns:hs_brev', namespaces=self.ns_map): + if url is None: + url = para.get('url') + self.mark_seen(para) body.append(row_tag) From c8fe25f02677998c7a2b5072619aff2505fd5783 Mon Sep 17 00:00:00 2001 From: Struan Donald Date: Wed, 15 Mar 2017 16:38:42 +0000 Subject: [PATCH 09/17] improve question parsing and track the tags Make sure we are coping with questions where part of the question isn't in the tail of QuestionText but is in following tags. Also cope with oddities like multiple question number tags. 
--- pyscraper/new_hansard.py | 44 +++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/pyscraper/new_hansard.py b/pyscraper/new_hansard.py index a7409ad25..b5b8d7fe0 100755 --- a/pyscraper/new_hansard.py +++ b/pyscraper/new_hansard.py @@ -606,23 +606,38 @@ def parse_question(self, question): tag.set('colnum', self.current_col) tag.set('time', self.current_time) - para = question.xpath('.//ns:hs_Para', namespaces=self.ns_map) - tag.set('url', self.get_speech_url(para[0].get('url'))) + first_para = question.xpath('.//ns:hs_Para', namespaces=self.ns_map)[0] + tag.set('url', self.get_speech_url(first_para.get('url'))) + self.mark_seen(first_para) p = etree.Element('p') p.set('pid', self.get_pid()) uin = question.xpath('.//ns:Uin', namespaces=self.ns_map) - if len(uin) == 1: + if len(uin) > 0: + self.mark_seen(uin[0]) uin_text = u''.join(uin[0].xpath('.//text()')) m = re.match('\[\s*(\d+)\s*\]', uin_text) if m is not None: no = m.groups(1)[0] p.set('qnum', no) - text = question.xpath( + # sometimes there are mutiple question numbers and it seems + # to be accidental so this is just checking that and also + # marking the additional ones as processed. 
+ if len(uin) > 0 and len(uin) != 1: + uin_text = u''.join(uin[0].xpath('.//text()')) + for u in uin: + text = u''.join(u.xpath('.//text()')) + if text and text != uin_text: + raise Exception('Multiple numbers for a question') + self.mark_seen(u) + + text = first_para.xpath( './/ns:QuestionText/text()', namespaces=self.ns_map ) text = u''.join(text) + self.mark_xpath_seen(first_para, './/ns:QuestionText') + """ sometimes the question text is after the tag rather than inside it in which case we want to grab all the @@ -639,7 +654,7 @@ def parse_question(self, question): """ if text == '': - q_text = question.xpath( + q_text = first_para.xpath( './/ns:QuestionText/following-sibling::text()', namespaces=self.ns_map ) @@ -648,17 +663,18 @@ def parse_question(self, question): p.text = re.sub('\n', ' ', text) tag.append(p) - - if len(para) > 1: - for p in para: - text = self.get_single_line_text_from_element(p) - if text != '': - p = etree.Element('p') - p.text = re.sub('\n', ' ', text) - tag.append(p) - self.root.append(tag) + # and sometimes there is more question text in following siblings + # so we need to handle those too + following_tags = first_para.xpath( + './following-sibling::*', + namespaces=self.ns_map + ) + for t in following_tags: + tag_name = self.get_tag_name_no_ns(t) + self.handle_tag(tag_name, t) + def parse_indent(self, tag): self.parse_para_with_member(tag, None, css_class='indent') From f0425a8a3fb59a84cb6c5a3511fdf976d1ef9874 Mon Sep 17 00:00:00 2001 From: Struan Donald Date: Wed, 15 Mar 2017 16:41:00 +0000 Subject: [PATCH 10/17] handle clause tags in the following heading Clause tags actually relate to the text after them so ignore them at the top level and then go back and parse them as part of the following heading tag. Then add them as the first part of the first speech under the heading. 
Fixes #53 --- pyscraper/new_hansard.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/pyscraper/new_hansard.py b/pyscraper/new_hansard.py index b5b8d7fe0..5b2adb6e2 100755 --- a/pyscraper/new_hansard.py +++ b/pyscraper/new_hansard.py @@ -141,7 +141,6 @@ class BaseParseDayXML(object): 'hs_AmendmentLevel2', 'hs_AmendmentLevel3', 'hs_AmendmentLevel4', - 'hs_8Clause', 'hs_newline10', 'hs_Question', 'hs_6CntrCapsHdg', @@ -160,6 +159,7 @@ class BaseParseDayXML(object): 'hs_TimeCode', 'hs_6bPetitions', 'hs_3MainHdg', + 'hs_8Clause', 'hs_Venue' ] root = None @@ -484,6 +484,23 @@ def parse_minor(self, heading): self.root.append(tag) self.output_heading = True + # if there is a clause immediately before then assume it's the clause + # we are about to debate and put it in the first speech + previous = heading.xpath( + './preceding-sibling::*', + namespaces=self.ns_map + ) + if len(previous): + clause = previous[-1] + if self.get_tag_name_no_ns(clause) == 'hs_8Clause': + self.mark_seen(clause) + text = self.get_single_line_text_from_element(clause) + if self.current_speech is None: + self.new_speech(None, clause.get('url')) + clause_tag = etree.Element('p') + clause_tag.text = text + self.current_speech.append(clause_tag) + def parse_generic(self, heading): if self.next_speech_num == 0: self.parse_major(heading) @@ -1189,6 +1206,7 @@ class PBCParseDayXML(BaseParseDayXML): ignored_tags = [ 'hs_CLHeading', 'hs_CLAttended', + 'hs_8Clause', 'hs_6fCntrItalHdg', ] From cf011ad61481267ee4c4e47c97a017318a4c9b92 Mon Sep 17 00:00:00 2001 From: Struan Donald Date: Wed, 15 Mar 2017 16:58:34 +0000 Subject: [PATCH 11/17] cope with multiple debate heading and procedure tags in new debate If there's more than one heading or procedure in a new debate tag then make those into paragraphs in the first speech of the debate. 
--- pyscraper/new_hansard.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/pyscraper/new_hansard.py b/pyscraper/new_hansard.py index 5b2adb6e2..ac56dda36 100755 --- a/pyscraper/new_hansard.py +++ b/pyscraper/new_hansard.py @@ -1615,10 +1615,6 @@ def parse_newdebate(self, tag): sys.stderr.write('newdebate with no heading', namespaces=self.ns_map) return - #procedure = tag.xpath('.//ns:hs_Procedure', namespaces=self.ns_map) - #if len(procedure) == 1: - # self.handle_para(procedure[0]) - want_member = tag.get('BusinessType') in ('Question', 'GeneralDebate') member = None @@ -1636,6 +1632,16 @@ def parse_newdebate(self, tag): for question in questions: self.parse_para_with_member(question, member if want_member else None) + # put in the rest of the headings as paragraphs at the start + heading = tag.xpath('.//ns:hs_DebateHeading | .//ns:hs_Procedure', namespaces=self.ns_map) + if len(heading) > 1: + for h in heading[1:]: + self.handle_tag('hs_para', h) + + paras = tag.xpath('.//ns:hs_para', namespaces=self.ns_map) + for para in paras: + self.handle_tag('hs_para', para) + def parse_amendment_heading(self, heading): self.new_speech(None, heading.get('url')) self.parse_para_with_member(heading, None) From 17b220b126c1139555074396b195e7413c686260 Mon Sep 17 00:00:00 2001 From: Struan Donald Date: Wed, 15 Mar 2017 17:02:09 +0000 Subject: [PATCH 12/17] parse time tags inside a division tag --- pyscraper/new_hansard.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyscraper/new_hansard.py b/pyscraper/new_hansard.py index ac56dda36..a2ad41561 100755 --- a/pyscraper/new_hansard.py +++ b/pyscraper/new_hansard.py @@ -1679,6 +1679,10 @@ def parse_clause_heading(self, heading): self.mark_seen(heading) def parse_division(self, division): + time = division.xpath('.//ns:hs_time', namespaces=self.ns_map) + if len(time): + self.parse_time(time[0]) + ayes_count = \ division.xpath('.//ns:ContentsNumber/text()', namespaces=self.ns_map) noes_count 
= \ From fb88a0692dc99a32129c5666194309d46200caa3 Mon Sep 17 00:00:00 2001 From: Struan Donald Date: Wed, 15 Mar 2017 17:03:34 +0000 Subject: [PATCH 13/17] gather up all the text inside a division tag --- pyscraper/new_hansard.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyscraper/new_hansard.py b/pyscraper/new_hansard.py index a2ad41561..2e1514cc6 100755 --- a/pyscraper/new_hansard.py +++ b/pyscraper/new_hansard.py @@ -1751,13 +1751,13 @@ def parse_division(self, division): self.root.append(tag) - paras = division.xpath('./ns:hs_Procedure', namespaces=self.ns_map) + paras = division.xpath('./ns:hs_Procedure | ./ns:hs_para', namespaces=self.ns_map) for para in paras: text = u''.join(para.xpath('.//text()')) - if re.search(r'Contents', text) or \ - re.search(r'Division\s*on', text): + if re.search(r'Contents', text): + self.mark_seen(para) continue - self.parse_para(para) + self.parse_para_with_member(para, None) def parse_votelist(self, votes, direction, vote_list): for vote in votes: From 08af3bc0f6c839632054f5bb255b218dad219282 Mon Sep 17 00:00:00 2001 From: Struan Donald Date: Wed, 15 Mar 2017 17:05:37 +0000 Subject: [PATCH 14/17] hs_2cDebatedMotion is actually a major heading --- pyscraper/new_hansard.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyscraper/new_hansard.py b/pyscraper/new_hansard.py index 2e1514cc6..6b90c0aa1 100755 --- a/pyscraper/new_hansard.py +++ b/pyscraper/new_hansard.py @@ -112,6 +112,7 @@ class BaseParseDayXML(object): 'hs_6bBillsPresented', # FIXME should grab text of following tag 'hs_6fCntrItalHdg', 'hs_2cSO24Application', + 'hs_2cDebatedMotion', ] chair_headings = [ 'hs_76fChair', @@ -126,7 +127,6 @@ class BaseParseDayXML(object): 'hs_6bcBigBoldHdg', ] generic_headings = [ - 'hs_2cDebatedMotion', 'hs_2cGenericHdg', 'hs_2GenericHdg', ] From dcb33e56b386bf400ebcd2e686c6ffc1cae8e047 Mon Sep 17 00:00:00 2001 From: Struan Donald Date: Wed, 15 Mar 2017 17:37:58 +0000 Subject: [PATCH 
15/17] better parsing for Lords Amendments Rather than just parsing it all into a single line of text, parse all the paragraphs and indents so that we try and retain a bit more structure. --- pyscraper/new_hansard.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/pyscraper/new_hansard.py b/pyscraper/new_hansard.py index 6b90c0aa1..ee818bbc4 100755 --- a/pyscraper/new_hansard.py +++ b/pyscraper/new_hansard.py @@ -1656,12 +1656,17 @@ def parse_tabledby(self, tabledby): ) def parse_amendment(self, amendment): - self.parse_para_with_member( - amendment, - None, - css_class='italic', - pwmotiontext='unrecognized' - ) + # Amendments are often things like: + # + # 54: + # Clause 67, page 30, line 9, leave out “high” and insert + # “higher” + # + # so we need to parse the tags to make sure we get the + # indenting etc + for tag in amendment.getchildren(): + tag_name = self.get_tag_name_no_ns(tag) + self.handle_tag(tag_name, tag) def parse_clause_heading(self, heading): tag = etree.Element('p') From 8c26cad7c76b35ebb003e841abcdcf64444dc1a0 Mon Sep 17 00:00:00 2001 From: Struan Donald Date: Thu, 16 Mar 2017 09:50:11 +0000 Subject: [PATCH 16/17] Script to reparse all hansard zip file contents Scans the list of seen files and then picks out the latest one and then re-parses that. Assumes that the files are ordered in date order in the list. --- pyscraper/reparse.py | 97 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100755 pyscraper/reparse.py diff --git a/pyscraper/reparse.py b/pyscraper/reparse.py new file mode 100755 index 000000000..2584b6c47 --- /dev/null +++ b/pyscraper/reparse.py @@ -0,0 +1,97 @@ +#! 
/usr/bin/env python + +import argparse +import os +import re +from os.path import join + +from miscfuncs import toppath +from new_hansard import ParseDay + +index_filename = join(toppath, 'seen_hansard_xml.txt') +reparse_filename = join(toppath, 'reparse_hansard_xml.txt') +zip_directory = join(toppath, 'cmpages', 'hansardzips') +zip_dir_slash = "%s/" % zip_directory +line_re = re.compile(r'^[^:]*:(.*/)([^/]*)$') +files = {} + +parser = argparse.ArgumentParser(description='Process Hansard XML.') +parser.add_argument('-v', '--verbose', action='count') +ARGS = parser.parse_args() + +# make sure we only look at a file once +class Entries(list): + def __init__(self): + entries = [] + if os.path.exists(reparse_filename): + with open(reparse_filename) as f: + entries = [e.strip().replace(zip_dir_slash, '') for e in f.readlines()] + super(Entries, self).__init__(entries) + + def dump(self): + with open(reparse_filename, 'w') as f: + f.writelines("{0}\n".format(entry) for entry in self) + +entries = Entries() + + +def handle_file(filename, debate_type): + file_key = '{0}:{1}'.format( + debate_type, + filename.strip().replace(zip_dir_slash, '') + ) + if file_key in entries: + if ARGS.verbose: + print "already seen {0}, not re-parsing again".format(filename) + return False + + parser.reset() + if ARGS.verbose: + print "looking at {0}".format(filename) + ret = parser.handle_file(filename, debate_type, ARGS.verbose) + + if ret == 'failed': + print "ERROR parsing {0} {1}".format(filename, debate_type) + elif ret == 'not-present': + print "Nothing to parse in {0} {1}".format(filename, debate_type) + elif ret == 'same': + print "parsed {0} {1}, no changes from {2}".format( + filename, debate_type, parser.prev_file + ) + elif ret in ('change', 'new'): + print "parsed {0} {1} to {2}".format( + filename, debate_type, parser.output_file + ) + else: + print "parsed {0} {1} to {2}, unknown return {3}".format( + filename, debate_type, parser.output_file, ret + ) + 
entries.append(file_key) + + return True + +with open(index_filename) as lines: + for line in lines: + matches = line_re.search(line.strip()) + if matches: + files[matches.group(2)] = matches.group(1) + matches.group(2) + +parser = ParseDay() + +try: + for filename in files.values(): + f = join(toppath, 'cmpages', 'hansardzips', filename) + if 'CHAN' in f: + handle_file(f, 'debate') + handle_file(f, 'westminhall') + elif 'LHAN' in f: + handle_file(f, 'lords') + elif 'PBC' in f: + handle_file(f, 'standing') + +# this is just to make sure we record progress +except Exception: + entries.dump() + raise + +entries.dump() From 97d679cea618533cbf1f4caba4705c848fb12894 Mon Sep 17 00:00:00 2001 From: Struan Donald Date: Fri, 17 Mar 2017 12:39:28 +0000 Subject: [PATCH 17/17] handle times with a newline between hours and minutes --- pyscraper/new_hansard.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyscraper/new_hansard.py b/pyscraper/new_hansard.py index ee818bbc4..941690751 100755 --- a/pyscraper/new_hansard.py +++ b/pyscraper/new_hansard.py @@ -979,7 +979,7 @@ def parse_time(self, tag): time_txt = u''.join(tag.xpath('.//text()')) if time_txt == '': return - matches = re.match('(\d+)(?:[:.](\d+))?[\xa0\s]*(am|pm)', time_txt) + matches = re.match('(\d+)(?:[:.\n](\d+))?[\xa0\s]*(am|pm)', time_txt) if matches: hours = int(matches.group(1)) minutes = int(matches.group(2) or 0)