From daf097f0b9b910aafb8ab3a4c39bcc965f73d44e Mon Sep 17 00:00:00 2001 From: Matthew Somerville Date: Mon, 13 Mar 2023 15:14:21 +0000 Subject: [PATCH] Upgrade python 2 to 3. --- filtersentence_xml.py | 19 +- get-historic-debates | 10 +- get-historic-person-urls | 6 +- london-mayors-questions/questions.py | 18 +- members/parl-old-check-party.py | 8 +- members/wikipedia-commons.py | 26 +- members/wikipedia-lords.py | 16 +- members/wikipedia-standingdown.py | 12 +- pyscraper/base_resolver.py | 16 +- pyscraper/contextexception.py | 7 - pyscraper/get_links_from_ep.py | 2 +- pyscraper/gettwittermps.py | 8 +- pyscraper/gidmatching.py | 22 +- pyscraper/lazyrunall.py | 8 +- pyscraper/lords/resolvenames.py | 25 +- pyscraper/miscfuncs.py | 785 +++++++++--------- pyscraper/new_hansard.py | 102 ++- pyscraper/ni/parse.py | 13 +- pyscraper/ni/resolvenames.py | 28 +- pyscraper/ni/scrape.py | 26 +- pyscraper/ni/wikipedia-mla.py | 22 +- pyscraper/parlphrases.py | 5 +- pyscraper/patchtool.py | 31 +- pyscraper/process_hansard.py | 18 +- pyscraper/pullgluepages.py | 3 +- pyscraper/regmem/filter.py | 22 +- pyscraper/regmem/pullgluepages.py | 53 +- pyscraper/resolvemembernames.py | 59 +- pyscraper/runfilters.py | 26 +- pyscraper/sp/common.py | 18 +- pyscraper/sp/fastest-msps.py | 21 +- pyscraper/sp/get-official-reports-new.py | 39 +- pyscraper/sp/parse-official-reports-new.py | 82 +- pyscraper/sp/resolvenames.py | 26 +- pyscraper/sp/wikipedia-msp.py | 26 +- pyscraper/test.py | 58 +- pyscraper/unpack_hansard_zips.py | 8 +- pyscraper/wa/parse.py | 33 +- pyscraper/xmlfilewrite.py | 11 +- scripts/2016_data_update/dadem_import_ni.py | 6 +- scripts/2016_data_update/dadem_import_sp.py | 8 +- scripts/2021-lam-update | 8 +- scripts/2021-msp-update | 6 +- scripts/add-new-lords | 16 +- scripts/datadotparl/crawl-members | 2 +- scripts/datadotparl/json-add-new-parl-ids | 2 +- scripts/datadotparl/mp-party-check | 2 +- scripts/datadotparl/one-off-add-pims-ids | 2 +- scripts/datadotparl/one-off-sync-lord-parties | 2 +- scripts/datadotparl/update-members | 2 +- scripts/fetch-mp-eu-ref-positions | 8 +- scripts/fetch-pw-json | 4 +- scripts/fetch_london_assembly.py | 30 +- scripts/fetch_scottish_ministers.py | 11 +- .../fetch_wikidata_from_everypolitician.py | 2 +- scripts/json-add-membership | 2 +- scripts/json-body-end | 2 +- scripts/json-change-party | 2 +- scripts/json-edit-person | 2 +- scripts/json-end-membership | 8 +- scripts/json-merge-people | 2 +- scripts/json-new-ids | 14 +- scripts/json-nia-2017-new | 10 +- scripts/json-nia-2022-new | 6 +- scripts/popolo/menu.py | 1 + scripts/popolo/utils.py | 2 +- scripts/quickupdate | 10 +- scripts/welsh-parliament/dual-posts.py | 2 +- scripts/welsh-parliament/memberships.py | 14 +- scripts/welsh-parliament/official-ids.py | 2 +- scripts/welsh-parliament/organizations.py | 14 +- scripts/welsh-parliament/persons.py | 14 +- scripts/welsh-parliament/posts.py | 16 +- scripts/ynmp/update.py | 18 +- wrans-2014/parse.py | 26 +- 75 files changed, 943 insertions(+), 1053 deletions(-) diff --git a/filtersentence_xml.py b/filtersentence_xml.py index 49dd08569..2e64fbb73 100644 --- a/filtersentence_xml.py +++ b/filtersentence_xml.py @@ -1,16 +1,10 @@ -#! 
/usr/bin/python - from datetime import datetime import re -import string from lxml import etree from contextexception import ContextException from parlphrases import parlPhrases - -from wrans.emblinks import rreglink, rregemail, rehtlink, ConstructHTTPlink - from resolvemembernames import memberList @@ -81,12 +75,12 @@ def TokenStandingOrder(mstandingo, phrtok): 'phrase', ' class="standing-order" code="%s"' % mstandingo.group(1) ) +rehtlink = re.compile('(?= string.atoi(qcpart.group(2)): - print ' non-following column leadoff ', qoffrep.group(0) + if int(qcpartlead) >= int(qcpart.group(2)): + print(' non-following column leadoff ', qoffrep.group(0)) # raise Exception, ' non-following column leadoff ' if qcolsuffix == 'WH': @@ -218,8 +212,7 @@ def TokenHonFriend(mhonfriend, phrtok): # remove any xml entities from the name orgname = res[1] - # if you put the .encode("latin-1") on the res[1] it doesn't work when there are strange characters. - return ('phrase', (' class="honfriend" person_id="%s" name="%s"' % (nid, orgname)).encode("latin-1")) + return ('phrase', ' class="honfriend" person_id="%s" name="%s"' % (nid, orgname)) # the array of tokens which we will detect on the way through @@ -312,4 +305,4 @@ def GetPara(self): else: res.append(tok[2]) - return string.join(res, '') + return ''.join(res) diff --git a/get-historic-debates b/get-historic-debates index ee5c5fff4..5c4d36e79 100644 --- a/get-historic-debates +++ b/get-historic-debates @@ -11,7 +11,7 @@ import requests import requests_cache from xml.sax.saxutils import escape, quoteattr from lxml import html, etree -from urlparse import urljoin +from urllib.parse import urljoin requests_cache.install_cache(cache_name='debates', allowable_codes=(200, 404)) BASE_SOURCE_URL = 'https://api.parliament.uk' @@ -183,9 +183,9 @@ def walk(ol, typ, prefix=''): title = link.text_content() url = BASE_SOURCE_URL + link.get('href') if re.match('ORALL? ANS[WN]?ERS? [Tt][Oo] [OQU]UESTIONS?[.,]?$', title): - next_prefix = u'Oral Answers to Questions — ' + next_prefix = 'Oral Answers to Questions — ' elif re.match('ORDERS OF THE\.? 
DAY[.,:]?$', title): - next_prefix = u'Orders of the Day — ' + next_prefix = 'Orders of the Day — ' else: next_prefix = '' out += output_xml('%s-heading' % typ, '%s%s' % (prefix, escape(title)), url) @@ -225,13 +225,13 @@ for year in range(1919, 1935+1): if "id='commons'" not in res.content: continue - print '\r\x1b[K%d' % year, month, day, + print('\r\x1b[K%d' % year, month, day, end=' ') date = '%d-%02d-%02d' % (year, months.index(month)+1, day) col = 0 tree = html.fromstring(res.content) ol = tree.cssselect('h3#commons + ol')[0] - out = u'\n' + out = '\n' out += walk(ol, 'major') out += '\n' diff --git a/get-historic-person-urls b/get-historic-person-urls index 7719ae1d9..23e777986 100644 --- a/get-historic-person-urls +++ b/get-historic-person-urls @@ -55,7 +55,7 @@ for l in string.ascii_lowercase: for li in lis: a = li.find('a') url = a['href'] - print '\r\x1b[K' + url, + print('\r\x1b[K' + url, end=' ') html = requests.get('https://api.parliament.uk/historic-hansard/people/%s/index.html' % url).text if html == 'Page not found': html = requests.get('https://api.parliament.uk/historic-hansard/people/%s' % url).text @@ -99,10 +99,10 @@ for l in string.ascii_lowercase: people[name] = [mm for mm in people[name] if mm['id'] != pid] found = True if not found: - print '\n', url, presence[pid]['min'], presence[pid]['max'], fr, to + print('\n', url, presence[pid]['min'], presence[pid]['max'], fr, to) raise Exception else: - print '\n', matches, url + print('\n', matches, url) raise Exception json.dump(data, open(JSON, 'w'), indent=2, sort_keys=True) diff --git a/london-mayors-questions/questions.py b/london-mayors-questions/questions.py index 596ce7e26..73db32a89 100755 --- a/london-mayors-questions/questions.py +++ b/london-mayors-questions/questions.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python +#! /usr/bin/env python3 import os import logging @@ -84,7 +84,7 @@ def writeScraperState(state, output_folder): with open(output_file, 'w') as state_json_file: logger.debug('Writing state file') state_json_file.write(json_string) - except TypeError, e: + except TypeError as e: logger.error('Could not serialise to valid JSON: {}'.format(str(e))) @@ -234,14 +234,14 @@ def parseQuestionPage(content): question_title = main_content.h1.text.strip() - logger.debug(u'Question title is {}'.format(question_title)) + logger.debug('Question title is {}'.format(question_title)) # Extract who asked it asked_by_name = main_content.find('div', class_='field--name-field-asked-by').find('div', class_='field__item').text.strip() asked_by_person = getSpeakerObjectFromName(asked_by_name) - logger.debug(u'Question asked by {}'.format(asked_by_person['name'])) + logger.debug('Question asked by {}'.format(asked_by_person['name'])) # Try to extract the actual question @@ -326,7 +326,7 @@ def parseAnswersFromQuestionPage(page_content): answered_by_name = answer_article.find('div', class_='field--name-field-answered-by').find('div', class_='field__item').text.strip() answered_by_person = getSpeakerObjectFromName(answered_by_name) - logger.debug(u'Question answered by {}'.format(answered_by_person['name'])) + logger.debug('Question answered by {}'.format(answered_by_person['name'])) answer_paragraphs = [] @@ -410,12 +410,12 @@ def getPersonIDFromName(name): def getSpeakerObjectFromName(name): ''' Given a name, try to find a speaker ID and return a whole object. 
''' - name = name.replace(u'\u00a0', ' ') + name = name.replace('\u00a0', ' ') name = stripPatternsFromName(name) id = getPersonIDFromName(name) if not id: if 'Liz Peace' not in name: - logger.warning(u'Could not match name {} to any assembly member'.format(name)) + logger.warning('Could not match name {} to any assembly member'.format(name)) id = 'unknown' return { @@ -427,7 +427,7 @@ def getSpeakerObjectFromName(name): def cleanParagraphText(text): # Remove non-breaking spaces followed by a space. - text = text.replace(u'\u00a0 ', ' ') + text = text.replace('\u00a0 ', ' ') # Strip trailing whitespace text = text.strip() @@ -618,7 +618,7 @@ def loadMembershipsFromFile(members_file): if name not in person_ids_by_name: person_ids_by_name[name] = membership['person_id'] - logger.debug(u'Added ID map for for {}'.format(name)) + logger.debug('Added ID map for for {}'.format(name)) else: if person_ids_by_name[name] != membership['person_id']: raise Exception('Multiple people with name {}'.format(name)) diff --git a/members/parl-old-check-party.py b/members/parl-old-check-party.py index 22864bf9b..badcef9df 100644 --- a/members/parl-old-check-party.py +++ b/members/parl-old-check-party.py @@ -6,7 +6,7 @@ # fixing (or longer term, fix it automatically). import re -import urllib +import urllib.request import lxml.objectify import sys @@ -86,10 +86,10 @@ def __init__(self, lord): self.type = TYPES.index('Elected Hereditary') # One of the 92 def __str__(self): - return u'%s (%s) - %s' % ( self.longTitle, PARTIES[self.party], self.status ) + return '%s (%s) - %s' % ( self.longTitle, PARTIES[self.party], self.status ) # Fetch the current live information -lords = urllib.urlopen('http://data.parliament.uk/resources/members/api/lords/all/').read() +lords = urllib.request.urlopen('http://data.parliament.uk/resources/members/api/lords/all/').read() lords = [ Lord(lord) for lord in lxml.objectify.fromstring(lords).peer ] for lord in lords: @@ -113,5 +113,5 @@ def __str__(self): if PARTIES[lord.party] == 'UK Independence Party' and lordsList.lords[match]['party'] == 'UKIP': continue if PARTIES[lord.party] == 'Plaid Cymru' and lordsList.lords[match]['party'] == 'PC': continue if PARTIES[lord.party] == 'Plaid Cymru' and lordsList.lords[match]['party'] == 'PC': continue - print PARTIES[lord.party], lordsList.lords[match]['party'] + print(PARTIES[lord.party], lordsList.lords[match]['party']) diff --git a/members/wikipedia-commons.py b/members/wikipedia-commons.py index 9caca7e11..511684e64 100755 --- a/members/wikipedia-commons.py +++ b/members/wikipedia-commons.py @@ -1,6 +1,4 @@ -#!/usr/bin/env python -# -*- coding: latin-1 -*- -# $Id: bbcconv.py,v 1.4 2005/03/25 23:33:35 theyworkforyou Exp $ +#!/usr/bin/env python3 # Screen scrape list of links to Lords on Wikipedia, so we can link to the articles. 
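Note: the hunk below is one instance of the most common change in this patch. Python 3 merged Python 2's urllib, urllib2 and urlparse modules into the urllib package, so both the imports and the call sites change. A minimal sketch of the before and after, not part of the patch itself; the base URL and path here are hypothetical, for illustration only:

    # Python 2:
    #   import urllib, urlparse
    #   page = urllib.urlopen(urlparse.urljoin(base, path)).read()
    # Python 3, as applied throughout this patch:
    import urllib.parse
    import urllib.request

    base = 'https://en.wikipedia.org/wiki/'  # hypothetical base URL
    path = 'Member_of_Parliament'            # hypothetical path
    page = urllib.request.urlopen(urllib.parse.urljoin(base, path)).read()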
@@ -11,8 +9,7 @@ import datetime import sys -import urllib -import urlparse +import urllib.parse import re # import sets @@ -49,25 +46,22 @@ cons = cons2 name = name2 url = url2 - cons = cons.decode('utf-8') cons = cons.replace('&', '&') - name = name.decode('utf-8') try: (id, canonname, canoncons) = memberList.matchfullnamecons(name, cons, date_parl[year]) - except Exception, e: - print >>sys.stderr, e + except Exception as e: + print(e, file=sys.stderr) if not id: continue wikimembers[id] = url -print ''' -''' -k = wikimembers.keys() -k.sort() +print(''' +''') +k = sorted(wikimembers) for id in k: - url = urlparse.urljoin(wiki_index_url, wikimembers[id]) - print '' % (id, url) -print '' + url = urllib.parse.urljoin(wiki_index_url, wikimembers[id]) + print('' % (id, url)) +print('') #wikimembers = sets.Set(wikimembers.keys()) #print "len: ", len(wikimembers) diff --git a/members/wikipedia-lords.py b/members/wikipedia-lords.py index b6962b78b..34ac829ba 100755 --- a/members/wikipedia-lords.py +++ b/members/wikipedia-lords.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Screen scrape list of links to Lords on Wikipedia, so we can link to the articles. @@ -9,7 +9,7 @@ import datetime import sys -import urlparse +import urllib.parse import re sys.path.append("../pyscraper") @@ -32,19 +32,19 @@ id = None try: id = lordsList.GetLordIDfname(name, None, date_today) - except Exception, e: + except Exception as e: continue if not id: continue wikimembers[id] = url -print ''' -''' +print(''' +''') for id, url in sorted(wikimembers.items()): - url = urlparse.urljoin(wiki_index_url, url) - print '' % (id, url) -print '' + url = urllib.parse.urljoin(wiki_index_url, url) + print('' % (id, url)) +print('') #print "len: ", len(wikimembers) diff --git a/members/wikipedia-standingdown.py b/members/wikipedia-standingdown.py index 5b952cc58..ae3f41f74 100755 --- a/members/wikipedia-standingdown.py +++ b/members/wikipedia-standingdown.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Screen scrape list of who's standing down in the 2010 general election @@ -7,9 +7,7 @@ # certain conditions. However, it comes with ABSOLUTELY NO WARRANTY. # For details see the file LICENSE.html in the top level of the source. -import datetime import sys -import urlparse import re sys.path.append("../pyscraper") @@ -19,8 +17,8 @@ page = open('../rawdata/MPs_standing_down_in_2010').read() -print ''' -''' +print(''' +''') m = re.findall('
  • ]*>([^<]*)', page) for row in m: url, name = row @@ -28,6 +26,6 @@ if name in ('Iris Robinson', 'Ashok Kumar', 'David Taylor'): continue id, canonname, canoncons = memberList.matchfullnamecons(name, None, today) pid = memberList.membertoperson(id) - print (' ' % (pid, name)).encode('iso-8859-1') -print '' + print((' ' % (pid, name)).encode('iso-8859-1')) +print('') diff --git a/pyscraper/base_resolver.py b/pyscraper/base_resolver.py index 0da4de395..dc54bbb2c 100644 --- a/pyscraper/base_resolver.py +++ b/pyscraper/base_resolver.py @@ -67,12 +67,12 @@ def import_people_membership(self, mship, posts, orgs): return if mship["id"] in self.membertopersonmap: - raise Exception, "Same member id %s appeared twice" % mship["id"] + raise Exception("Same member id %s appeared twice" % mship["id"]) self.membertopersonmap[mship["id"]] = mship['person_id'] self.persontomembermap.setdefault(mship['person_id'], []).append(mship["id"]) if self.members.get(mship["id"]): - raise Exception, "Repeated identifier %s in members JSON file" % mship["id"] + raise Exception("Repeated identifier %s in members JSON file" % mship["id"]) self.members[mship["id"]] = mship if 'end_date' not in mship: @@ -90,14 +90,14 @@ def import_people_membership(self, mship, posts, orgs): mship_start_date <= mship_end_date and mship_end_date <= cons['end_date']): if consid and consid != cons['id']: - raise Exception, "Two constituency ids %s %s overlap with MP %s" % (consid, cons['id'], mship['id']) + raise Exception("Two constituency ids %s %s overlap with MP %s" % (consid, cons['id'], mship['id'])) consid = cons['id'] if not consid: - raise Exception, "Constituency '%s' not found" % mship["constituency"] + raise Exception("Constituency '%s' not found" % mship["constituency"]) # check name in members file is same as default in cons file backformed_cons = self.considtonamemap[consid] if backformed_cons != mship["constituency"]: - raise Exception, "Constituency '%s' in members file differs from first constituency '%s' listed in cons file" % (mship["constituency"], backformed_cons) + raise Exception("Constituency '%s' in members file differs from first constituency '%s' listed in cons file" % (mship["constituency"], backformed_cons)) # check first date ranges don't overlap, MPs only # Only check modern MPs as we might have overlapping data previously @@ -108,7 +108,7 @@ def import_people_membership(self, mship, posts, orgs): or cons['start_date'] <= mship['end_date'] <= cons['end_date'] \ or mship['start_date'] <= cons['start_date'] <= mship['end_date'] \ or mship['start_date'] <= cons['end_date'] <= mship['end_date']: - raise Exception, "%s %s Two MP entries for constituency %s with overlapping dates" % (mship, cons, consid) + raise Exception("%s %s Two MP entries for constituency %s with overlapping dates" % (mship, cons, consid)) # then add in self.considtomembermap.setdefault(consid, []).append(mship) @@ -124,7 +124,7 @@ def import_people_names(self, person): if person['id'] not in self.persontomembermap: return self.persons[person['id']] = person - memberships = map(lambda x: self.members[x], self.persontomembermap[person['id']]) + memberships = [self.members[x] for x in self.persontomembermap[person['id']]] for other_name in person.get('other_names', []): if other_name.get('note') == 'Main': self.import_people_main_name(other_name, memberships) @@ -213,7 +213,7 @@ def name_on_date(self, person_id, date): if nm['lordofname']: name += ' of %s' % nm['lordofname'] return name - raise Exception, 'No found for %s on %s' % 
(person['id'], date) + raise Exception('No found for %s on %s' % (person['id'], date)) def membertoperson(self, memberid): return self.membertopersonmap[memberid] diff --git a/pyscraper/contextexception.py b/pyscraper/contextexception.py index ec52f2f85..9d1b3b7d3 100755 --- a/pyscraper/contextexception.py +++ b/pyscraper/contextexception.py @@ -1,12 +1,6 @@ #! $Id: contextexception.py,v 1.12 2004/12/23 12:27:09 goatchurch Exp $ # vim:sw=8:ts=8:et:nowrap -import os -import string -import re -import sys -import shutil - class ContextException(Exception): def __init__(self, description, stamp = None, fragment = None): @@ -22,4 +16,3 @@ def __str__(self): if self.stamp: ret = ret + repr(self.stamp) + "\n" return ret - diff --git a/pyscraper/get_links_from_ep.py b/pyscraper/get_links_from_ep.py index af331a928..30db86271 100755 --- a/pyscraper/get_links_from_ep.py +++ b/pyscraper/get_links_from_ep.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python3 import operator from lxml import etree diff --git a/pyscraper/gettwittermps.py b/pyscraper/gettwittermps.py index 79895b9a2..3a10319dd 100755 --- a/pyscraper/gettwittermps.py +++ b/pyscraper/gettwittermps.py @@ -1,12 +1,12 @@ -#!/usr/bin/python +#!/usr/bin/env python3 -import urllib2 +import urllib.request import csv import xml.sax uri = "http://spreadsheets.google.com/tq?tqx=out:csv&key=0AjWA_TWMI4t_dFI5MWRWZkRWbFJ6MVhHQzVmVndrZnc&hl=en_GB" -f = urllib2.urlopen(uri) +f = urllib.request.urlopen(uri) csv_data = f.read() lines = csv_data.split("\n") rows = csv.reader(lines.__iter__(), delimiter=',', quotechar='"') @@ -34,7 +34,7 @@ def endElement(self,name): output_filename = "../members/twitter-commons.xml" fp = open(output_filename,"w") -fp.write(''' +fp.write(''' ''') diff --git a/pyscraper/gidmatching.py b/pyscraper/gidmatching.py index d2ab7aac1..c11ad2a50 100644 --- a/pyscraper/gidmatching.py +++ b/pyscraper/gidmatching.py @@ -1,12 +1,6 @@ -import sys import re -import os -import xml.sax -import tempfile -import string import miscfuncs import difflib -from pprint import pprint #from xmlfilewrite import PrevParsedFile class PrevParsedFile: @@ -25,7 +19,7 @@ def GetMinIndex(indx, a): assert indx[0] == 0 and a < indx[-1] i0, i1 = 0, len(indx) - 1 while i0 + 1 < i1: - im = (i0 + i1) / 2 + im = (i0 + i1) // 2 assert i0 != im and i1 != im if indx[im] <= a: i0 = im @@ -47,7 +41,7 @@ def PrepareXMLForDiff(scrapeversion): # new_chk = chk[2] new_chk = re.sub( r'(?s)(<(p|tr)\s[^>]*>)(.*?)(<\/\2>)', - lambda m: (u''.join((m.group(1), re.sub('\n', ' ', m.group(3)), m.group(4)))), + lambda m: (''.join((m.group(1), re.sub('\n', ' ', m.group(3)), m.group(4)))), chk[2] ) essxindx.append(len(essxlist)) @@ -162,12 +156,12 @@ def DoFactorDiff(essflatbindx, essflatblist, essxindx, essxlist, chks, flatb): # missing speech else: - print chks[ix] + print(chks[ix]) if lastmatchg: - print "Missing speech matched to last matched speech" + print("Missing speech matched to last matched speech") matchlist = [ lastmatchg ] else: - print "No match on first speech problem." 
+ print("No match on first speech problem.") matchlist = [] matchtype = "missing" @@ -248,7 +242,7 @@ def FactorChangesWrans(majblocks, scrapeversion): for qqnum in qqnums: if qblock: if qblock.headingqb.qGID != qnummapq[qqnum].headingqb.qGID: - print qblock.headingqb.qGID, qnummapq[qqnum].headingqb.qGID + print(qblock.headingqb.qGID, qnummapq[qqnum].headingqb.qGID) assert qblock.headingqb.qGID == qnummapq[qqnum].headingqb.qGID elif qqnum != '0' and qqnum in qnummapq: # 0 is when there is a missing qnum qblock = qnummapq[qqnum] @@ -266,7 +260,7 @@ def FactorChangesWrans(majblocks, scrapeversion): qmissblockscorebest = max(qmissblocksscore) qblock = qnummapq[qmissblockscorebest[1]] if miscfuncs.IsNotQuiet(): - print "Missing qnum; mapping %s to %s with score %f" % (qebchk[0], qblock.headingqb.qGID, qmissblockscorebest[0]) + print("Missing qnum; mapping %s to %s with score %f" % (qebchk[0], qblock.headingqb.qGID, qmissblockscorebest[0])) assert qmissblockscorebest[0] > 0.8 # otherwise it's not really a match and we need to look harder. # perhaps it's matched to a block in the new file which newly has a qnum, and we then have to scan against all of them. @@ -324,7 +318,7 @@ def FactorChangesWrans(majblocks, scrapeversion): # sometimes we get more than one question. # when we find a mismatch we'll deal with it as a special paragraph problem, or not bother. if len(qebchkquesids) != len(qblock.queses): - print len(qebchkquesids), len(qblock.queses), qblock.queses[0].qGID + print(len(qebchkquesids), len(qblock.queses), qblock.queses[0].qGID) assert len(qebchkquesids) == len(qblock.queses) for i in range(len(qebchkquesids)): res.append('\n' % (qebchkquesids[i], qblock.queses[i].qGID, matchtype)) diff --git a/pyscraper/lazyrunall.py b/pyscraper/lazyrunall.py index a2fa61b5f..1da9d4010 100755 --- a/pyscraper/lazyrunall.py +++ b/pyscraper/lazyrunall.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python +#! 
/usr/bin/env python3 # vim:sw=8:ts=8:et:nowrap # Run the script with --help to see command line options @@ -91,18 +91,18 @@ elif arg == "ni": options.ni = True else: - print >>sys.stderr, "error: no such option %s" % arg + print("error: no such option %s" % arg, file=sys.stderr) parser.print_help() sys.exit(1) if len(args) == 0: parser.print_help() sys.exit(1) if not options.scrape and not options.parse: - print >>sys.stderr, "error: choose what to do; scrape, parse, or both" + print("error: choose what to do; scrape, parse, or both", file=sys.stderr) parser.print_help() sys.exit(1) if not options.regmem and not options.ni: - print >>sys.stderr, "error: choose what work on; regmem, several of them" + print("error: choose what work on; regmem, several of them", file=sys.stderr) parser.print_help() sys.exit(1) diff --git a/pyscraper/lords/resolvenames.py b/pyscraper/lords/resolvenames.py index a61e5b1e8..2589de6d1 100644 --- a/pyscraper/lords/resolvenames.py +++ b/pyscraper/lords/resolvenames.py @@ -1,6 +1,5 @@ import json import os.path -import string import re from contextexception import ContextException @@ -22,7 +21,7 @@ hontitles = [ 'Lord ?Bishop', 'Bishop', 'Marquess', 'Lord', 'Baroness', 'Viscount', 'Earl', 'Countess', 'Lord Archbishop', 'Archbishop', 'Duke', 'Lady' ] -hontitleso = string.join(hontitles, '|') +hontitleso = '|'.join(hontitles) honcompl = re.compile('(?:(%s)|(%s) \s*(.*?))(?:\s+of\s+(.*))?$' % (hontitleso, hontitleso)) @@ -44,12 +43,12 @@ def import_people_membership(self, mship, posts, orgs): return if mship["id"] in self.membertopersonmap: - raise Exception, "Same member id %s appeared twice" % mship["id"] + raise Exception("Same member id %s appeared twice" % mship["id"]) self.membertopersonmap[mship["id"]] = mship['person_id'] self.persontomembermap.setdefault(mship['person_id'], []).append(mship["id"]) if self.members.get(mship["id"]): - raise Exception, "Repeated identifier %s in members JSON file" % mship["id"] + raise Exception("Repeated identifier %s in members JSON file" % mship["id"]) self.members[mship["id"]] = mship if 'end_date' not in mship: @@ -62,7 +61,6 @@ def import_people_main_name(self, name, memberships): lname = re.sub("\.", "", lname) assert lname attr = { - "id": m["id"], "title": name["honorific_prefix"], "lordname": name.get("lordname", ""), "lordofname": name.get("lordofname", ""), @@ -71,6 +69,7 @@ def import_people_main_name(self, name, memberships): newattr = attr.copy() newattr['start_date'] = max(m['start_date'], name.get('start_date', '1000-01-01')) newattr['end_date'] = min(m['end_date'], name.get('end_date', '9999-12-31')) + newattr['id'] = m["id"] self.lordnames.setdefault(lname, []).append(newattr) def import_people_alternate_name(self, person, other_name, memberships): @@ -84,8 +83,8 @@ def GetLordID(self, ltitle, llordname, llordofname, loffice, stampurl, sdate, bD if ltitle == "Lord Archbishop": ltitle = "Archbishop" - llordofname = string.replace(llordofname, ".", "") - llordname = string.replace(llordname, ".", "") + llordofname = llordofname.replace(".", "") + llordname = llordname.replace(".", "") llordname = re.sub('&#(039|146|8217);', "'", llordname) llordofname = llordofname.strip() @@ -130,7 +129,7 @@ def GetLordID(self, ltitle, llordname, llordofname, loffice, stampurl, sdate, bD if lm["start_date"] <= sdate <= lm["end_date"]: if lm["lordname"] and llordofname: #if not IsNotQuiet(): - print "cm---", ltitle, lm["lordname"], lm["lordofname"], llordname, llordofname + print("cm---", ltitle, lm["lordname"], 
lm["lordofname"], llordname, llordofname) raise ContextException("lordofname matches lordname in lordlist", stamp=stampurl, fragment=lname) else: assert lm["lordofname"] and llordname @@ -139,7 +138,7 @@ def GetLordID(self, ltitle, llordname, llordofname, loffice, stampurl, sdate, bD raise ContextException("lordname matches lordofname in lordlist", stamp=stampurl, fragment=lname) res.append(lm) elif ltitle != "Bishop" and ltitle != "Archbishop" and (ltitle, lname) not in (("Duke", "Norfolk"), ("Duke", "Wellington"), ('Earl', 'Kinnoull'), ('Earl', 'Selborne')): - print lm + print(lm) raise ContextException("wrong dates on lords with same name", stamp=stampurl, fragment=lname) if not res: @@ -187,19 +186,19 @@ def MatchRevName(self, fss, sdate, stampurl): assert fss lfn = re.match('(.*?)(?: of (.*?))?, {0,3}((?:L|B|Abp|Bp|V|E|D|M|C|Ly)\.?)$', fss) if not lfn: - print "$$$%s$$$" % fss + print("$$$%s$$$" % fss) raise ContextException("No match of format in MatchRevName", stamp=stampurl, fragment=fss) shorttitle = lfn.group(3) if shorttitle[-1] != '.': shorttitle += "." ltitle = titleconv[shorttitle] - llordname = string.replace(lfn.group(1), ".", "") - llordname = string.replace(llordname, "'", "'") + llordname = lfn.group(1).replace(".", "") + llordname = llordname.replace("'", "'") llordname = re.sub("^De ", "de ", llordname) fullname = '%s %s' % (ltitle, llordname) llordofname = "" if lfn.group(2): - llordofname = string.replace(lfn.group(2), ".", "") + llordofname = lfn.group(2).replace(".", "") fullname = '%s of %s' % (fullname, llordofname) if fullname in self.aliases: diff --git a/pyscraper/miscfuncs.py b/pyscraper/miscfuncs.py index 341d49b87..1cc463e33 100755 --- a/pyscraper/miscfuncs.py +++ b/pyscraper/miscfuncs.py @@ -1,6 +1,3 @@ -#! /usr/bin/python -# vim:sw=8:ts=8:et:nowrap - import re import sys import string @@ -22,7 +19,7 @@ pwpatchesdirs = os.path.abspath("patches") # made locally, relative to the lazyrunall.py module. Should be relative to toppath eventually if (not os.path.isdir(toppath)): - raise Exception, 'Data directory %s does not exist, please create' % (toppath) + raise Exception('Data directory %s does not exist, please create' % (toppath)) # print "Data directory (set in miscfuncs.py): %s" % toppath # temporary files are stored here @@ -34,15 +31,15 @@ # find raw data path rawdatapath = os.path.join(os.getcwd(), "../rawdata") if (not os.path.isdir(toppath)): - raise Exception, 'Raw data directory %s does not exist, you\'ve not got a proper checkout from CVS.' % (toppath) + raise Exception('Raw data directory %s does not exist, you\'ve not got a proper checkout from CVS.' 
% (toppath)) # quiet flag bNotQuiet = True def SetQuiet(): - global bNotQuiet - bNotQuiet = False + global bNotQuiet + bNotQuiet = False def IsNotQuiet(): - return bNotQuiet + return bNotQuiet # import lower down so we get the top-path into the contextexception file @@ -51,22 +48,22 @@ def IsNotQuiet(): # use this to generate chronological scraped files of the same page def NextAlphaString(s): - assert re.match('[a-z]*$', s) - if not s: - return 'a' - i = string.find(string.lowercase, s[-1]) + 1 - if i < len(string.lowercase): - return s[:-1] + string.lowercase[i] - return NextAlphaString(s[:-1]) + 'a' + assert re.match('[a-z]*$', s) + if not s: + return 'a' + i = string.ascii_lowercase.find(s[-1]) + 1 + if i < len(string.ascii_lowercase): + return s[:-1] + string.ascii_lowercase[i] + return NextAlphaString(s[:-1]) + 'a' def AlphaStringToOrder(s): - assert re.match('[a-z]*$', s) - res = 0 - while s: - i = string.find(string.lowercase, s[0]) + 1 - res = res * 30 + i - s = s[1:] - return res + assert re.match('[a-z]*$', s) + res = 0 + while s: + i = string.ascii_lowercase.find(s[0]) + 1 + res = res * 30 + i + s = s[1:] + return res # Impossible to do 6pm, 7.15pm, 6.30pm, 6.45pm, 7pm without future timestamps # So not caring any more about timestamp errors @@ -77,94 +74,94 @@ def AlphaStringToOrder(s): # 7 pm regparsetimeonhour = re.compile("^(\d+)()(?:\s?| )([\w\.]+)$") def TimeProcessing(time, previoustimearr, bIsDivisionTime, stampurl): - #print "time ", time - - previoustime = None - if previoustimearr: - previoustime = previoustimearr[-1] - prevtimeMatch = re.match("(\d+):(\d+)", previoustime) - previoustimehour = int(prevtimeMatch.group(1)) - - # This code lifted from fix_time PHP code from easyParliament - timeparts = regparsetime.match(time) - if not timeparts: - timeparts = regparsetimeonhour.match(time) - if timeparts: - hour = int(timeparts.group(1)) - if (timeparts.group(2) != ""): - mins = int(timeparts.group(2)) - else: - mins = 0 - meridien = timeparts.group(3) - if re.match("p\.?m\.?", meridien): - if hour != 12: - hour += 12 - elif meridien == "midnight": - assert hour == 12 - hour += 12 - elif meridien == "noon": - assert hour == 12 - else: - if hour == 12: - hour -= 12 - if not re.match("a\.?m\.?", meridien): - if previoustime and previoustimehour > hour: - hour += 12 - - # skipping forward by twelve hours is a good sign an am/pm has gotten mixed - # Assume it's that if it's exactly 12 hours, alert otherwise - if previoustime and previoustimehour + 12 == hour: - hour -= 12 - - if previoustime and previoustimehour + 12 <= hour: - print "TIME: time shift by 12 (from %s to %s) -- should a p.m. be an a.m.? %s" % (previoustime, time, repr(stampurl)) - - elif time == 'Midnight': - hour = 24 - mins = 0 - elif time == 'Noon': - hour = 12 - mins = 0 - else: - return None - - res = "%03d:%02d:00" % (hour, mins) - - - # day-rotate situation where they went on beyond midnight - # it's uncommon enough to handle by listing exceptional days - # (sometimes the division time is out of order because that is where it is inserted in the record -- maybe should patch to handle) - #print previoustime, res, bIsDivisionTime, stampurl.sdate - if previoustime and res < previoustime: - if stampurl.sdate in ["2005-03-10"]: - if previoustime < "024": - print "dayrotate on ", stampurl.sdate, (hour, mins), previoustime - hour += 24 - - # correction heading case -- a copy of some text that is to be inserted into a different day. 
- elif stampurl.sdate in ["2002-10-28"]: - return res - - elif hour in [0, 1, 2, 3, 4] or stampurl.sdate in ["2003-10-20", "2000-10-03", "2000-07-24", "2011-01-17"]: - hour += 24 - else: - print 'TIME: time rotation (from %s to %s %s) not close to midnight %s' % (previoustime, time, res, repr(stampurl)) - - res = "%03d:%02d:00" % (hour, mins) - - - # capture the case where we are out of order by more than a few minutes - # (divisions are often out of order slightly) - - # out of order case - if previoustime and res < previoustime: - # if it's a division type, we can tolerate a few minutes - timeminutes = int(hour) * 60 + int(mins) - previoustimeminutes = previoustimehour * 60 + int(prevtimeMatch.group(2)) - if timeminutes < previoustimeminutes: - if not bIsDivisionTime or (previoustimeminutes - timeminutes > 10): - print 'TIME: time out of order, from %s to %s (division=%s) %s' % (previoustime, res, bIsDivisionTime, repr(stampurl)) - return res + #print "time ", time + + previoustime = None + if previoustimearr: + previoustime = previoustimearr[-1] + prevtimeMatch = re.match("(\d+):(\d+)", previoustime) + previoustimehour = int(prevtimeMatch.group(1)) + + # This code lifted from fix_time PHP code from easyParliament + timeparts = regparsetime.match(time) + if not timeparts: + timeparts = regparsetimeonhour.match(time) + if timeparts: + hour = int(timeparts.group(1)) + if (timeparts.group(2) != ""): + mins = int(timeparts.group(2)) + else: + mins = 0 + meridien = timeparts.group(3) + if re.match("p\.?m\.?", meridien): + if hour != 12: + hour += 12 + elif meridien == "midnight": + assert hour == 12 + hour += 12 + elif meridien == "noon": + assert hour == 12 + else: + if hour == 12: + hour -= 12 + if not re.match("a\.?m\.?", meridien): + if previoustime and previoustimehour > hour: + hour += 12 + + # skipping forward by twelve hours is a good sign an am/pm has gotten mixed + # Assume it's that if it's exactly 12 hours, alert otherwise + if previoustime and previoustimehour + 12 == hour: + hour -= 12 + + if previoustime and previoustimehour + 12 <= hour: + print("TIME: time shift by 12 (from %s to %s) -- should a p.m. be an a.m.? %s" % (previoustime, time, repr(stampurl))) + + elif time == 'Midnight': + hour = 24 + mins = 0 + elif time == 'Noon': + hour = 12 + mins = 0 + else: + return None + + res = "%03d:%02d:00" % (hour, mins) + + + # day-rotate situation where they went on beyond midnight + # it's uncommon enough to handle by listing exceptional days + # (sometimes the division time is out of order because that is where it is inserted in the record -- maybe should patch to handle) + #print previoustime, res, bIsDivisionTime, stampurl.sdate + if previoustime and res < previoustime: + if stampurl.sdate in ["2005-03-10"]: + if previoustime < "024": + print("dayrotate on ", stampurl.sdate, (hour, mins), previoustime) + hour += 24 + + # correction heading case -- a copy of some text that is to be inserted into a different day. 
+ elif stampurl.sdate in ["2002-10-28"]: + return res + + elif hour in [0, 1, 2, 3, 4] or stampurl.sdate in ["2003-10-20", "2000-10-03", "2000-07-24", "2011-01-17"]: + hour += 24 + else: + print('TIME: time rotation (from %s to %s %s) not close to midnight %s' % (previoustime, time, res, repr(stampurl))) + + res = "%03d:%02d:00" % (hour, mins) + + + # capture the case where we are out of order by more than a few minutes + # (divisions are often out of order slightly) + + # out of order case + if previoustime and res < previoustime: + # if it's a division type, we can tolerate a few minutes + timeminutes = int(hour) * 60 + int(mins) + previoustimeminutes = previoustimehour * 60 + int(prevtimeMatch.group(2)) + if timeminutes < previoustimeminutes: + if not bIsDivisionTime or (previoustimeminutes - timeminutes > 10): + print('TIME: time out of order, from %s to %s (division=%s) %s' % (previoustime, res, bIsDivisionTime, repr(stampurl))) + return res # The names of entities and what they are are here: @@ -229,7 +226,7 @@ def TimeProcessing(time, previoustimearr, bIsDivisionTime, stampurl): '_':'_', # this is underscore symbol '_':'_', # this is underscore symbol - ''':"'", # possession apostrophe + ''':"'", # possession apostrophe "€":'€', # this is euro currency "™":'™', "•":'•', @@ -254,172 +251,154 @@ def TimeProcessing(time, previoustimearr, bIsDivisionTime, stampurl): def StripAnchorTags(text): - raise Exception, "I've never called this function, so test it" + raise Exception("I've never called this function, so test it") - abf = re.split('(<[^>]*>)', text) + abf = re.split('(<[^>]*>)', text) - ret = '' - for ab in abf: - if re.match(']*>(?i)', ab): - pass + ret = '' + for ab in abf: + if re.match(']*>(?i)', ab): + pass - elif re.match('(?i)', ab): - pass + elif re.match('(?i)', ab): + pass - else: - ret = ret + ab + else: + ret = ret + ab - return ret + return ret def WriteCleanText(fout, text, striphref=True): - text = re.sub('', '', text) - abf = re.split('(<[^>]*>)', text) - for ab in abf: - # delete comments and links - if re.match(']*?->', ab): - pass + text = re.sub('', '', text) + abf = re.split('(<[^>]*>)', text) + for ab in abf: + # delete comments and links + if re.match(']*?->', ab): + pass - # XXX Differs from pullgluepages version - elif striphref and re.match(']+>(?i)', ab): - anamem = re.match(']+>(?i)', ab): + anamem = re.match('(?i)', ab): - pass + elif striphref and re.match('(?i)', ab): + pass - # spaces only inside tags - elif re.match('<[^>]*>', ab): - fout.write(re.sub('\s', ' ', ab)) + # spaces only inside tags + elif re.match('<[^>]*>', ab): + fout.write(re.sub('\s', ' ', ab)) - # take out spurious > symbols and dos linefeeds - else: - fout.write(re.sub('>|\r', '', ab)) + # take out spurious > symbols and dos linefeeds + else: + fout.write(re.sub('>|\r', '', ab)) # Legacy patch system, use patchfilter.py and patchtool now def ApplyFixSubstitutions(text, sdate, fixsubs): - for sub in fixsubs: - if sub[3] == 'all' or sub[3] == sdate: - (text, n) = re.subn(sub[0], sub[1], text) - if (sub[2] != -1) and (n != sub[2]): - print sub - raise Exception, 'wrong number of substitutions %d on %s' % (n, sub[0]) - return text + for sub in fixsubs: + if sub[3] == 'all' or sub[3] == sdate: + (text, n) = re.subn(sub[0], sub[1], text) + if (sub[2] != -1) and (n != sub[2]): + print(sub) + raise Exception('wrong number of substitutions %d on %s' % (n, sub[0])) + return text # this only accepts and tags def StraightenHTMLrecurse(stex, stampurl): - # split the text into and and and - 
qisup = re.search(r'(<(a|i|b|s|small|sup|sub)( href="[^"]*")?>(.*?))(?i)', stex) + # split the text into and and and + qisup = re.search(r'(<(a|i|b|s|small|sup|sub)( href="[^"]*")?>(.*?))(?i)', stex) + if qisup: + qtagtype = qisup.group(2) + qhref = qisup.group(3) or '' + qtag = ('<%s%s>' % (qtagtype, qhref), '' % qtagtype) + if not qisup: + qisup = re.search('(<(a) href="([^"]*)">(.*?))(?i)', stex) if qisup: - qtagtype = qisup.group(2) - qhref = qisup.group(3) or '' - qtag = ('<%s%s>' % (qtagtype, qhref), '' % qtagtype) - if not qisup: - qisup = re.search('(<(a) href="([^"]*)">(.*?))(?i)', stex) - if qisup: - qtag = ('' % qisup.group(3), '') - - if qisup: - sres = StraightenHTMLrecurse(stex[:qisup.start(1)], stampurl) - sres.append(qtag[0]) - sres.extend(StraightenHTMLrecurse(qisup.group(4), stampurl)) - sres.append(qtag[1]) - sres.extend(StraightenHTMLrecurse(stex[qisup.end(1):], stampurl)) - return sres - - sres = re.split('(&[a-z0-9]*?;|&#\d+;|"|\xa3|&|\x01|\x0e|\x14|\x92|\xb0|\xab|\xe9|\xc3\xb8|\xc3\xb1|<[^>]*>|<|>)', stex) - for i in range(len(sres)): - #print "sresi ", sres[i], "\n" - #print "-----------------------------------------------\n" - - if not sres[i]: - pass - elif re.match('&#[0-9]+;', sres[i]) and not re.match('[345][0-9];', sres[i]): - pass - elif sres[i][0] == '&': - if sres[i] in entitymap: - sres[i] = entitymap[sres[i]] - elif sres[i] in entitymaprev: - pass - elif sres[i] == '—': # special case as entitymap maps it with spaces - pass - elif sres[i] in ('"', '&', '<', '>'): - pass - elif sres[i] in ('“', '”'): - sres[i] = '"' - else: - raise Exception, sres[i] + ' unknown ent' - sres[i] = 'UNKNOWN-ENTITY' - - elif sres[i] == '"': - sres[i] = '"' - - # junk chars sometimes get in - # NB this only works if the characters are split in the regexp above - elif sres[i] == '\x01': - sres[i] = '' - elif sres[i] == '\x0e': - sres[i] = ' ' - elif sres[i] == '\x14': - sres[i] = ' ' - elif sres[i] == '\x92': - sres[i] = "'" - elif sres[i] == '\xa3': - sres[i] = '£' - elif sres[i] == '\xb0': - sres[i] = '°' - elif sres[i] == '\xab': - sres[i] = 'é' - elif sres[i] == '\xe9': - sres[i] = 'é' - elif sres[i] == '\xc3\xb8': - sres[i] = 'ø' - elif sres[i] == '\xc3\xb1': - sres[i] = 'ñ' - - elif re.match('$(?i)', sres[i]): - sres[i] = '' # 'OPEN-i-TAG-OUT-OF-PLACE' 'CLOSE-i-TAG-OUT-OF-PLACE' - - elif re.match('$', sres[i]): # what is this? wrans 2003-05-13 has one - sres[i] = '' - - # allow brs through - elif re.match('
<br ?/?>$(?i)', sres[i]): - sres[i] = '<br/>
    ' - - # discard garbage that appears in recent today postings - elif re.match('$(?i)', sres[i]): - sres[i] = '' - - elif sres[i][0] == '<' or sres[i][0] == '>': - print "Part:", sres[i][0] - print "All:",sres[i] - print "stex:", stex - print "raising" - raise ContextException('tag %s tag out of place in %s' % (sres[i], stex), stamp=stampurl, fragment=stex) - - return sres - - -def FixHTMLEntitiesL(stex, signore='', stampurl=None): - # will formalize this into the recursion later - if signore: - stex = re.sub(signore, '', stex) - return StraightenHTMLrecurse(stex, stampurl) - -def FixHTMLEntities(stex, signore='', stampurl=None): - res = string.join(FixHTMLEntitiesL(stex, signore, stampurl), '') - try: - res = res.decode('utf-8') - return res.encode("latin-1") - except Exception, e: - print "Encoding problem with:", res - raise ContextException(str(e), stamp=stampurl, fragment=res) - - + qtag = ('' % qisup.group(3), '') + + if qisup: + sres = StraightenHTMLrecurse(stex[:qisup.start(1)], stampurl) + sres.append(qtag[0]) + sres.extend(StraightenHTMLrecurse(qisup.group(4), stampurl)) + sres.append(qtag[1]) + sres.extend(StraightenHTMLrecurse(stex[qisup.end(1):], stampurl)) + return sres + + sres = re.split('(&[a-z0-9]*?;|&#\d+;|"|\xa3|&|\x01|\x0e|\x14|\x92|\xb0|\xab|\xe9|\xc3\xb8|\xc3\xb1|<[^>]*>|<|>)', stex) + for i in range(len(sres)): + #print "sresi ", sres[i], "\n" + #print "-----------------------------------------------\n" + + if not sres[i]: + pass + elif re.match('&#[0-9]+;', sres[i]) and not re.match('[345][0-9];', sres[i]): + pass + elif sres[i][0] == '&': + if sres[i] in entitymap: + sres[i] = entitymap[sres[i]] + elif sres[i] in entitymaprev: + pass + elif sres[i] == '—': # special case as entitymap maps it with spaces + pass + elif sres[i] in ('"', '&', '<', '>'): + pass + elif sres[i] in ('“', '”'): + sres[i] = '"' + else: + raise Exception(sres[i] + ' unknown ent') + sres[i] = 'UNKNOWN-ENTITY' + + elif sres[i] == '"': + sres[i] = '"' + + # junk chars sometimes get in + # NB this only works if the characters are split in the regexp above + elif sres[i] == '\x01': + sres[i] = '' + elif sres[i] == '\x0e': + sres[i] = ' ' + elif sres[i] == '\x14': + sres[i] = ' ' + elif sres[i] == '\x92': + sres[i] = "'" + elif sres[i] == '\xa3': + sres[i] = '£' + elif sres[i] == '\xb0': + sres[i] = '°' + elif sres[i] == '\xab': + sres[i] = 'é' + elif sres[i] == '\xe9': + sres[i] = 'é' + elif sres[i] == '\xc3\xb8': + sres[i] = 'ø' + elif sres[i] == '\xc3\xb1': + sres[i] = 'ñ' + + elif re.match('$(?i)', sres[i]): + sres[i] = '' # 'OPEN-i-TAG-OUT-OF-PLACE' 'CLOSE-i-TAG-OUT-OF-PLACE' + + elif re.match('$', sres[i]): # what is this? wrans 2003-05-13 has one + sres[i] = '' + + # allow brs through + elif re.match('
<br ?/?>$(?i)', sres[i]): + sres[i] = '<br/>
    ' + + # discard garbage that appears in recent today postings + elif re.match('$(?i)', sres[i]): + sres[i] = '' + + elif sres[i][0] == '<' or sres[i][0] == '>': + print("Part:", sres[i][0]) + print("All:",sres[i]) + print("stex:", stex) + print("raising") + raise ContextException('tag %s tag out of place in %s' % (sres[i], stex), stamp=stampurl, fragment=stex) + + return sres # The lookahead assertion (?= bits without end into component parts - for nf in parts: - - # a tiny bit of extra splitting up as output - if retablestart.match(nf) and not retable.match(nf): - newparts.extend(reparts2.split(nf)) - else: - newparts.append(nf) - - # get rid of blank and boring paragraphs - if reparaempty.match(nf): - if pstring and re.search('\S', nf): - print text - print '---' + pstring - print '---' + nf - raise Exception, ' it carried across empty para ' - continue - - # list of space type objects - if reparaspace.match(nf): - spclist.append(nf) - continue - - # sometimes italics are hidden among the paragraph choss - # bring forward onto the next string - if reitalif.match(nf): - if pstring: - print text - print spclist - print pstring - raise Exception, ' double italic in paraspace ' - pstring = '' - continue - - - # we now have a string of a paragraph which we are putting into the list. - - # table type - bthisparaalone = False - if retable.match(nf): - if pstring: - print text - raise Exception, ' non-empty preceding string ' - pstring = nf - bthisparaalone = True - - else: - lnf = re.sub("\s+", " ", nf) - if pstring: - pstring = pstring + " " + string.strip(lnf) - else: - pstring = string.strip(lnf) - - - # check that paragraphs have some text - if re.match('(?:<[^>]*>|\s)*$', pstring): - print "\nspclist:", spclist - print "\npstring:", pstring - print "\nthe text:", text[:100] - print "\nnf:", nf - raise ContextException('no text in paragraph', stamp=stampurl, fragment=pstring) - - # check that paragraph spaces aren't only font text, and have something - # real in them, unless they are breaks because of tables - if not (bprevparaalone or bthisparaalone): - bnonfont = False - for sl in spclist: - if not re.match(']*>(?i)', sl): - bnonfont = True - if not bnonfont: - print "text:", text - print "spclist:", spclist - print "pstring", pstring - print "----------" - print "nf", nf - print "----------" - raise ContextException('font found in middle of paragraph should be a paragraph break or removed', stamp=stampurl, fragment=pstring) - bprevparaalone = bthisparaalone - - - # put the preceding space, then the string into output list - res.append(spclist) - res.append(pstring) - #print "???%s???" % pstring - - spclist = [ ] - pstring = '' - - # findal spaces into the output list - res.append(spclist) - - return res + res = [] + + # used to detect over breaking in spaces + bprevparaalone = True + + # list of space objects, list of string + spclist = [] + pstring = '' + parts = reparts.split(text) + newparts = [] + # split up the start bits without end
    into component parts + for nf in parts: + + # a tiny bit of extra splitting up as output + if retablestart.match(nf) and not retable.match(nf): + newparts.extend(reparts2.split(nf)) + else: + newparts.append(nf) + + # get rid of blank and boring paragraphs + if reparaempty.match(nf): + if pstring and re.search('\S', nf): + print(text) + print('---' + pstring) + print('---' + nf) + raise Exception(' it carried across empty para ') + continue + + # list of space type objects + if reparaspace.match(nf): + spclist.append(nf) + continue + + # sometimes italics are hidden among the paragraph choss + # bring forward onto the next string + if reitalif.match(nf): + if pstring: + print(text) + print(spclist) + print(pstring) + raise Exception(' double italic in paraspace ') + pstring = '' + continue + + + # we now have a string of a paragraph which we are putting into the list. + + # table type + bthisparaalone = False + if retable.match(nf): + if pstring: + print(text) + raise Exception(' non-empty preceding string ') + pstring = nf + bthisparaalone = True + + else: + lnf = re.sub("\s+", " ", nf) + if pstring: + pstring = pstring + " " + lnf.strip() + else: + pstring = lnf.strip() + + + # check that paragraphs have some text + if re.match('(?:<[^>]*>|\s)*$', pstring): + print("\nspclist:", spclist) + print("\npstring:", pstring) + print("\nthe text:", text[:100]) + print("\nnf:", nf) + raise ContextException('no text in paragraph', stamp=stampurl, fragment=pstring) + + # check that paragraph spaces aren't only font text, and have something + # real in them, unless they are breaks because of tables + if not (bprevparaalone or bthisparaalone): + bnonfont = False + for sl in spclist: + if not re.match(']*>(?i)', sl): + bnonfont = True + if not bnonfont: + print("text:", text) + print("spclist:", spclist) + print("pstring", pstring) + print("----------") + print("nf", nf) + print("----------") + raise ContextException('font found in middle of paragraph should be a paragraph break or removed', stamp=stampurl, fragment=pstring) + bprevparaalone = bthisparaalone + + + # put the preceding space, then the string into output list + res.append(spclist) + res.append(pstring) + #print "???%s???" % pstring + + spclist = [ ] + pstring = '' + + # findal spaces into the output list + res.append(spclist) + + return res # Break text into paragraphs and mark the paragraphs according to their
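Note: the hunks above apply the same small set of mechanical Python 2 to 3 conversions throughout. A compact, self-contained sketch of those idioms, using a throwaway demo function rather than any code from this repository:

    import string
    import sys

    def demo(parts, total, count):
        # print is now a function; file= replaces "print >>sys.stderr"
        print('starting', file=sys.stderr)
        try:
            if count == 0:
                # "raise ValueError, 'msg'" becomes a normal constructor call
                raise ValueError('count must be non-zero')
        except ValueError as e:  # "except ValueError, e" becomes "except ValueError as e"
            print('error:', e)
            return None
        mid = total // count                # integer floor division needs // in Python 3
        joined = ''.join(parts)             # string.join(parts, '') becomes a str method
        letters = string.ascii_lowercase    # string.lowercase was removed in Python 3
        ordered = sorted({'b': 1, 'a': 2})  # replaces "k = d.keys(); k.sort()"
        return joined, mid, letters, ordered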