Upgrade python 2 to 3.
dracos committed Mar 14, 2023
1 parent 6355dcf commit daf097f
Showing 75 changed files with 943 additions and 1,053 deletions.
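The hunks below repeat a small set of mechanical Python 2 to 3 conversions. As a rough guide, this is a minimal Python 3 sketch of the recurring replacements, with the Python 2 originals shown in comments; the values are placeholders rather than code from this repository:

# Imports: urlparse and the old urllib entry points move under urllib.
from urllib.parse import urljoin      # Python 2: from urlparse import urljoin
from urllib.request import urlopen    # Python 2: urllib.urlopen(...)

# The string module helpers become builtins / str methods.
n = int('42')                         # Python 2: string.atoi('42')
s = ''.join(['a', 'b', 'c'])          # Python 2: string.join(['a', 'b', 'c'], '')

# print becomes a function; a trailing comma becomes end=' '.
print('progress', n, end=' ')         # Python 2: print 'progress', n,

# except clauses use "as"; str is already unicode, so u'' prefixes go.
try:
    raise TypeError('example')
except TypeError as e:                # Python 2: except TypeError, e:
    print(str(e))
title = 'Orders of the Day'           # Python 2: u'Orders of the Day'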
19 changes: 6 additions & 13 deletions filtersentence_xml.py
@@ -1,16 +1,10 @@
#! /usr/bin/python

from datetime import datetime
import re
import string

from lxml import etree

from contextexception import ContextException
from parlphrases import parlPhrases

from wrans.emblinks import rreglink, rregemail, rehtlink, ConstructHTTPlink

from resolvemembernames import memberList


@@ -81,12 +75,12 @@ def TokenStandingOrder(mstandingo, phrtok):
'phrase', ' class="standing-order" code="%s"' % mstandingo.group(1)
)

rehtlink = re.compile('(?<!["\'])(https?://)([^\s]+)')

def TokenHttpLink(mhttp, phrtok):
qstrlink = ConstructHTTPlink(mhttp.group(1), mhttp.group(2), mhttp.group(3))
qstrlink = mhttp.group(0)
return ('a', ' href="%s"' % qstrlink)


def TokenHrefLink(mhttp, phrtok):
return ('', '')

@@ -118,8 +112,8 @@ def TokenOffRep(qoffrep, phrtok):
qcolnum = qcpart.group(1)
if qcpart.group(2):
qcpartlead = qcpart.group(1)[len(qcpart.group(1)) - len(qcpart.group(2)):]
if string.atoi(qcpartlead) >= string.atoi(qcpart.group(2)):
print ' non-following column leadoff ', qoffrep.group(0)
if int(qcpartlead) >= int(qcpart.group(2)):
print(' non-following column leadoff ', qoffrep.group(0))
# raise Exception, ' non-following column leadoff '

if qcolsuffix == 'WH':
@@ -218,8 +212,7 @@ def TokenHonFriend(mhonfriend, phrtok):
# remove any xml entities from the name
orgname = res[1]

# if you put the .encode("latin-1") on the res[1] it doesn't work when there are strange characters.
return ('phrase', (' class="honfriend" person_id="%s" name="%s"' % (nid, orgname)).encode("latin-1"))
return ('phrase', ' class="honfriend" person_id="%s" name="%s"' % (nid, orgname))


# the array of tokens which we will detect on the way through
@@ -312,4 +305,4 @@ def GetPara(self):
else:
res.append(tok[2])

return string.join(res, '')
return ''.join(res)
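One change in filtersentence_xml.py is more than a syntax swap: TokenHttpLink now uses the whole regex match as the link target, via a locally defined rehtlink pattern, instead of rebuilding the URL with ConstructHTTPlink (whose import from wrans.emblinks is dropped). A small self-contained sketch of the new behaviour; the sample sentence and the None placeholder for phrtok are illustrative only:

import re

# Pattern added in this commit: an http(s) URL not preceded by a quote character.
rehtlink = re.compile('(?<!["\'])(https?://)([^\s]+)')

def TokenHttpLink(mhttp, phrtok):
    # New version: the whole match is the href, with no reassembly from groups.
    qstrlink = mhttp.group(0)
    return ('a', ' href="%s"' % qstrlink)

m = rehtlink.search('See https://api.parliament.uk/historic-hansard for the source')
if m:
    print(TokenHttpLink(m, None))
    # ('a', ' href="https://api.parliament.uk/historic-hansard"')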
10 changes: 5 additions & 5 deletions get-historic-debates
@@ -11,7 +11,7 @@ import requests
import requests_cache
from xml.sax.saxutils import escape, quoteattr
from lxml import html, etree
from urlparse import urljoin
from urllib.parse import urljoin

requests_cache.install_cache(cache_name='debates', allowable_codes=(200, 404))
BASE_SOURCE_URL = 'https://api.parliament.uk'
@@ -183,9 +183,9 @@ def walk(ol, typ, prefix=''):
title = link.text_content()
url = BASE_SOURCE_URL + link.get('href')
if re.match('ORALL? ANS[WN]?ERS? [Tt][Oo] [OQU]UESTIONS?[.,]?$', title):
next_prefix = u'Oral Answers to Questions &#8212; '
next_prefix = 'Oral Answers to Questions &#8212; '
elif re.match('ORDERS OF THE\.? DAY[.,:]?$', title):
next_prefix = u'Orders of the Day &#8212; '
next_prefix = 'Orders of the Day &#8212; '
else:
next_prefix = ''
out += output_xml('%s-heading' % typ, '%s%s' % (prefix, escape(title)), url)
@@ -225,13 +225,13 @@ for year in range(1919, 1935+1):
if "id='commons'" not in res.content:
continue

print '\r\x1b[K%d' % year, month, day,
print('\r\x1b[K%d' % year, month, day, end=' ')
date = '%d-%02d-%02d' % (year, months.index(month)+1, day)
col = 0

tree = html.fromstring(res.content)
ol = tree.cssselect('h3#commons + ol')[0]
out = u'<publicwhip scrapeversion="a" latest="yes">\n'
out = '<publicwhip scrapeversion="a" latest="yes">\n'
out += walk(ol, 'major')
out += '</publicwhip>\n'

6 changes: 3 additions & 3 deletions get-historic-person-urls
@@ -55,7 +55,7 @@ for l in string.ascii_lowercase:
for li in lis:
a = li.find('a')
url = a['href']
print '\r\x1b[K' + url,
print('\r\x1b[K' + url, end=' ')
html = requests.get('https://api.parliament.uk/historic-hansard/people/%s/index.html' % url).text
if html == 'Page not found':
html = requests.get('https://api.parliament.uk/historic-hansard/people/%s' % url).text
@@ -99,10 +99,10 @@ for l in string.ascii_lowercase:
people[name] = [mm for mm in people[name] if mm['id'] != pid]
found = True
if not found:
print '\n', url, presence[pid]['min'], presence[pid]['max'], fr, to
print('\n', url, presence[pid]['min'], presence[pid]['max'], fr, to)
raise Exception
else:
print '\n', matches, url
print('\n', matches, url)
raise Exception

json.dump(data, open(JSON, 'w'), indent=2, sort_keys=True)
18 changes: 9 additions & 9 deletions london-mayors-questions/questions.py
@@ -1,4 +1,4 @@
#! /usr/bin/env python
#! /usr/bin/env python3

import os
import logging
@@ -84,7 +84,7 @@ def writeScraperState(state, output_folder):
with open(output_file, 'w') as state_json_file:
logger.debug('Writing state file')
state_json_file.write(json_string)
except TypeError, e:
except TypeError as e:
logger.error('Could not serialise to valid JSON: {}'.format(str(e)))


@@ -234,14 +234,14 @@ def parseQuestionPage(content):

question_title = main_content.h1.text.strip()

logger.debug(u'Question title is {}'.format(question_title))
logger.debug('Question title is {}'.format(question_title))

# Extract who asked it

asked_by_name = main_content.find('div', class_='field--name-field-asked-by').find('div', class_='field__item').text.strip()
asked_by_person = getSpeakerObjectFromName(asked_by_name)

logger.debug(u'Question asked by {}'.format(asked_by_person['name']))
logger.debug('Question asked by {}'.format(asked_by_person['name']))

# Try to extract the actual question

@@ -326,7 +326,7 @@ def parseAnswersFromQuestionPage(page_content):
answered_by_name = answer_article.find('div', class_='field--name-field-answered-by').find('div', class_='field__item').text.strip()
answered_by_person = getSpeakerObjectFromName(answered_by_name)

logger.debug(u'Question answered by {}'.format(answered_by_person['name']))
logger.debug('Question answered by {}'.format(answered_by_person['name']))

answer_paragraphs = []

@@ -410,12 +410,12 @@ def getPersonIDFromName(name):
def getSpeakerObjectFromName(name):
''' Given a name, try to find a speaker ID and return a whole object. '''

name = name.replace(u'\u00a0', ' ')
name = name.replace('\u00a0', ' ')
name = stripPatternsFromName(name)
id = getPersonIDFromName(name)
if not id:
if 'Liz Peace' not in name:
logger.warning(u'Could not match name {} to any assembly member'.format(name))
logger.warning('Could not match name {} to any assembly member'.format(name))
id = 'unknown'

return {
Expand All @@ -427,7 +427,7 @@ def getSpeakerObjectFromName(name):
def cleanParagraphText(text):

# Remove non-breaking spaces followed by a space.
text = text.replace(u'\u00a0 ', ' ')
text = text.replace('\u00a0 ', ' ')

# Strip trailing whitespace
text = text.strip()
@@ -618,7 +618,7 @@ def loadMembershipsFromFile(members_file):

if name not in person_ids_by_name:
person_ids_by_name[name] = membership['person_id']
logger.debug(u'Added ID map for for {}'.format(name))
logger.debug('Added ID map for for {}'.format(name))
else:
if person_ids_by_name[name] != membership['person_id']:
raise Exception('Multiple people with name {}'.format(name))
8 changes: 4 additions & 4 deletions members/parl-old-check-party.py
@@ -6,7 +6,7 @@
# fixing (or longer term, fix it automatically).

import re
import urllib
import urllib.request
import lxml.objectify
import sys

@@ -86,10 +86,10 @@ def __init__(self, lord):
self.type = TYPES.index('Elected Hereditary') # One of the 92

def __str__(self):
return u'%s (%s) - %s' % ( self.longTitle, PARTIES[self.party], self.status )
return '%s (%s) - %s' % ( self.longTitle, PARTIES[self.party], self.status )

# Fetch the current live information
lords = urllib.urlopen('http://data.parliament.uk/resources/members/api/lords/all/').read()
lords = urllib.request.urlopen('http://data.parliament.uk/resources/members/api/lords/all/').read()
lords = [ Lord(lord) for lord in lxml.objectify.fromstring(lords).peer ]

for lord in lords:
@@ -113,5 +113,5 @@ def __str__(self):
if PARTIES[lord.party] == 'UK Independence Party' and lordsList.lords[match]['party'] == 'UKIP': continue
if PARTIES[lord.party] == 'Plaid Cymru' and lordsList.lords[match]['party'] == 'PC': continue
if PARTIES[lord.party] == 'Plaid Cymru' and lordsList.lords[match]['party'] == 'PC': continue
print PARTIES[lord.party], lordsList.lords[match]['party']
print(PARTIES[lord.party], lordsList.lords[match]['party'])

26 changes: 10 additions & 16 deletions members/wikipedia-commons.py
@@ -1,6 +1,4 @@
#!/usr/bin/env python
# -*- coding: latin-1 -*-
# $Id: bbcconv.py,v 1.4 2005/03/25 23:33:35 theyworkforyou Exp $
#!/usr/bin/env python3

# Screen scrape list of links to Lords on Wikipedia, so we can link to the articles.

@@ -11,8 +9,7 @@

import datetime
import sys
import urllib
import urlparse
import urllib.parse
import re
# import sets

@@ -49,25 +46,22 @@
cons = cons2
name = name2
url = url2
cons = cons.decode('utf-8')
cons = cons.replace('&amp;', '&')
name = name.decode('utf-8')
try:
(id, canonname, canoncons) = memberList.matchfullnamecons(name, cons, date_parl[year])
except Exception, e:
print >>sys.stderr, e
except Exception as e:
print(e, file=sys.stderr)
if not id:
continue
wikimembers[id] = url

print '''<?xml version="1.0" encoding="ISO-8859-1"?>
<publicwhip>'''
k = wikimembers.keys()
k.sort()
print('''<?xml version="1.0" encoding="ISO-8859-1"?>
<publicwhip>''')
k = sorted(wikimembers)
for id in k:
url = urlparse.urljoin(wiki_index_url, wikimembers[id])
print '<personinfo id="%s" wikipedia_url="%s" />' % (id, url)
print '</publicwhip>'
url = urllib.parse.urljoin(wiki_index_url, wikimembers[id])
print('<personinfo id="%s" wikipedia_url="%s" />' % (id, url))
print('</publicwhip>')

#wikimembers = sets.Set(wikimembers.keys())
#print "len: ", len(wikimembers)
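The keys-then-sort idiom above is replaced because it no longer works, not just for style: in Python 3, dict.keys() returns a view object with no .sort() method, so sorted(wikimembers) builds the ordered list of person IDs directly. A tiny illustration with made-up IDs and URLs:

wikimembers = {
    'uk.org.publicwhip/person/10003': '/wiki/Example_C',
    'uk.org.publicwhip/person/10001': '/wiki/Example_A',
}

# Python 2: k = wikimembers.keys(); k.sort()
k = sorted(wikimembers)               # Python 3: sort the keys into a new list
for id in k:
    print(id, wikimembers[id])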
16 changes: 8 additions & 8 deletions members/wikipedia-lords.py
@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3

# Screen scrape list of links to Lords on Wikipedia, so we can link to the articles.

@@ -9,7 +9,7 @@

import datetime
import sys
import urlparse
import urllib.parse
import re

sys.path.append("../pyscraper")
@@ -32,19 +32,19 @@
id = None
try:
id = lordsList.GetLordIDfname(name, None, date_today)
except Exception, e:
except Exception as e:
continue

if not id:
continue
wikimembers[id] = url

print '''<?xml version="1.0" encoding="ISO-8859-1"?>
<publicwhip>'''
print('''<?xml version="1.0" encoding="ISO-8859-1"?>
<publicwhip>''')
for id, url in sorted(wikimembers.items()):
url = urlparse.urljoin(wiki_index_url, url)
print '<personinfo id="%s" wikipedia_url="%s" />' % (id, url)
print '</publicwhip>'
url = urllib.parse.urljoin(wiki_index_url, url)
print('<personinfo id="%s" wikipedia_url="%s" />' % (id, url))
print('</publicwhip>')

#print "len: ", len(wikimembers)

12 changes: 5 additions & 7 deletions members/wikipedia-standingdown.py
@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3

# Screen scrape list of who's standing down in the 2010 general election

@@ -7,9 +7,9 @@
# certain conditions. However, it comes with ABSOLUTELY NO WARRANTY.
# For details see the file LICENSE.html in the top level of the source.

import datetime
import sys
import urlparse
import re

sys.path.append("../pyscraper")
@@ -19,15 +17,15 @@

page = open('../rawdata/MPs_standing_down_in_2010').read()

print '''<?xml version="1.0" encoding="ISO-8859-1"?>
<publicwhip>'''
print('''<?xml version="1.0" encoding="ISO-8859-1"?>
<publicwhip>''')
m = re.findall('<li><a href="([^"]*)"[^>]*>([^<]*)</a>', page)
for row in m:
url, name = row
name = name.decode('utf-8')
if name in ('Iris Robinson', 'Ashok Kumar', 'David Taylor'): continue
id, canonname, canoncons = memberList.matchfullnamecons(name, None, today)
pid = memberList.membertoperson(id)
print (' <personinfo id="%s" name="%s" standing_down="1" />' % (pid, name)).encode('iso-8859-1')
print '</publicwhip>'
print((' <personinfo id="%s" name="%s" standing_down="1" />' % (pid, name)).encode('iso-8859-1'))
print('</publicwhip>')
