Upgrade python 2 to 3.
dracos committed Mar 14, 2023
1 parent 6355dcf commit daf097f
Showing 75 changed files with 943 additions and 1,053 deletions.
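The hunks below repeat a small set of mechanical Python 2 to 3 conversions. As a rough guide, this is a minimal Python 3 sketch of the recurring replacements, with the Python 2 originals shown in comments; the values are placeholders rather than code from this repository:

# Imports: urlparse and the old urllib entry points move under urllib.
from urllib.parse import urljoin      # Python 2: from urlparse import urljoin
from urllib.request import urlopen    # Python 2: urllib.urlopen(...)

# The string module helpers become builtins / str methods.
n = int('42')                         # Python 2: string.atoi('42')
s = ''.join(['a', 'b', 'c'])          # Python 2: string.join(['a', 'b', 'c'], '')

# print becomes a function; a trailing comma becomes end=' '.
print('progress', n, end=' ')         # Python 2: print 'progress', n,

# except clauses use "as"; str is already unicode, so u'' prefixes go.
try:
    raise TypeError('example')
except TypeError as e:                # Python 2: except TypeError, e:
    print(str(e))
title = 'Orders of the Day'           # Python 2: u'Orders of the Day'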
19 changes: 6 additions & 13 deletions filtersentence_xml.py
@@ -1,16 +1,10 @@
#! /usr/bin/python

from datetime import datetime
import re
import string

from lxml import etree

from contextexception import ContextException
from parlphrases import parlPhrases

from wrans.emblinks import rreglink, rregemail, rehtlink, ConstructHTTPlink

from resolvemembernames import memberList


@@ -81,12 +75,12 @@ def TokenStandingOrder(mstandingo, phrtok):
'phrase', ' class="standing-order" code="%s"' % mstandingo.group(1)
)

rehtlink = re.compile('(?<!["\'])(https?://)([^\s]+)')

def TokenHttpLink(mhttp, phrtok):
qstrlink = ConstructHTTPlink(mhttp.group(1), mhttp.group(2), mhttp.group(3))
qstrlink = mhttp.group(0)
return ('a', ' href="%s"' % qstrlink)


def TokenHrefLink(mhttp, phrtok):
return ('', '')

@@ -118,8 +112,8 @@ def TokenOffRep(qoffrep, phrtok):
qcolnum = qcpart.group(1)
if qcpart.group(2):
qcpartlead = qcpart.group(1)[len(qcpart.group(1)) - len(qcpart.group(2)):]
if string.atoi(qcpartlead) >= string.atoi(qcpart.group(2)):
print ' non-following column leadoff ', qoffrep.group(0)
if int(qcpartlead) >= int(qcpart.group(2)):
print(' non-following column leadoff ', qoffrep.group(0))
# raise Exception, ' non-following column leadoff '

if qcolsuffix == 'WH':
@@ -218,8 +212,7 @@ def TokenHonFriend(mhonfriend, phrtok):
# remove any xml entities from the name
orgname = res[1]

# if you put the .encode("latin-1") on the res[1] it doesn't work when there are strange characters.
return ('phrase', (' class="honfriend" person_id="%s" name="%s"' % (nid, orgname)).encode("latin-1"))
return ('phrase', ' class="honfriend" person_id="%s" name="%s"' % (nid, orgname))


# the array of tokens which we will detect on the way through
@@ -312,4 +305,4 @@ def GetPara(self):
else:
res.append(tok[2])

return string.join(res, '')
return ''.join(res)
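One change in filtersentence_xml.py is more than a syntax swap: TokenHttpLink now uses the whole regex match as the link target, via a locally defined rehtlink pattern, instead of rebuilding the URL with ConstructHTTPlink (whose import from wrans.emblinks is dropped). A small self-contained sketch of the new behaviour; the sample sentence and the None placeholder for phrtok are illustrative only:

import re

# Pattern added in this commit: an http(s) URL not preceded by a quote character.
rehtlink = re.compile('(?<!["\'])(https?://)([^\s]+)')

def TokenHttpLink(mhttp, phrtok):
    # New version: the whole match is the href, with no reassembly from groups.
    qstrlink = mhttp.group(0)
    return ('a', ' href="%s"' % qstrlink)

m = rehtlink.search('See https://api.parliament.uk/historic-hansard for the source')
if m:
    print(TokenHttpLink(m, None))
    # ('a', ' href="https://api.parliament.uk/historic-hansard"')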
10 changes: 5 additions & 5 deletions get-historic-debates
@@ -11,7 +11,7 @@ import requests
import requests_cache
from xml.sax.saxutils import escape, quoteattr
from lxml import html, etree
from urlparse import urljoin
from urllib.parse import urljoin

requests_cache.install_cache(cache_name='debates', allowable_codes=(200, 404))
BASE_SOURCE_URL = 'https://api.parliament.uk'
@@ -183,9 +183,9 @@ def walk(ol, typ, prefix=''):
title = link.text_content()
url = BASE_SOURCE_URL + link.get('href')
if re.match('ORALL? ANS[WN]?ERS? [Tt][Oo] [OQU]UESTIONS?[.,]?$', title):
next_prefix = u'Oral Answers to Questions &#8212; '
next_prefix = 'Oral Answers to Questions &#8212; '
elif re.match('ORDERS OF THE\.? DAY[.,:]?$', title):
next_prefix = u'Orders of the Day &#8212; '
next_prefix = 'Orders of the Day &#8212; '
else:
next_prefix = ''
out += output_xml('%s-heading' % typ, '%s%s' % (prefix, escape(title)), url)
@@ -225,13 +225,13 @@ for year in range(1919, 1935+1):
if "id='commons'" not in res.content:
continue

print '\r\x1b[K%d' % year, month, day,
print('\r\x1b[K%d' % year, month, day, end=' ')
date = '%d-%02d-%02d' % (year, months.index(month)+1, day)
col = 0

tree = html.fromstring(res.content)
ol = tree.cssselect('h3#commons + ol')[0]
out = u'<publicwhip scrapeversion="a" latest="yes">\n'
out = '<publicwhip scrapeversion="a" latest="yes">\n'
out += walk(ol, 'major')
out += '</publicwhip>\n'

6 changes: 3 additions & 3 deletions get-historic-person-urls
@@ -55,7 +55,7 @@ for l in string.ascii_lowercase:
for li in lis:
a = li.find('a')
url = a['href']
print '\r\x1b[K' + url,
print('\r\x1b[K' + url, end=' ')
html = requests.get('https://api.parliament.uk/historic-hansard/people/%s/index.html' % url).text
if html == 'Page not found':
html = requests.get('https://api.parliament.uk/historic-hansard/people/%s' % url).text
@@ -99,10 +99,10 @@ for l in string.ascii_lowercase:
people[name] = [mm for mm in people[name] if mm['id'] != pid]
found = True
if not found:
print '\n', url, presence[pid]['min'], presence[pid]['max'], fr, to
print('\n', url, presence[pid]['min'], presence[pid]['max'], fr, to)
raise Exception
else:
print '\n', matches, url
print('\n', matches, url)
raise Exception

json.dump(data, open(JSON, 'w'), indent=2, sort_keys=True)
18 changes: 9 additions & 9 deletions london-mayors-questions/questions.py
@@ -1,4 +1,4 @@
#! /usr/bin/env python
#! /usr/bin/env python3

import os
import logging
@@ -84,7 +84,7 @@ def writeScraperState(state, output_folder):
with open(output_file, 'w') as state_json_file:
logger.debug('Writing state file')
state_json_file.write(json_string)
except TypeError, e:
except TypeError as e:
logger.error('Could not serialise to valid JSON: {}'.format(str(e)))


@@ -234,14 +234,14 @@ def parseQuestionPage(content):

question_title = main_content.h1.text.strip()

logger.debug(u'Question title is {}'.format(question_title))
logger.debug('Question title is {}'.format(question_title))

# Extract who asked it

asked_by_name = main_content.find('div', class_='field--name-field-asked-by').find('div', class_='field__item').text.strip()
asked_by_person = getSpeakerObjectFromName(asked_by_name)

logger.debug(u'Question asked by {}'.format(asked_by_person['name']))
logger.debug('Question asked by {}'.format(asked_by_person['name']))

# Try to extract the actual question

@@ -326,7 +326,7 @@ def parseAnswersFromQuestionPage(page_content):
answered_by_name = answer_article.find('div', class_='field--name-field-answered-by').find('div', class_='field__item').text.strip()
answered_by_person = getSpeakerObjectFromName(answered_by_name)

logger.debug(u'Question answered by {}'.format(answered_by_person['name']))
logger.debug('Question answered by {}'.format(answered_by_person['name']))

answer_paragraphs = []

@@ -410,12 +410,12 @@ def getPersonIDFromName(name):
def getSpeakerObjectFromName(name):
''' Given a name, try to find a speaker ID and return a whole object. '''

name = name.replace(u'\u00a0', ' ')
name = name.replace('\u00a0', ' ')
name = stripPatternsFromName(name)
id = getPersonIDFromName(name)
if not id:
if 'Liz Peace' not in name:
logger.warning(u'Could not match name {} to any assembly member'.format(name))
logger.warning('Could not match name {} to any assembly member'.format(name))
id = 'unknown'

return {
Expand All @@ -427,7 +427,7 @@ def getSpeakerObjectFromName(name):
def cleanParagraphText(text):

# Remove non-breaking spaces followed by a space.
text = text.replace(u'\u00a0 ', ' ')
text = text.replace('\u00a0 ', ' ')

# Strip trailing whitespace
text = text.strip()
@@ -618,7 +618,7 @@ def loadMembershipsFromFile(members_file):

if name not in person_ids_by_name:
person_ids_by_name[name] = membership['person_id']
logger.debug(u'Added ID map for for {}'.format(name))
logger.debug('Added ID map for for {}'.format(name))
else:
if person_ids_by_name[name] != membership['person_id']:
raise Exception('Multiple people with name {}'.format(name))
8 changes: 4 additions & 4 deletions members/parl-old-check-party.py
@@ -6,7 +6,7 @@
# fixing (or longer term, fix it automatically).

import re
import urllib
import urllib.request
import lxml.objectify
import sys

@@ -86,10 +86,10 @@ def __init__(self, lord):
self.type = TYPES.index('Elected Hereditary') # One of the 92

def __str__(self):
return u'%s (%s) - %s' % ( self.longTitle, PARTIES[self.party], self.status )
return '%s (%s) - %s' % ( self.longTitle, PARTIES[self.party], self.status )

# Fetch the current live information
lords = urllib.urlopen('http://data.parliament.uk/resources/members/api/lords/all/').read()
lords = urllib.request.urlopen('http://data.parliament.uk/resources/members/api/lords/all/').read()
lords = [ Lord(lord) for lord in lxml.objectify.fromstring(lords).peer ]

for lord in lords:
@@ -113,5 +113,5 @@ def __str__(self):
if PARTIES[lord.party] == 'UK Independence Party' and lordsList.lords[match]['party'] == 'UKIP': continue
if PARTIES[lord.party] == 'Plaid Cymru' and lordsList.lords[match]['party'] == 'PC': continue
if PARTIES[lord.party] == 'Plaid Cymru' and lordsList.lords[match]['party'] == 'PC': continue
print PARTIES[lord.party], lordsList.lords[match]['party']
print(PARTIES[lord.party], lordsList.lords[match]['party'])

26 changes: 10 additions & 16 deletions members/wikipedia-commons.py
@@ -1,6 +1,4 @@
#!/usr/bin/env python
# -*- coding: latin-1 -*-
# $Id: bbcconv.py,v 1.4 2005/03/25 23:33:35 theyworkforyou Exp $
#!/usr/bin/env python3

# Screen scrape list of links to Lords on Wikipedia, so we can link to the articles.

@@ -11,8 +9,7 @@

import datetime
import sys
import urllib
import urlparse
import urllib.parse
import re
# import sets

@@ -49,25 +46,22 @@
cons = cons2
name = name2
url = url2
cons = cons.decode('utf-8')
cons = cons.replace('&amp;', '&')
name = name.decode('utf-8')
try:
(id, canonname, canoncons) = memberList.matchfullnamecons(name, cons, date_parl[year])
except Exception, e:
print >>sys.stderr, e
except Exception as e:
print(e, file=sys.stderr)
if not id:
continue
wikimembers[id] = url

print '''<?xml version="1.0" encoding="ISO-8859-1"?>
<publicwhip>'''
k = wikimembers.keys()
k.sort()
print('''<?xml version="1.0" encoding="ISO-8859-1"?>
<publicwhip>''')
k = sorted(wikimembers)
for id in k:
url = urlparse.urljoin(wiki_index_url, wikimembers[id])
print '<personinfo id="%s" wikipedia_url="%s" />' % (id, url)
print '</publicwhip>'
url = urllib.parse.urljoin(wiki_index_url, wikimembers[id])
print('<personinfo id="%s" wikipedia_url="%s" />' % (id, url))
print('</publicwhip>')

#wikimembers = sets.Set(wikimembers.keys())
#print "len: ", len(wikimembers)
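The keys-then-sort idiom above is replaced because it no longer works, not just for style: in Python 3, dict.keys() returns a view object with no .sort() method, so sorted(wikimembers) builds the ordered list of person IDs directly. A tiny illustration with made-up IDs and URLs:

wikimembers = {
    'uk.org.publicwhip/person/10003': '/wiki/Example_C',
    'uk.org.publicwhip/person/10001': '/wiki/Example_A',
}

# Python 2: k = wikimembers.keys(); k.sort()
k = sorted(wikimembers)               # Python 3: sort the keys into a new list
for id in k:
    print(id, wikimembers[id])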
16 changes: 8 additions & 8 deletions members/wikipedia-lords.py
@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3

# Screen scrape list of links to Lords on Wikipedia, so we can link to the articles.

@@ -9,7 +9,7 @@

import datetime
import sys
import urlparse
import urllib.parse
import re

sys.path.append("../pyscraper")
@@ -32,19 +32,19 @@
id = None
try:
id = lordsList.GetLordIDfname(name, None, date_today)
except Exception, e:
except Exception as e:
continue

if not id:
continue
wikimembers[id] = url

print '''<?xml version="1.0" encoding="ISO-8859-1"?>
<publicwhip>'''
print('''<?xml version="1.0" encoding="ISO-8859-1"?>
<publicwhip>''')
for id, url in sorted(wikimembers.items()):
url = urlparse.urljoin(wiki_index_url, url)
print '<personinfo id="%s" wikipedia_url="%s" />' % (id, url)
print '</publicwhip>'
url = urllib.parse.urljoin(wiki_index_url, url)
print('<personinfo id="%s" wikipedia_url="%s" />' % (id, url))
print('</publicwhip>')

#print "len: ", len(wikimembers)

12 changes: 5 additions & 7 deletions members/wikipedia-standingdown.py
@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3

# Screen scrape list of who's standing down in the 2010 general election

@@ -7,9 +7,9 @@
# certain conditions. However, it comes with ABSOLUTELY NO WARRANTY.
# For details see the file LICENSE.html in the top level of the source.

import datetime
import sys
import urlparse
import re

sys.path.append("../pyscraper")
@@ -19,15 +17,15 @@

page = open('../rawdata/MPs_standing_down_in_2010').read()

print '''<?xml version="1.0" encoding="ISO-8859-1"?>
<publicwhip>'''
print('''<?xml version="1.0" encoding="ISO-8859-1"?>
<publicwhip>''')
m = re.findall('<li><a href="([^"]*)"[^>]*>([^<]*)</a>', page)
for row in m:
url, name = row
name = name.decode('utf-8')
if name in ('Iris Robinson', 'Ashok Kumar', 'David Taylor'): continue
id, canonname, canoncons = memberList.matchfullnamecons(name, None, today)
pid = memberList.membertoperson(id)
print (' <personinfo id="%s" name="%s" standing_down="1" />' % (pid, name)).encode('iso-8859-1')
print '</publicwhip>'
print((' <personinfo id="%s" name="%s" standing_down="1" />' % (pid, name)).encode('iso-8859-1'))
print('</publicwhip>')
