From d5e1775a4632f13d54db0f2dbb8be4adf427ddfd Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 17 Mar 2015 16:26:21 -0700 Subject: [PATCH 01/32] setup.py: make 'python setup.py test' run py.test with coverage --- setup.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/setup.py b/setup.py index 03df6fa..e7ccf09 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,19 @@ from setuptools import setup +from setuptools.command.test import test as TestCommand + +class PyTest(TestCommand): + def finalize_options(self): + TestCommand.finalize_options(self) + self.test_suite = True + + def run_tests(self): + import pytest + import sys + cmdline = ' -v --doctest-module --cov surt surt/' + errcode = pytest.main(cmdline) + sys.exit(errcode) + + setup(name='surt', version='0.2', author='rajbot', @@ -15,4 +30,8 @@ provides=[ 'surt' ], packages=[ 'surt' ], scripts=[], + # Tests + tests_require=[ 'pytest' ], + test_suite='', + cmdclass={'test': PyTest}, ) From e00716f7877e1290cb19e800bd4bd5f34de3d039 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 17 Mar 2015 16:42:38 -0700 Subject: [PATCH 02/32] add support for options to surt() command add 'trailing_comma' option, which will yield a surt with trailing comma restored: com,example,)/ instead of (current default) com,example)/ --- surt/handyurl.py | 8 +++++++- surt/surt.py | 29 +++++++++++++++++++++++------ 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/surt/handyurl.py b/surt/handyurl.py index 06f355f..449c228 100755 --- a/surt/handyurl.py +++ b/surt/handyurl.py @@ -200,7 +200,11 @@ def geturl(self): # getURLString() #___________________________________________________________________________ - def getURLString(self, surt=False, public_suffix=False): + def getURLString(self, + surt=False, + public_suffix=False, + trailing_comma=False, + **options): if None != self.opaque: return self.opaque @@ -229,6 +233,8 @@ def getURLString(self, surt=False, public_suffix=False): s += ":%d" % 
self.port if surt: + if trailing_comma: + s += ',' s += ')' hasPath = (None != self.path) and (len(self.path) > 0) diff --git a/surt/surt.py b/surt/surt.py index 99c289c..8b47e61 100755 --- a/surt/surt.py +++ b/surt/surt.py @@ -48,7 +48,7 @@ def _normalize(canonicalizer): # surt() #_______________________________________________________________________________ -def surt(url, canonicalizer=None): +def surt(url, canonicalizer=None, **options): """ These doctests are from WaybackURLKeyMakerTest.java @@ -82,6 +82,16 @@ def surt(url, canonicalizer=None): >>> surt("http://archive.org/goo/?a=2&b&a=1") 'org,archive)/goo?a=1&a=2&b' + # trailing comma mode + >>> surt("http://archive.org/goo/?a=2&b&a=1", trailing_comma=True) + 'org,archive,)/goo?a=1&a=2&b' + + >>> surt("dns:archive.org", trailing_comma=True) + 'org,archive,)' + + >>> surt("warcinfo:foo.warc.gz", trailing_comma=True) + 'warcinfo:foo.warc.gz' + PHP session id: >>> surt("http://archive.org/index.php?PHPSESSID=0123456789abcdefghijklemopqrstuv&action=profile;u=4221") 'org,archive)/index.php?action=profile;u=4221' @@ -95,7 +105,7 @@ def surt(url, canonicalizer=None): 'com,yahoo,webhosting,visit)/visit.gif?&b=netscape%205.0%20(windows;%20en-us)&c=24&j=true&o=win32&r=http://web.archive.org/web/20090517140029/http://anthonystewarthead.electric-chi.com/&s=1366x768&v=1.2' Simple customization: - >>> surt("http://www.example.com/", canonicalizer=lambda x: x) + >>> surt("http://www.example.com/", canonicalizer=lambda x, **opts: x) 'com,example,www)/' """ @@ -109,7 +119,11 @@ def surt(url, canonicalizer=None): return url if url.startswith("dns:"): - return hostToSURT(url[4:]) + ')' + res = hostToSURT(url[4:]) + if options.get('trailing_comma'): + res += ',' + res += ')' + return res if url.startswith("whois://"): return url @@ -122,9 +136,12 @@ def surt(url, canonicalizer=None): elif (not hasattr(canonicalizer, '__call__') and hasattr(canonicalizer, 'canonicalize')): canonicalizer = canonicalizer.canonicalize - - hurl = 
canonicalizer(handyurl.parse(url)) - key = hurl.getURLString(surt=True) + + if 'surt' not in options: + options['surt'] = True + + hurl = canonicalizer(handyurl.parse(url), **options) + key = hurl.getURLString(**options) parenIdx = key.find('(') if -1 == parenIdx: From df00f11e5c559eeb76d96326501a6d55f034f8b6 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 10 Apr 2015 19:42:50 -0700 Subject: [PATCH 03/32] make surt py3 compatible --- setup.py | 1 + surt/DefaultIAURLCanonicalizer.py | 11 ++++++++--- surt/GoogleURLCanonicalizer.py | 23 ++++++++++++++--------- surt/IAURLCanonicalizer.py | 9 +++++++-- surt/__init__.py | 9 +++++++-- surt/handyurl.py | 13 +++++++++---- surt/surt.py | 13 ++++++++++--- 7 files changed, 56 insertions(+), 23 deletions(-) diff --git a/setup.py b/setup.py index e7ccf09..ca688ef 100644 --- a/setup.py +++ b/setup.py @@ -25,6 +25,7 @@ def run_tests(self): long_description=open('README.md').read(), url='https://github.com/rajbot/surt', install_requires=[ + 'six', 'tldextract', ], provides=[ 'surt' ], diff --git a/surt/DefaultIAURLCanonicalizer.py b/surt/DefaultIAURLCanonicalizer.py index a50c991..e352f92 100755 --- a/surt/DefaultIAURLCanonicalizer.py +++ b/surt/DefaultIAURLCanonicalizer.py @@ -26,8 +26,13 @@ http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/test/java/org/archive/url/DefaultIAURLCanonicalizerTest.java?view=markup """ -import GoogleURLCanonicalizer -import IAURLCanonicalizer +try: #pragma: no cover + import GoogleURLCanonicalizer + import IAURLCanonicalizer + +except ImportError: #pragma: no cover + import surt.GoogleURLCanonicalizer as GoogleURLCanonicalizer + import surt.IAURLCanonicalizer as IAURLCanonicalizer # canonicalize() #_______________________________________________________________________________ @@ -36,7 +41,7 @@ def canonicalize(url, **options): These doctests are from DefaultIAURLCanonicalizerTest.java: - >>> from handyurl import handyurl + >>> 
from .handyurl import handyurl >>> canonicalize(handyurl.parse("http://www.alexa.com/")).getURLString() 'http://alexa.com/' >>> canonicalize(handyurl.parse("http://archive.org/index.html")).getURLString() diff --git a/surt/GoogleURLCanonicalizer.py b/surt/GoogleURLCanonicalizer.py index 7c7be91..02877ef 100755 --- a/surt/GoogleURLCanonicalizer.py +++ b/surt/GoogleURLCanonicalizer.py @@ -31,9 +31,14 @@ import struct import socket import encodings.idna -from handyurl import handyurl -from urllib import quote, unquote +import six +try: #pragma: no cover + from handyurl import handyurl +except ImportError: #pragma: no cover + from surt.handyurl import handyurl + +from six.moves.urllib.parse import quote, unquote # unescapeRepeatedly() #_______________________________________________________________________________ @@ -93,14 +98,14 @@ def canonicalize(url, **_ignored): #'http://%01%80.com/' for this case. If idna/punycode encoding of a hostname #is not possible, the python version encodes unicode domains as utf-8 before #percent encoding, so we get 'http://%01%C2%80.com/' - >>> print canonicalize(handyurl.parse(u"http://\u0001\u0080.com/")).getURLString() + >>> print(canonicalize(handyurl.parse(u"http://\u0001\u0080.com/")).getURLString()) http://%01%C2%80.com/ #Add these unicode tests: - >>> print canonicalize(handyurl.parse(u'B\xfccher.ch:8080')).getURLString() + >>> print(canonicalize(handyurl.parse(u'B\xfccher.ch:8080')).getURLString()) http://xn--bcher-kva.ch:8080/ - >>> url = '☃.com'.decode('utf-8') #doctest has trouble with utf-8 encoding - >>> print canonicalize(handyurl.parse(url)).getURLString() + >>> url = '☃.com' #doctest has trouble with utf-8 encoding + >>> print(canonicalize(handyurl.parse(url)).getURLString()) http://xn--n3h.com/ #Add these percent-encoded unicode tests @@ -140,7 +145,7 @@ def canonicalize(url, **_ignored): # if the host was an ascii string of percent-encoded bytes that represent # non-ascii unicode chars, then promote hostE from str to 
unicode. # e.g. "http://www.t%EF%BF%BD%04.82.net/", which contains the unicode replacement char - if isinstance(hostE, str): + if isinstance(hostE, six.binary_type): try: hostE.decode('ascii') except UnicodeDecodeError: @@ -155,7 +160,7 @@ def canonicalize(url, **_ignored): # be encodeced separately, and doesn't work correctly with # full hostnames. So use 'idna' encoding instead. #host = encodings.idna.ToASCII(hostE) - host = hostE.encode('idna') + host = hostE.encode('idna').decode('utf-8') except ValueError: host = hostE @@ -284,7 +289,7 @@ def escapeOnce(input): # percent encoding, since different encodings of the same unicode # characters will result in different surts. # We will use utf-8 for consistency. - if isinstance(input, unicode): + if isinstance(input, six.text_type): input = input.encode('utf-8') return quote(input, """!"$&'()*+,-./:;<=>?@[\]^_`{|}~""") else: diff --git a/surt/IAURLCanonicalizer.py b/surt/IAURLCanonicalizer.py index 2bf7746..aee2d36 100755 --- a/surt/IAURLCanonicalizer.py +++ b/surt/IAURLCanonicalizer.py @@ -27,8 +27,13 @@ """ import re -from handyurl import handyurl -from URLRegexTransformer import stripPathSessionID, stripQuerySessionID + +try: #pragma: no cover + from handyurl import handyurl + from URLRegexTransformer import stripPathSessionID, stripQuerySessionID +except ImportError: #pragma: no cover + from surt.handyurl import handyurl + from surt.URLRegexTransformer import stripPathSessionID, stripQuerySessionID # canonicalize() #_______________________________________________________________________________ diff --git a/surt/__init__.py b/surt/__init__.py index f0bfc83..f67faa4 100644 --- a/surt/__init__.py +++ b/surt/__init__.py @@ -25,8 +25,13 @@ http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/ """ -from .handyurl import handyurl -from .surt import surt +try: #pragma: no cover + from handyurl import handyurl + from surt import surt 
+except ImportError: #pragma: no cover + from surt.handyurl import handyurl + from surt.surt import surt + __all__= [ 'handyurl', diff --git a/surt/handyurl.py b/surt/handyurl.py index 449c228..260518d 100755 --- a/surt/handyurl.py +++ b/surt/handyurl.py @@ -22,8 +22,13 @@ import re import tldextract -from urlparse import urlsplit -from URLRegexTransformer import hostToSURT + +from six.moves.urllib.parse import urlsplit + +try: #pragma: no cover + from URLRegexTransformer import hostToSURT +except ImportError: #pragma: no cover + from surt.URLRegexTransformer import hostToSURT class handyurl(object): """A python port of the archive-commons org.archive.url HandyURL class @@ -96,10 +101,10 @@ def parse(cls, url): >>> handyurl.parse("http://www.archive.org:8080?#foo").geturl() 'http://www.archive.org:8080/#foo' - >>> print handyurl.parse(u"http://bücher.ch:8080?#foo").geturl() + >>> print(handyurl.parse(u"http://bücher.ch:8080?#foo").geturl()) http://b\xfccher.ch:8080/#foo - >>> print handyurl.parse(u"dns:bücher.ch").geturl() + >>> print(handyurl.parse(u"dns:bücher.ch").geturl()) dns:b\xfccher.ch ###From Tymm: diff --git a/surt/surt.py b/surt/surt.py index 8b47e61..ba6f0b8 100755 --- a/surt/surt.py +++ b/surt/surt.py @@ -24,9 +24,16 @@ http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/WaybackURLKeyMaker.java?view=markup """ -from handyurl import handyurl -from URLRegexTransformer import hostToSURT -import DefaultIAURLCanonicalizer +try: #pragma: no cover + from handyurl import handyurl + from URLRegexTransformer import hostToSURT + + import DefaultIAURLCanonicalizer +except ImportError: #pragma: no cover + from surt.handyurl import handyurl + from surt.URLRegexTransformer import hostToSURT + + import surt.DefaultIAURLCanonicalizer as DefaultIAURLCanonicalizer class CompositeCanonicalizer(object): def __init__(self, canonicalizers): From e1e3878dd9e9b25d7495b7a8487dbb4573e9c967 
Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 11 Apr 2015 11:45:02 -0700 Subject: [PATCH 04/32] use from __future__ import absolute_import add tox.ini and travis-ci support --- .travis.yml | 32 ++++++++++++++++++++++++------- surt/DefaultIAURLCanonicalizer.py | 13 +++++-------- surt/GoogleURLCanonicalizer.py | 7 +++---- surt/IAURLCanonicalizer.py | 10 ++++------ surt/__init__.py | 10 ++++------ surt/handyurl.py | 7 +++---- surt/surt.py | 12 ++++-------- tox.ini | 13 +++++++++++++ 8 files changed, 61 insertions(+), 43 deletions(-) create mode 100644 tox.ini diff --git a/.travis.yml b/.travis.yml index 8db6574..d6fb14a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,8 +1,26 @@ +# vim: set sw=4 et: +# +# tox approach stolen from +# https://github.com/pypa/pip/blob/abdb597dbfb51b21cc76c1cff068b72c80f3a77d/.travis.yml +# + language: python -python: - - "2.6" - - "2.7" -install: pip install -r requirements.txt -script: - - py.test --doctest-modules -v surt/ - - pylint --disable=all --enable=W0312 --reports=n surt/ + +env: + - TOXENV=py26 + - TOXENV=py27 + - TOXENV=py32 + - TOXENV=py33 + - TOXENV=py34 + +before_install: + - sudo apt-get update + - pip install coveralls --use-mirrors + +before_script: + - pip install tox + +script: tox + +#after_success: + #coveralls diff --git a/surt/DefaultIAURLCanonicalizer.py b/surt/DefaultIAURLCanonicalizer.py index e352f92..11ae90e 100755 --- a/surt/DefaultIAURLCanonicalizer.py +++ b/surt/DefaultIAURLCanonicalizer.py @@ -25,14 +25,11 @@ The doctests are copied from DefaultIAURLCanonicalizerTest.java: http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/test/java/org/archive/url/DefaultIAURLCanonicalizerTest.java?view=markup """ +from __future__ import absolute_import -try: #pragma: no cover - import GoogleURLCanonicalizer - import IAURLCanonicalizer +import surt.GoogleURLCanonicalizer +import surt.IAURLCanonicalizer -except ImportError: #pragma: no cover - import 
surt.GoogleURLCanonicalizer as GoogleURLCanonicalizer - import surt.IAURLCanonicalizer as IAURLCanonicalizer # canonicalize() #_______________________________________________________________________________ @@ -58,8 +55,8 @@ def canonicalize(url, **options): 'http://archive.org/index.html?a=b&b=a&b=b' """ - url = GoogleURLCanonicalizer.canonicalize(url, **options) - url = IAURLCanonicalizer.canonicalize(url, **options) + url = surt.GoogleURLCanonicalizer.canonicalize(url, **options) + url = surt.IAURLCanonicalizer.canonicalize(url, **options) return url diff --git a/surt/GoogleURLCanonicalizer.py b/surt/GoogleURLCanonicalizer.py index 02877ef..51b9c07 100755 --- a/surt/GoogleURLCanonicalizer.py +++ b/surt/GoogleURLCanonicalizer.py @@ -27,16 +27,15 @@ http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/test/java/org/archive/url/GoogleURLCanonicalizerTest.java?view=markup """ +from __future__ import absolute_import + import re import struct import socket import encodings.idna import six -try: #pragma: no cover - from handyurl import handyurl -except ImportError: #pragma: no cover - from surt.handyurl import handyurl +from surt.handyurl import handyurl from six.moves.urllib.parse import quote, unquote diff --git a/surt/IAURLCanonicalizer.py b/surt/IAURLCanonicalizer.py index aee2d36..bbfe0a2 100755 --- a/surt/IAURLCanonicalizer.py +++ b/surt/IAURLCanonicalizer.py @@ -26,14 +26,12 @@ http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/test/java/org/archive/url/IAURLCanonicalizerTest.java?view=markup """ +from __future__ import absolute_import + import re -try: #pragma: no cover - from handyurl import handyurl - from URLRegexTransformer import stripPathSessionID, stripQuerySessionID -except ImportError: #pragma: no cover - from surt.handyurl import handyurl - from surt.URLRegexTransformer import stripPathSessionID, stripQuerySessionID +from 
surt.handyurl import handyurl +from surt.URLRegexTransformer import stripPathSessionID, stripQuerySessionID # canonicalize() #_______________________________________________________________________________ diff --git a/surt/__init__.py b/surt/__init__.py index f67faa4..1af4c40 100644 --- a/surt/__init__.py +++ b/surt/__init__.py @@ -25,12 +25,10 @@ http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/ """ -try: #pragma: no cover - from handyurl import handyurl - from surt import surt -except ImportError: #pragma: no cover - from surt.handyurl import handyurl - from surt.surt import surt +from __future__ import absolute_import + +from surt.handyurl import handyurl +from surt.surt import surt __all__= [ diff --git a/surt/handyurl.py b/surt/handyurl.py index 260518d..f6ae5bc 100755 --- a/surt/handyurl.py +++ b/surt/handyurl.py @@ -20,15 +20,14 @@ # # The surt source is hosted at https://github.com/internetarchive/surt +from __future__ import absolute_import + import re import tldextract from six.moves.urllib.parse import urlsplit -try: #pragma: no cover - from URLRegexTransformer import hostToSURT -except ImportError: #pragma: no cover - from surt.URLRegexTransformer import hostToSURT +from surt.URLRegexTransformer import hostToSURT class handyurl(object): """A python port of the archive-commons org.archive.url HandyURL class diff --git a/surt/surt.py b/surt/surt.py index ba6f0b8..6c10aac 100755 --- a/surt/surt.py +++ b/surt/surt.py @@ -24,16 +24,12 @@ http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/WaybackURLKeyMaker.java?view=markup """ -try: #pragma: no cover - from handyurl import handyurl - from URLRegexTransformer import hostToSURT +from __future__ import absolute_import - import DefaultIAURLCanonicalizer -except ImportError: #pragma: no cover - from surt.handyurl import handyurl - from 
surt.URLRegexTransformer import hostToSURT +from surt.handyurl import handyurl +from surt.URLRegexTransformer import hostToSURT - import surt.DefaultIAURLCanonicalizer as DefaultIAURLCanonicalizer +import surt.DefaultIAURLCanonicalizer as DefaultIAURLCanonicalizer class CompositeCanonicalizer(object): def __init__(self, canonicalizers): diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..697a8b3 --- /dev/null +++ b/tox.ini @@ -0,0 +1,13 @@ +# Tox (http://tox.testrun.org/) is a tool for running tests +# in multiple virtualenvs. This configuration file will run the +# test suite on all supported python versions. To use it, "pip install tox" +# and then run "tox" from this directory. + +[tox] +envlist = py26, py27, py34 + +[testenv] +commands = python setup.py test +deps = + pytest + pytest-cov From 07307962541173b5ac4793fe9cbdc637e34f3d4c Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 11 Apr 2015 11:46:32 -0700 Subject: [PATCH 05/32] add py32, py33 to tox --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 697a8b3..fcbf2a9 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. 
[tox] -envlist = py26, py27, py34 +envlist = py26, py27, py32, py33, py34 [testenv] commands = python setup.py test From 4a1ff4a15904470c141cd40d31cf0df4b114cdd4 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 11 Apr 2015 12:06:38 -0700 Subject: [PATCH 06/32] add zip_safe=True, use six.u() for unicode literals to support py32 --- setup.py | 1 + surt/GoogleURLCanonicalizer.py | 10 +++++----- surt/handyurl.py | 9 +++++---- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/setup.py b/setup.py index ca688ef..67613ef 100644 --- a/setup.py +++ b/setup.py @@ -24,6 +24,7 @@ def run_tests(self): description='Sort-friendly URI Reordering Transform (SURT) python package.', long_description=open('README.md').read(), url='https://github.com/rajbot/surt', + zip_safe=True, install_requires=[ 'six', 'tldextract', diff --git a/surt/GoogleURLCanonicalizer.py b/surt/GoogleURLCanonicalizer.py index 51b9c07..efc3ce3 100755 --- a/surt/GoogleURLCanonicalizer.py +++ b/surt/GoogleURLCanonicalizer.py @@ -33,11 +33,11 @@ import struct import socket import encodings.idna -import six from surt.handyurl import handyurl from six.moves.urllib.parse import quote, unquote +from six import u, text_type, binary_type # unescapeRepeatedly() #_______________________________________________________________________________ @@ -97,11 +97,11 @@ def canonicalize(url, **_ignored): #'http://%01%80.com/' for this case. 
If idna/punycode encoding of a hostname #is not possible, the python version encodes unicode domains as utf-8 before #percent encoding, so we get 'http://%01%C2%80.com/' - >>> print(canonicalize(handyurl.parse(u"http://\u0001\u0080.com/")).getURLString()) + >>> print(canonicalize(handyurl.parse(u("http://\u0001\u0080.com/"))).getURLString()) http://%01%C2%80.com/ #Add these unicode tests: - >>> print(canonicalize(handyurl.parse(u'B\xfccher.ch:8080')).getURLString()) + >>> print(canonicalize(handyurl.parse(u('B\xfccher.ch:8080'))).getURLString()) http://xn--bcher-kva.ch:8080/ >>> url = '☃.com' #doctest has trouble with utf-8 encoding >>> print(canonicalize(handyurl.parse(url)).getURLString()) @@ -144,7 +144,7 @@ def canonicalize(url, **_ignored): # if the host was an ascii string of percent-encoded bytes that represent # non-ascii unicode chars, then promote hostE from str to unicode. # e.g. "http://www.t%EF%BF%BD%04.82.net/", which contains the unicode replacement char - if isinstance(hostE, six.binary_type): + if isinstance(hostE, binary_type): try: hostE.decode('ascii') except UnicodeDecodeError: @@ -288,7 +288,7 @@ def escapeOnce(input): # percent encoding, since different encodings of the same unicode # characters will result in different surts. # We will use utf-8 for consistency. 
- if isinstance(input, six.text_type): + if isinstance(input, text_type): input = input.encode('utf-8') return quote(input, """!"$&'()*+,-./:;<=>?@[\]^_`{|}~""") else: diff --git a/surt/handyurl.py b/surt/handyurl.py index f6ae5bc..0f2f644 100755 --- a/surt/handyurl.py +++ b/surt/handyurl.py @@ -26,6 +26,7 @@ import tldextract from six.moves.urllib.parse import urlsplit +from six import u from surt.URLRegexTransformer import hostToSURT @@ -74,7 +75,7 @@ def __init__(self, scheme=None, authUser=None, authPass=None, #___________________________________________________________________________ @classmethod def parse(cls, url): - u"""This method was in the java URLParser class, but we don't need + u("""This method was in the java URLParser class, but we don't need a whole class to parse a url, when we can just use python's urlparse. These doctests come from URLParserTest.java: @@ -100,10 +101,10 @@ def parse(cls, url): >>> handyurl.parse("http://www.archive.org:8080?#foo").geturl() 'http://www.archive.org:8080/#foo' - >>> print(handyurl.parse(u"http://bücher.ch:8080?#foo").geturl()) + >>> print(handyurl.parse(u("http://bücher.ch:8080?#foo")).geturl()) http://b\xfccher.ch:8080/#foo - >>> print(handyurl.parse(u"dns:bücher.ch").geturl()) + >>> print(handyurl.parse(u("dns:bücher.ch")).geturl()) dns:b\xfccher.ch ###From Tymm: @@ -115,7 +116,7 @@ def parse(cls, url): ###From Common Crawl, host ends with ':' without a port number >>> handyurl.parse("http://mineral.galleries.com:/minerals/silicate/chabazit/chabazit.htm").geturl() 'http://mineral.galleries.com/minerals/silicate/chabazit/chabazit.htm' - """ + """) url = url.strip() url = re.sub('[\n\r\t]', '', url) From 031586fd7115ba137a36c9c612b3c37e1d6e711b Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 20 Aug 2015 21:11:26 +0000 Subject: [PATCH 07/32] - get rid of special "opaque" thing in handyurl, unnecessary with proper handling of url without authority - handyurl.parse doctests were not running because docstring 
was in six.u(), and on top of that it turns out six.u() doesn't work for non-ascii strings, and u"" was restored in python3.3+, so get rid of all use of six.u(). the idn tests are failing in python2 for unknown reasons --- setup.py | 2 +- surt/GoogleURLCanonicalizer.py | 8 ++-- surt/handyurl.py | 83 +++++++++++++++++----------------- tox.ini | 2 +- 4 files changed, 48 insertions(+), 47 deletions(-) diff --git a/setup.py b/setup.py index 67613ef..d001bcd 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ def finalize_options(self): def run_tests(self): import pytest import sys - cmdline = ' -v --doctest-module --cov surt surt/' + cmdline = ' -v --doctest-modules --cov surt surt/' errcode = pytest.main(cmdline) sys.exit(errcode) diff --git a/surt/GoogleURLCanonicalizer.py b/surt/GoogleURLCanonicalizer.py index efc3ce3..3b83bb7 100755 --- a/surt/GoogleURLCanonicalizer.py +++ b/surt/GoogleURLCanonicalizer.py @@ -37,12 +37,12 @@ from surt.handyurl import handyurl from six.moves.urllib.parse import quote, unquote -from six import u, text_type, binary_type +from six import text_type, binary_type # unescapeRepeatedly() #_______________________________________________________________________________ def canonicalize(url, **_ignored): - """ + u""" >>> canonicalize(handyurl.parse("http://host/%25%32%35")).getURLString() 'http://host/%25' >>> canonicalize(handyurl.parse("http://host/%25%32%35%25%32%35")).getURLString() @@ -97,11 +97,11 @@ def canonicalize(url, **_ignored): #'http://%01%80.com/' for this case. 
If idna/punycode encoding of a hostname #is not possible, the python version encodes unicode domains as utf-8 before #percent encoding, so we get 'http://%01%C2%80.com/' - >>> print(canonicalize(handyurl.parse(u("http://\u0001\u0080.com/"))).getURLString()) + >>> print(canonicalize(handyurl.parse(u"http://\u0001\u0080.com/")).getURLString()) http://%01%C2%80.com/ #Add these unicode tests: - >>> print(canonicalize(handyurl.parse(u('B\xfccher.ch:8080'))).getURLString()) + >>> print(canonicalize(handyurl.parse(u'B\xfccher.ch:8080')).getURLString()) http://xn--bcher-kva.ch:8080/ >>> url = '☃.com' #doctest has trouble with utf-8 encoding >>> print(canonicalize(handyurl.parse(url)).getURLString()) diff --git a/surt/handyurl.py b/surt/handyurl.py index 0f2f644..4c65e88 100755 --- a/surt/handyurl.py +++ b/surt/handyurl.py @@ -26,7 +26,6 @@ import tldextract from six.moves.urllib.parse import urlsplit -from six import u from surt.URLRegexTransformer import hostToSURT @@ -59,7 +58,7 @@ class strips empty queries. #___________________________________________________________________________ def __init__(self, scheme=None, authUser=None, authPass=None, host=None, port=DEFAULT_PORT, path=None, - query=None, hash=None, opaque=None, last_delimiter=None): + query=None, hash=None, last_delimiter=None): self.scheme = scheme self.authUser = authUser self.authPass = authPass @@ -68,14 +67,13 @@ def __init__(self, scheme=None, authUser=None, authPass=None, self.path = path self.query = query self.hash = hash - self.opaque = opaque self.last_delimiter = last_delimiter #added in python version # parse() classmethod #___________________________________________________________________________ @classmethod def parse(cls, url): - u("""This method was in the java URLParser class, but we don't need + u"""This method was in the java URLParser class, but we don't need a whole class to parse a url, when we can just use python's urlparse. 
These doctests come from URLParserTest.java: @@ -101,10 +99,16 @@ def parse(cls, url): >>> handyurl.parse("http://www.archive.org:8080?#foo").geturl() 'http://www.archive.org:8080/#foo' - >>> print(handyurl.parse(u("http://bücher.ch:8080?#foo")).geturl()) + >>> handyurl.parse(u"http://bücher.ch:8080?#foo").geturl() + 'http://bücher.ch:8080/#foo' + + >>> handyurl.parse(u"dns:bücher.ch").geturl() + 'dns:bücher.ch' + + >>> print(handyurl.parse(u"http://bücher.ch:8080?#foo").geturl()) http://b\xfccher.ch:8080/#foo - >>> print(handyurl.parse(u("dns:bücher.ch")).geturl()) + >>> print(handyurl.parse(u"dns:bücher.ch").geturl()) dns:b\xfccher.ch ###From Tymm: @@ -116,15 +120,16 @@ def parse(cls, url): ###From Common Crawl, host ends with ':' without a port number >>> handyurl.parse("http://mineral.galleries.com:/minerals/silicate/chabazit/chabazit.htm").geturl() 'http://mineral.galleries.com/minerals/silicate/chabazit/chabazit.htm' - """) + + >>> handyurl.parse("mailto:bot@archive.org").scheme + 'mailto' + + >>> handyurl.parse("mailto:bot@archive.org").geturl() + 'mailto:bot@archive.org' + """ url = url.strip() url = re.sub('[\n\r\t]', '', url) - ### DNS URLs are treated separately as opaque urls by URLParser.java - # However, we want to surtify dns urls as well. 
- if re.match("^(filedesc|warcinfo):.*", url): - return cls(opaque=url) - url = cls.addDefaultSchemeIfNeeded(url) #From Tymm: deal with http://https/order.1and1.com @@ -185,12 +190,8 @@ def addDefaultSchemeIfNeeded(cls, url): if not url: return url - ###raj: DNS URLs are treated separately as opaque urls by URLParser.java, - #but we want to surtify dns urls as well - if url.startswith('dns:'): - return url - - if re.match("^(http|https|ftp|mms|rtsp|wais)://.*", url): + ###noah: accept anything that looks like it starts with a scheme: + if re.match("^([a-zA-Z][a-zA-Z0-9\+\-\.]*):", url): return url else: return "http://"+url @@ -211,36 +212,36 @@ def getURLString(self, trailing_comma=False, **options): - if None != self.opaque: - return self.opaque - - if 'dns' == self.scheme: - s = self.scheme + ':' ###java version adds :// regardless of scheme - else: ###java version uses opaque type for dns urls, but this version supports dns urls - s = self.scheme + '://' - if surt: - s += "(" - - if self.authUser: - s += self.authUser - if self.authPass: - s += self.authPass - s += '@' + s = self.scheme + ':' hostSrc = self.host if public_suffix: hostSrc = self.getPublicSuffix() if surt: hostSrc = hostToSURT(hostSrc) - s += hostSrc - if self.port != self.DEFAULT_PORT: - s += ":%d" % self.port + if hostSrc: + if self.scheme != 'dns': + s += '//' - if surt: - if trailing_comma: - s += ',' - s += ')' + if surt: + s += "(" + + if self.authUser: + s += self.authUser + if self.authPass: + s += self.authPass + s += '@' + + s += hostSrc + + if self.port != self.DEFAULT_PORT: + s += ":%d" % self.port + + if surt: + if trailing_comma: + s += ',' + s += ')' hasPath = (None != self.path) and (len(self.path) > 0) if hasPath: @@ -320,7 +321,7 @@ def getPublicSuffix(self): # commented out because of http://bugs.python.org/issue5876 # "__repr__ returning unicode doesn't work when called implicitly" #def __repr__(self): - # return u"""handyurl(scheme=%s, authUser=%s, authPass=%s, host=%s, 
port=%s, path=%s, query=%s, hash=%s, opaque=%s)""".encode('utf-8') % (self.scheme, self.authUser, self.authPass, self.host, self.port, self.path, self.query, self.hash, self.opaque) + # return u"""handyurl(scheme=%s, authUser=%s, authPass=%s, host=%s, port=%s, path=%s, query=%s, hash=%s)""".encode('utf-8') % (self.scheme, self.authUser, self.authPass, self.host, self.port, self.path, self.query, self.hash) diff --git a/tox.ini b/tox.ini index fcbf2a9..8b6ba6d 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. [tox] -envlist = py26, py27, py32, py33, py34 +envlist = py26, py27, py33, py34 [testenv] commands = python setup.py test From 5c1e62318731d28d6675789f477ac11f1c1d316f Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 31 Aug 2015 19:12:33 +0000 Subject: [PATCH 08/32] bump version number so pip install --upgrade picks it up --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index d001bcd..e149dd7 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ def run_tests(self): setup(name='surt', - version='0.2', + version='0.3', author='rajbot', author_email='raj@archive.org', classifiers=[ From 8fea26bfe224e10e59626598e186b4381ac76208 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 7 Sep 2015 05:47:36 +0000 Subject: [PATCH 09/32] fix canonicalization of urls without authority --- surt/GoogleURLCanonicalizer.py | 69 +++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 31 deletions(-) diff --git a/surt/GoogleURLCanonicalizer.py b/surt/GoogleURLCanonicalizer.py index 3b83bb7..be8ad7d 100755 --- a/surt/GoogleURLCanonicalizer.py +++ b/surt/GoogleURLCanonicalizer.py @@ -129,6 +129,9 @@ def canonicalize(url, **_ignored): 'http://host.com/ab%23cd' >>> canonicalize(handyurl.parse("http://host.com//twoslashes?more//slashes")).getURLString() 'http://host.com/twoslashes?more//slashes' + + >>> canonicalize(handyurl.parse("mailto:foo@example.com")).getURLString() + 
'mailto:foo@example.com' """ url.hash = None @@ -139,42 +142,46 @@ def canonicalize(url, **_ignored): if url.query: url.query = minimalEscape(url.query) - hostE = unescapeRepeatedly(url.host) + if url.host: + hostE = unescapeRepeatedly(url.host) + + # if the host was an ascii string of percent-encoded bytes that represent + # non-ascii unicode chars, then promote hostE from str to unicode. + # e.g. "http://www.t%EF%BF%BD%04.82.net/", which contains the unicode replacement char + if isinstance(hostE, binary_type): + try: + hostE.decode('ascii') + except UnicodeDecodeError: + hostE = hostE.decode('utf-8', 'ignore') + - # if the host was an ascii string of percent-encoded bytes that represent - # non-ascii unicode chars, then promote hostE from str to unicode. - # e.g. "http://www.t%EF%BF%BD%04.82.net/", which contains the unicode replacement char - if isinstance(hostE, binary_type): + host = None try: - hostE.decode('ascii') - except UnicodeDecodeError: - hostE = hostE.decode('utf-8', 'ignore') - - - host = None - try: - # Note: I copied the use of the ToASCII(hostE) from - # the java code. This function implements RFC3490, which - # requires that each component of the hostname (i.e. each label) - # be encodeced separately, and doesn't work correctly with - # full hostnames. So use 'idna' encoding instead. - #host = encodings.idna.ToASCII(hostE) - host = hostE.encode('idna').decode('utf-8') - except ValueError: - host = hostE - - host = host.replace('..', '.').strip('.') - - ip = attemptIPFormats(host) - if ip: - host = ip; - else: - host = escapeOnce(host.lower()) + # Note: I copied the use of the ToASCII(hostE) from + # the java code. This function implements RFC3490, which + # requires that each component of the hostname (i.e. each label) + # be encodeced separately, and doesn't work correctly with + # full hostnames. So use 'idna' encoding instead. 
+ #host = encodings.idna.ToASCII(hostE) + host = hostE.encode('idna').decode('utf-8') + except ValueError: + host = hostE + + host = host.replace('..', '.').strip('.') + + ip = attemptIPFormats(host) + if ip: + host = ip; + else: + host = escapeOnce(host.lower()) - url.host = host + url.host = host path = unescapeRepeatedly(url.path) - url.path = escapeOnce(normalizePath(path)) + if url.host: + path = normalizePath(path) + # else path is free-form sort of thing, not /directory/thing + url.path = escapeOnce(path) return url From e2a2d796dbab84f5f4cbde9fbcffdb89ca13feb4 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 7 Sep 2015 06:11:50 +0000 Subject: [PATCH 10/32] another fix for urls without authority --- surt/handyurl.py | 9 +++++---- surt/surt.py | 3 +++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/surt/handyurl.py b/surt/handyurl.py index 4c65e88..3c13ce9 100755 --- a/surt/handyurl.py +++ b/surt/handyurl.py @@ -215,10 +215,11 @@ def getURLString(self, s = self.scheme + ':' hostSrc = self.host - if public_suffix: - hostSrc = self.getPublicSuffix() - if surt: - hostSrc = hostToSURT(hostSrc) + if hostSrc: + if public_suffix: + hostSrc = self.getPublicSuffix() + if surt: + hostSrc = hostToSURT(hostSrc) if hostSrc: if self.scheme != 'dns': diff --git a/surt/surt.py b/surt/surt.py index 6c10aac..d3798d6 100755 --- a/surt/surt.py +++ b/surt/surt.py @@ -110,6 +110,9 @@ def surt(url, canonicalizer=None, **options): Simple customization: >>> surt("http://www.example.com/", canonicalizer=lambda x, **opts: x) 'com,example,www)/' + + >>> surt("mailto:foo@example.com") + 'mailto:foo@example.com' """ if not url: From c8e1f94d4c388bbb0d1dc327e8c875abf6ecc9cf Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 15 Sep 2015 20:18:55 +0000 Subject: [PATCH 11/32] add option "with_scheme" to surt.surt() to produce surt with leading "scheme://(" --- surt/surt.py | 45 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 5 deletions(-) 
diff --git a/surt/surt.py b/surt/surt.py index d3798d6..9774ca9 100755 --- a/surt/surt.py +++ b/surt/surt.py @@ -113,6 +113,39 @@ def surt(url, canonicalizer=None, **options): >>> surt("mailto:foo@example.com") 'mailto:foo@example.com' + + >>> surt("http://www.example.com/", with_scheme=True) + 'http://(com,example)/' + + >>> surt("http://www.example.com/", with_scheme=True, host_massage=True) + 'http://(com,example)/' + + >>> surt("http://www.example.com/", with_scheme=False) + 'com,example)/' + + >>> surt("http://www.example.com/", with_scheme=True, trailing_comma=True) + 'http://(com,example,)/' + + >>> surt("https://www.example.com/", with_scheme=True, trailing_comma=True) + 'https://(com,example,)/' + + >>> surt("ftp://www.example.com/", with_scheme=True, trailing_comma=True) + 'ftp://(com,example,)/' + + >>> surt("http://www.example.com/", with_scheme=True, host_massage=False) + 'http://(com,example,www)/' + + >>> surt("http://www.example.com/", with_scheme=False, host_massage=False) + 'com,example,www)/' + + >>> surt("http://www.example.com/", with_scheme=True, trailing_comma=True, host_massage=False) + 'http://(com,example,www,)/' + + >>> surt("https://www.example.com/", with_scheme=True, trailing_comma=True, host_massage=False) + 'https://(com,example,www,)/' + + >>> surt("ftp://www.example.com/", with_scheme=True, trailing_comma=True, host_massage=False) + 'ftp://(com,example,www,)/' """ if not url: @@ -149,11 +182,13 @@ def surt(url, canonicalizer=None, **options): hurl = canonicalizer(handyurl.parse(url), **options) key = hurl.getURLString(**options) - parenIdx = key.find('(') - if -1 == parenIdx: - return url #something very wrong - - return key[parenIdx+1:] + if not options.get('with_scheme'): + parenIdx = key.find('(') + if -1 == parenIdx: + return url #something very wrong + return key[parenIdx+1:] + else: + return key # main() From d03b93284b39de6348b8ddf11b878f05b1b6ba7e Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 22 Sep 2015 23:34:04 
+0000 Subject: [PATCH 12/32] requirements are specified in setup.py --- requirements.txt | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 43349b8..0000000 --- a/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -tldextract==1.0 -pylint -pytest From 3ec34349c473331677f2d70ce9932054efb78597 Mon Sep 17 00:00:00 2001 From: Kenji Nagahashi Date: Fri, 23 Oct 2015 18:36:56 +0000 Subject: [PATCH 13/32] start 0.3b line --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 03df6fa..3bb52e7 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ from setuptools import setup setup(name='surt', - version='0.2', + version='0.3b1', author='rajbot', author_email='raj@archive.org', classifiers=[ From 38f6dfb260e10bccf566a9d3d55c774df03c2bde Mon Sep 17 00:00:00 2001 From: Kenji Nagahashi Date: Fri, 23 Oct 2015 18:38:12 +0000 Subject: [PATCH 14/32] Several performance improvements (~25% faster) - prepare regular expression outside of methods. - don't use RE if cheaper alternative exists.
- use simpler equivalent code, remove shortcut that in reality is slower - eliminate sub template compilation by passing a function --- surt/GoogleURLCanonicalizer.py | 9 ++++----- surt/IAURLCanonicalizer.py | 4 +++- surt/URLRegexTransformer.py | 33 +++++++++++++++------------------ surt/handyurl.py | 27 +++++++++++++++++---------- 4 files changed, 39 insertions(+), 34 deletions(-) diff --git a/surt/GoogleURLCanonicalizer.py b/surt/GoogleURLCanonicalizer.py index 7c7be91..d734f55 100755 --- a/surt/GoogleURLCanonicalizer.py +++ b/surt/GoogleURLCanonicalizer.py @@ -219,6 +219,9 @@ def normalizePath(path): return path +OCTAL_IP = re.compile(r"^(0[0-7]*)(\.[0-7]+)?(\.[0-7]+)?(\.[0-7]+)?$") +DECIMAL_IP = re.compile(r"^([1-9][0-9]*)(\.[0-9]+)?(\.[0-9]+)?(\.[0-9]+)?$") + # attemptIPFormats() #_______________________________________________________________________________ def attemptIPFormats(host): @@ -242,14 +245,10 @@ def attemptIPFormats(host): >>> attemptIPFormats("39024579298") '22.11.210.226' """ - - OCTAL_IP = re.compile("^(0[0-7]*)(\.[0-7]+)?(\.[0-7]+)?(\.[0-7]+)?$") - DECIMAL_IP = re.compile("^([1-9][0-9]*)(\.[0-9]+)?(\.[0-9]+)?(\.[0-9]+)?$") - if None == host: return None - if re.match("^\d+$", host): + if host.isdigit(): #mask hostname to lower four bytes to workaround issue with liveweb arc files return socket.inet_ntoa(struct.pack('>L', int(host) & 0xffffffff)) else: diff --git a/surt/IAURLCanonicalizer.py b/surt/IAURLCanonicalizer.py index 2bf7746..1613ae5 100755 --- a/surt/IAURLCanonicalizer.py +++ b/surt/IAURLCanonicalizer.py @@ -172,6 +172,8 @@ def alphaReorderQuery(orig): # massageHost() #_______________________________________________________________________________ +_RE_WWWDIGITS = re.compile('www\d*\.') + def massageHost(host): """These doctests are from IAURLCanonicalizerTest.java: @@ -188,7 +190,7 @@ def massageHost(host): 'www2foo.com' """ - m = re.match('www\d*\.', host) + m = _RE_WWWDIGITS.match(host) if m: return host[len(m.group(0)):] else: 
diff --git a/surt/URLRegexTransformer.py b/surt/URLRegexTransformer.py index 1949508..babe76e 100755 --- a/surt/URLRegexTransformer.py +++ b/surt/URLRegexTransformer.py @@ -30,6 +30,11 @@ # stripPathSessionID #_______________________________________________________________________________ +_RES_PATH_SESSIONID = [ + re.compile("^(.*/)(\((?:[a-z]\([0-9a-z]{24}\))+\)/)([^\?]+\.aspx.*)$", re.I), + re.compile("^(.*/)(\\([0-9a-z]{24}\\)/)([^\\?]+\\.aspx.*)$", re.I), + ] + def stripPathSessionID(path): """It looks like the java version returns a lowercased path.. So why does it uses a case-insensitive regex? We won't lowercase here. @@ -52,11 +57,7 @@ def stripPathSessionID(path): >>> stripPathSessionID("/photos/36050182@N05/") '/photos/36050182@N05/' """ - patterns = [re.compile("^(.*/)(\((?:[a-z]\([0-9a-z]{24}\))+\)/)([^\?]+\.aspx.*)$", re.I), - re.compile("^(.*/)(\\([0-9a-z]{24}\\)/)([^\\?]+\\.aspx.*)$", re.I), - ] - - for pattern in patterns: + for pattern in _RES_PATH_SESSIONID: m = pattern.match(path) if m: path = m.group(1) + m.group(3) @@ -66,6 +67,14 @@ def stripPathSessionID(path): # stripQuerySessionID #_______________________________________________________________________________ +_RES_QUERY_SESSIONID = [ + re.compile("^(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I), + re.compile("^(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I), + re.compile("^(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I), + re.compile("^(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", re.I), + re.compile("^(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", re.I), + ] + def stripQuerySessionID(path): """These doctests are from IAURLCanonicalizerTest.java: @@ -148,14 +157,7 @@ def stripQuerySessionID(path): '?requestID=200608200458360%2E39414378' """ - patterns = [re.compile("^(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I), - re.compile("^(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I), - re.compile("^(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I), - 
re.compile("^(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", re.I), - re.compile("^(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", re.I), - ] - - for pattern in patterns: + for pattern in _RES_QUERY_SESSIONID: m = pattern.match(path) if m: if m.group(2): @@ -175,12 +177,7 @@ def hostToSURT(host): """ # TODO: ensure we DONT reverse IP addresses! parts = host.split('.') - - if 1 == len(parts): - return host - parts.reverse() - return ','.join(parts) # main() diff --git a/surt/handyurl.py b/surt/handyurl.py index 06f355f..10b19be 100755 --- a/surt/handyurl.py +++ b/surt/handyurl.py @@ -25,6 +25,11 @@ from urlparse import urlsplit from URLRegexTransformer import hostToSURT +_RE_MULTIPLE_PROTOCOLS = re.compile(r'^(https?://)+') +_RE_HAS_PROTOCOL = re.compile("^(?:http|https|ftp|mms|rtsp|wais)://") +_RE_OPAQUE_URLS = re.compile("^(?:filedesc|warcinfo):") +_RE_SPACES = re.compile('[\n\r\t]') + class handyurl(object): """A python port of the archive-commons org.archive.url HandyURL class @@ -112,18 +117,22 @@ def parse(cls, url): >>> handyurl.parse("http://mineral.galleries.com:/minerals/silicate/chabazit/chabazit.htm").geturl() 'http://mineral.galleries.com/minerals/silicate/chabazit/chabazit.htm' """ + # Note RE_SPACES does not match regular space (0x20). That is, + # regular spaces are removed at head and tail, but not in the middle. + # There's a test case for GoogleURLCanonicalizer.canonicalize that + # asserts this behavior. url = url.strip() - url = re.sub('[\n\r\t]', '', url) + url = _RE_SPACES.sub('', url) ### DNS URLs are treated separately as opaque urls by URLParser.java # However, we want to surtify dns urls as well. 
- if re.match("^(filedesc|warcinfo):.*", url): + if _RE_OPAQUE_URLS.match(url): return cls(opaque=url) url = cls.addDefaultSchemeIfNeeded(url) #From Tymm: deal with http://https/order.1and1.com - url = re.sub('^(https?://)+', r'\1', url) + url = _RE_MULTIPLE_PROTOCOLS.sub(lambda m: m.group(1), url) """The java code seems to use this regex: re.compile("^(([a-zA-Z][a-zA-Z0-9\+\-\.]*):)?((//([^/?#]*))?([^?#]*)(\?([^#]*))?)?(#(.*))?") @@ -185,7 +194,7 @@ def addDefaultSchemeIfNeeded(cls, url): if url.startswith('dns:'): return url - if re.match("^(http|https|ftp|mms|rtsp|wais)://.*", url): + if _RE_HAS_PROTOCOL.match(url): return url else: return "http://"+url @@ -231,13 +240,11 @@ def getURLString(self, surt=False, public_suffix=False): if surt: s += ')' - hasPath = (None != self.path) and (len(self.path) > 0) - if hasPath: + if self.path: s += self.path - else: - if (None != self.query) or (None != self.hash): - #must have '/' with query or hash: - s += '/' + elif self.query is not None or self.hash is not None: + #must have '/' with query or hash: + s += '/' if None != self.query: s += '?' + self.query From 640a75664a1665818071ca45ef28e898ce0ab9b5 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 29 Oct 2015 18:04:49 +0000 Subject: [PATCH 15/32] fix double definition of getPublicSuffix, rename second one to correct getPublicPrefix (!!!) --- surt/handyurl.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/surt/handyurl.py b/surt/handyurl.py index 4969a26..75cb92f 100755 --- a/surt/handyurl.py +++ b/surt/handyurl.py @@ -299,26 +299,26 @@ def getPublicSuffix(self): # getPublicPrefix #___________________________________________________________________________ - def getPublicSuffix(self): + def getPublicPrefix(self): """Uses the tldextract module to get the subdomain, using the Public Suffix List. 
These doctests are based off the ones found in HandyURLTest.java: >>> h = handyurl(host='www.fool.com') - >>> h.getPublicSuffix() + >>> h.getPublicPrefix() 'www' >>> h = handyurl(host='www.amazon.co.uk') - >>> h.getPublicSuffix() + >>> h.getPublicPrefix() 'www' >>> h = handyurl(host='www.images.amazon.co.uk') - >>> h.getPublicSuffix() + >>> h.getPublicPrefix() 'www.images' >>> h = handyurl(host='funky-images.fancy.co.jp') - >>> h.getPublicSuffix() + >>> h.getPublicPrefix() 'funky-images' """ return tldextract.extract(self.host).subdomain From ffdcf38ed73d456b15c1ad3e5d282079fa0fde05 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 29 Oct 2015 19:00:18 +0000 Subject: [PATCH 16/32] replace doctests with regular tests to avoid character set encoding issues --- setup.py | 4 +- surt/DefaultIAURLCanonicalizer.py | 27 --- surt/GoogleURLCanonicalizer.py | 140 -------------- surt/IAURLCanonicalizer.py | 78 -------- surt/URLRegexTransformer.py | 112 ----------- surt/handyurl.py | 94 --------- surt/surt.py | 102 ---------- tests/test_surt.py | 308 ++++++++++++++++++++++++++++++ 8 files changed, 310 insertions(+), 555 deletions(-) create mode 100644 tests/test_surt.py diff --git a/setup.py b/setup.py index f653dc7..4031299 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ def finalize_options(self): def run_tests(self): import pytest import sys - cmdline = ' -v --doctest-modules --cov surt surt/' + cmdline = ' -v --doctest-modules --cov surt tests/' errcode = pytest.main(cmdline) sys.exit(errcode) @@ -33,7 +33,7 @@ def run_tests(self): packages=[ 'surt' ], scripts=[], # Tests - tests_require=[ 'pytest' ], + tests_require=[ 'pytest', 'pytest-cov' ], test_suite='', cmdclass={'test': PyTest}, ) diff --git a/surt/DefaultIAURLCanonicalizer.py b/surt/DefaultIAURLCanonicalizer.py index 11ae90e..5c0b862 100755 --- a/surt/DefaultIAURLCanonicalizer.py +++ b/surt/DefaultIAURLCanonicalizer.py @@ -21,9 +21,6 @@ """This is a python port of DefaultIAURLCanonicalizer.java: 
http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/DefaultIAURLCanonicalizer.java?view=markup - -The doctests are copied from DefaultIAURLCanonicalizerTest.java: -http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/test/java/org/archive/url/DefaultIAURLCanonicalizerTest.java?view=markup """ from __future__ import absolute_import @@ -35,33 +32,9 @@ #_______________________________________________________________________________ def canonicalize(url, **options): """The input url is a handyurl instance - - These doctests are from DefaultIAURLCanonicalizerTest.java: - - >>> from .handyurl import handyurl - >>> canonicalize(handyurl.parse("http://www.alexa.com/")).getURLString() - 'http://alexa.com/' - >>> canonicalize(handyurl.parse("http://archive.org/index.html")).getURLString() - 'http://archive.org/index.html' - >>> canonicalize(handyurl.parse("http://archive.org/index.html?")).getURLString() - 'http://archive.org/index.html' - >>> canonicalize(handyurl.parse("http://archive.org/index.html?a=b")).getURLString() - 'http://archive.org/index.html?a=b' - >>> canonicalize(handyurl.parse("http://archive.org/index.html?b=b&a=b")).getURLString() - 'http://archive.org/index.html?a=b&b=b' - >>> canonicalize(handyurl.parse("http://archive.org/index.html?b=a&b=b&a=b")).getURLString() - 'http://archive.org/index.html?a=b&b=a&b=b' - >>> canonicalize(handyurl.parse("http://www34.archive.org/index.html?b=a&b=b&a=b")).getURLString() - 'http://archive.org/index.html?a=b&b=a&b=b' """ url = surt.GoogleURLCanonicalizer.canonicalize(url, **options) url = surt.IAURLCanonicalizer.canonicalize(url, **options) return url - -# main() -#_______________________________________________________________________________ -if __name__ == "__main__": - import doctest - doctest.testmod() diff --git a/surt/GoogleURLCanonicalizer.py 
b/surt/GoogleURLCanonicalizer.py index 3f9049f..9757eb9 100755 --- a/surt/GoogleURLCanonicalizer.py +++ b/surt/GoogleURLCanonicalizer.py @@ -22,9 +22,6 @@ """This is a python port of GoogleURLCanonicalizer.java: http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/GoogleURLCanonicalizer.java?view=markup - -The doctests are copied from GoogleURLCanonicalizerTest.java: -http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/test/java/org/archive/url/GoogleURLCanonicalizerTest.java?view=markup """ from __future__ import absolute_import @@ -42,98 +39,6 @@ # unescapeRepeatedly() #_______________________________________________________________________________ def canonicalize(url, **_ignored): - u""" - >>> canonicalize(handyurl.parse("http://host/%25%32%35")).getURLString() - 'http://host/%25' - >>> canonicalize(handyurl.parse("http://host/%25%32%35%25%32%35")).getURLString() - 'http://host/%25%25' - >>> canonicalize(handyurl.parse("http://host/%2525252525252525")).getURLString() - 'http://host/%25' - >>> canonicalize(handyurl.parse("http://host/asdf%25%32%35asd")).getURLString() - 'http://host/asdf%25asd' - >>> canonicalize(handyurl.parse("http://host/%%%25%32%35asd%%")).getURLString() - 'http://host/%25%25%25asd%25%25' - >>> canonicalize(handyurl.parse("http://www.google.com/")).getURLString() - 'http://www.google.com/' - >>> canonicalize(handyurl.parse("http://%31%36%38%2e%31%38%38%2e%39%39%2e%32%36/%2E%73%65%63%75%72%65/%77%77%77%2E%65%62%61%79%2E%63%6F%6D/")).getURLString() - 'http://168.188.99.26/.secure/www.ebay.com/' - >>> canonicalize(handyurl.parse("http://195.127.0.11/uploads/%20%20%20%20/.verify/.eBaysecure=updateuserdataxplimnbqmn-xplmvalidateinfoswqpcmlx=hgplmcx/")).getURLString() - 'http://195.127.0.11/uploads/%20%20%20%20/.verify/.eBaysecure=updateuserdataxplimnbqmn-xplmvalidateinfoswqpcmlx=hgplmcx/' - >>> 
canonicalize(handyurl.parse("http://host%23.com/%257Ea%2521b%2540c%2523d%2524e%25f%255E00%252611%252A22%252833%252944_55%252B")).getURLString() - 'http://host%23.com/~a!b@c%23d$e%25f^00&11*22(33)44_55+' - >>> canonicalize(handyurl.parse("http://3279880203/blah")).getURLString() - 'http://195.127.0.11/blah' - >>> canonicalize(handyurl.parse("http://www.google.com/blah/..")).getURLString() - 'http://www.google.com/' - >>> canonicalize(handyurl.parse("www.google.com/")).getURLString() - 'http://www.google.com/' - >>> canonicalize(handyurl.parse("www.google.com")).getURLString() - 'http://www.google.com/' - >>> canonicalize(handyurl.parse("http://www.evil.com/blah#frag")).getURLString() - 'http://www.evil.com/blah' - >>> canonicalize(handyurl.parse("http://www.GOOgle.com/")).getURLString() - 'http://www.google.com/' - >>> canonicalize(handyurl.parse("http://www.google.com.../")).getURLString() - 'http://www.google.com/' - - #This works but the newline in the docstring messes up doctest - #>>> canonicalize(handyurl.parse("http://www.google.com/foo\tbar\rbaz\n2")).getURLString() - #'http://www.google.com/foobarbaz2' - - >>> canonicalize(handyurl.parse("http://www.google.com/q?")).getURLString() - 'http://www.google.com/q?' - >>> canonicalize(handyurl.parse("http://www.google.com/q?r?")).getURLString() - 'http://www.google.com/q?r?' - >>> canonicalize(handyurl.parse("http://www.google.com/q?r?s")).getURLString() - 'http://www.google.com/q?r?s' - >>> canonicalize(handyurl.parse("http://evil.com/foo#bar#baz")).getURLString() - 'http://evil.com/foo' - >>> canonicalize(handyurl.parse("http://evil.com/foo;")).getURLString() - 'http://evil.com/foo;' - >>> canonicalize(handyurl.parse("http://evil.com/foo?bar;")).getURLString() - 'http://evil.com/foo?bar;' - - #This test case differs from the Java version. The Java version returns - #'http://%01%80.com/' for this case. 
If idna/punycode encoding of a hostname - #is not possible, the python version encodes unicode domains as utf-8 before - #percent encoding, so we get 'http://%01%C2%80.com/' - >>> print(canonicalize(handyurl.parse(u"http://\u0001\u0080.com/")).getURLString()) - http://%01%C2%80.com/ - - #Add these unicode tests: - >>> print(canonicalize(handyurl.parse(u'B\xfccher.ch:8080')).getURLString()) - http://xn--bcher-kva.ch:8080/ - >>> url = '☃.com' #doctest has trouble with utf-8 encoding - >>> print(canonicalize(handyurl.parse(url)).getURLString()) - http://xn--n3h.com/ - - #Add these percent-encoded unicode tests - >>> canonicalize(handyurl.parse("http://www.t%EF%BF%BD%04.82.net/")).getURLString() - 'http://www.t%EF%BF%BD%04.82.net/' - - >>> canonicalize(handyurl.parse("http://notrailingslash.com")).getURLString() - 'http://notrailingslash.com/' - >>> canonicalize(handyurl.parse("http://www.gotaport.com:1234/")).getURLString() - 'http://www.gotaport.com:1234/' - >>> canonicalize(handyurl.parse(" http://www.google.com/ ")).getURLString() - 'http://www.google.com/' - >>> canonicalize(handyurl.parse("http:// leadingspace.com/")).getURLString() - 'http://%20leadingspace.com/' - >>> canonicalize(handyurl.parse("http://%20leadingspace.com/")).getURLString() - 'http://%20leadingspace.com/' - >>> canonicalize(handyurl.parse("%20leadingspace.com/")).getURLString() - 'http://%20leadingspace.com/' - >>> canonicalize(handyurl.parse("https://www.securesite.com/")).getURLString() - 'https://www.securesite.com/' - >>> canonicalize(handyurl.parse("http://host.com/ab%23cd")).getURLString() - 'http://host.com/ab%23cd' - >>> canonicalize(handyurl.parse("http://host.com//twoslashes?more//slashes")).getURLString() - 'http://host.com/twoslashes?more//slashes' - - >>> canonicalize(handyurl.parse("mailto:foo@example.com")).getURLString() - 'mailto:foo@example.com' - """ - url.hash = None if url.authUser: url.authUser = minimalEscape(url.authUser) @@ -236,26 +141,6 @@ def normalizePath(path): # 
attemptIPFormats() #_______________________________________________________________________________ def attemptIPFormats(host): - """ - The doctests are copied from GoogleURLCanonicalizerTest.java: - - >>> attemptIPFormats(None) - >>> attemptIPFormats("www.foo.com") #returns None - >>> attemptIPFormats("127.0.0.1") - '127.0.0.1' - >>> attemptIPFormats("017.0.0.1") - '15.0.0.1' - >>> attemptIPFormats("168.188.99.26") - '168.188.99.26' - >>> attemptIPFormats("10.0.258") #java version returns null, ours returns the correct ipv4 - '10.0.1.2' - >>> attemptIPFormats("1.2.3.256") #returns None - - ARC files from the wayback machine's liveweb proxy contain numeric - hostnames > 2^32 for some reason. We'll copy the behavior of the java code. - >>> attemptIPFormats("39024579298") - '22.11.210.226' - """ if None == host: return None @@ -304,26 +189,6 @@ def escapeOnce(input): # unescapeRepeatedly() #_______________________________________________________________________________ def unescapeRepeatedly(input): - """ - The doctests are copied from GoogleURLCanonicalizerTest.java: - - >>> unescapeRepeatedly("%!A%21%21%25") - '%!A!!%' - >>> unescapeRepeatedly("%") - '%' - >>> unescapeRepeatedly("%2") - '%2' - >>> unescapeRepeatedly("%25") - '%' - >>> unescapeRepeatedly("%25%") - '%%' - >>> unescapeRepeatedly("%2525") - '%' - >>> unescapeRepeatedly("%252525") - '%' - >>> unescapeRepeatedly("%25%32%35") - '%' - """ if None == input: return None @@ -333,8 +198,3 @@ def unescapeRepeatedly(input): return input input = un -# main() -#_______________________________________________________________________________ -if __name__ == "__main__": - import doctest - doctest.testmod() diff --git a/surt/IAURLCanonicalizer.py b/surt/IAURLCanonicalizer.py index 142e8fa..0603d6a 100755 --- a/surt/IAURLCanonicalizer.py +++ b/surt/IAURLCanonicalizer.py @@ -21,9 +21,6 @@ """This is a python port of IAURLCanonicalizer.java: 
http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/IAURLCanonicalizer.java?view=markup - -The doctests are copied from IAURLCanonicalizerTest.java: -http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/test/java/org/archive/url/IAURLCanonicalizerTest.java?view=markup """ from __future__ import absolute_import @@ -44,23 +41,6 @@ def canonicalize(url, host_lowercase=True, host_massage=True, query_strip_empty=True, query_alpha_reorder=True, hash_strip=True, **_ignored): """The input url is a handyurl instance - - These doctests are from IAURLCanonicalizerTest.java: - - >>> canonicalize(handyurl.parse("http://ARCHIVE.ORG/")).getURLString() - 'http://archive.org/' - >>> canonicalize(handyurl.parse("http://www.archive.org:80/")).getURLString() - 'http://archive.org/' - >>> canonicalize(handyurl.parse("https://www.archive.org:80/")).getURLString() - 'https://archive.org:80/' - >>> canonicalize(handyurl.parse("http://www.archive.org:443/")).getURLString() - 'http://archive.org:443/' - >>> canonicalize(handyurl.parse("https://www.archive.org:443/")).getURLString() - 'https://archive.org/' - >>> canonicalize(handyurl.parse("http://www.archive.org/big/")).getURLString() - 'http://archive.org/big' - >>> canonicalize(handyurl.parse("dns:www.archive.org")).getURLString() - 'dns:www.archive.org' """ if host_lowercase and url.host: url.host = url.host.lower() @@ -122,34 +102,6 @@ def alphaReorderQuery(orig): """It's a shame that we can't use urlparse.parse_qsl() for this, but this function does keeps the trailing '=' if there is a query arg with no value: "?foo" vs "?foo=", and we want to exactly match the java version - - These doctests are from IAURLCanonicalizerTest.java: - - >>> alphaReorderQuery(None) - >>> alphaReorderQuery("") - '' - >>> alphaReorderQuery("") - '' - >>> alphaReorderQuery("a") - 'a' - >>> 
alphaReorderQuery("ab") - 'ab' - >>> alphaReorderQuery("a=1") - 'a=1' - >>> alphaReorderQuery("ab=1") - 'ab=1' - >>> alphaReorderQuery("a=1&") - '&a=1' - >>> alphaReorderQuery("a=1&b=1") - 'a=1&b=1' - >>> alphaReorderQuery("b=1&a=1") - 'a=1&b=1' - >>> alphaReorderQuery("a=a&a=a") - 'a=a&a=a' - >>> alphaReorderQuery("a=b&a=a") - 'a=a&a=b' - >>> alphaReorderQuery("b=b&a=b&b=a&a=a") - 'a=a&a=b&b=a&b=b' """ @@ -178,21 +130,6 @@ def alphaReorderQuery(orig): _RE_WWWDIGITS = re.compile('www\d*\.') def massageHost(host): - """These doctests are from IAURLCanonicalizerTest.java: - - >>> massageHost("foo.com") - 'foo.com' - >>> massageHost("www.foo.com") - 'foo.com' - >>> massageHost("www12.foo.com") - 'foo.com' - - >>> massageHost("www2foo.com") - 'www2foo.com' - >>> massageHost("www2.www2foo.com") - 'www2foo.com' - """ - m = _RE_WWWDIGITS.match(host) if m: return host[len(m.group(0)):] @@ -202,15 +139,6 @@ def massageHost(host): # getDefaultPort() #_______________________________________________________________________________ def getDefaultPort(scheme): - """These doctests are from IAURLCanonicalizerTest.java: - - >>> getDefaultPort("foo") - 0 - >>> getDefaultPort("http") - 80 - >>> getDefaultPort("https") - 443 - """ scheme_lower = scheme.lower() if 'http' == scheme_lower: return 80 @@ -219,9 +147,3 @@ def getDefaultPort(scheme): else: return 0 -# main() -#_______________________________________________________________________________ -if __name__ == "__main__": - import doctest - doctest.testmod() - diff --git a/surt/URLRegexTransformer.py b/surt/URLRegexTransformer.py index babe76e..04f829b 100755 --- a/surt/URLRegexTransformer.py +++ b/surt/URLRegexTransformer.py @@ -21,9 +21,6 @@ """This is a python port of URLRegexTransformer.java: http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/URLRegexTransformer.java?view=markup - -The doctests are copied from URLRegexTransformerTest.java: 
-http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/test/java/org/archive/url/URLRegexTransformerTest.java?view=markup """ import re @@ -38,24 +35,6 @@ def stripPathSessionID(path): """It looks like the java version returns a lowercased path.. So why does it uses a case-insensitive regex? We won't lowercase here. - - These doctests are from IAURLCanonicalizerTest.java: - - Check ASP_SESSIONID2: - >>> stripPathSessionID("/(S(4hqa0555fwsecu455xqckv45))/mileg.aspx") - '/mileg.aspx' - - Check ASP_SESSIONID2 (again): - >>> stripPathSessionID("/(4hqa0555fwsecu455xqckv45)/mileg.aspx") - '/mileg.aspx' - - Check ASP_SESSIONID3: - >>> stripPathSessionID("/(a(4hqa0555fwsecu455xqckv45)S(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=sessionschedules") - '/mileg.aspx?page=sessionschedules' - - '@' in path: - >>> stripPathSessionID("/photos/36050182@N05/") - '/photos/36050182@N05/' """ for pattern in _RES_PATH_SESSIONID: m = pattern.match(path) @@ -76,87 +55,6 @@ def stripPathSessionID(path): ] def stripQuerySessionID(path): - """These doctests are from IAURLCanonicalizerTest.java: - - >>> #base = "http://www.archive.org/index.html" - >>> base = "" - >>> str32id = "0123456789abcdefghijklemopqrstuv" - >>> url = base + "?jsessionid=" + str32id - >>> stripQuerySessionID(url) - '?' - - Test that we don't strip if not 32 chars only. - >>> url = base + "?jsessionid=" + str32id + '0' - >>> stripQuerySessionID(url) - '?jsessionid=0123456789abcdefghijklemopqrstuv0' - - Test what happens when followed by another key/value pair. - >>> url = base + "?jsessionid=" + str32id + "&x=y" - >>> stripQuerySessionID(url) - '?x=y' - - Test what happens when followed by another key/value pair and - prefixed by a key/value pair. - >>> url = base + "?one=two&jsessionid=" + str32id + "&x=y" - >>> stripQuerySessionID(url) - '?one=two&x=y' - - Test what happens when prefixed by a key/value pair. 
- >>> url = base + "?one=two&jsessionid=" + str32id - >>> stripQuerySessionID(url) - '?one=two&' - - Test aspsession. - >>> url = base + "?aspsessionidABCDEFGH=" + "ABCDEFGHIJKLMNOPQRSTUVWX" + "&x=y" - >>> stripQuerySessionID(url) - '?x=y' - - Test archive phpsession. - >>> url = base + "?phpsessid=" + str32id + "&x=y" - >>> stripQuerySessionID(url) - '?x=y' - - With prefix too. - >>> url = base + "?one=two&phpsessid=" + str32id + "&x=y" - >>> stripQuerySessionID(url) - '?one=two&x=y' - - With only prefix - >>> url = base + "?one=two&phpsessid=" + str32id - >>> stripQuerySessionID(url) - '?one=two&' - - Test sid. - >>> url = base + "?" + "sid=9682993c8daa2c5497996114facdc805" + "&x=y"; - >>> stripQuerySessionID(url) - '?x=y' - - Igor test. - >>> url = base + "?" + "sid=9682993c8daa2c5497996114facdc805" + "&" + "jsessionid=" + str32id - >>> stripQuerySessionID(url) - '?' - - >>> url = "?CFID=1169580&CFTOKEN=48630702&dtstamp=22%2F08%2F2006%7C06%3A58%3A11" - >>> stripQuerySessionID(url) - '?dtstamp=22%2F08%2F2006%7C06%3A58%3A11' - - >>> url = "?CFID=12412453&CFTOKEN=15501799&dt=19_08_2006_22_39_28" - >>> stripQuerySessionID(url) - '?dt=19_08_2006_22_39_28' - - >>> url = "?CFID=14475712&CFTOKEN=2D89F5AF-3048-2957-DA4EE4B6B13661AB&r=468710288378&m=forgotten" - >>> stripQuerySessionID(url) - '?r=468710288378&m=forgotten' - - >>> url = "?CFID=16603925&CFTOKEN=2AE13EEE-3048-85B0-56CEDAAB0ACA44B8" - >>> stripQuerySessionID(url) - '?' 
- - >>> url = "?CFID=4308017&CFTOKEN=63914124&requestID=200608200458360%2E39414378" - >>> stripQuerySessionID(url) - '?requestID=200608200458360%2E39414378' - - """ for pattern in _RES_QUERY_SESSIONID: m = pattern.match(path) if m: @@ -171,18 +69,8 @@ def stripQuerySessionID(path): # hostToSURT #_______________________________________________________________________________ def hostToSURT(host): - """This doctest comes from IAURLCanonicalizerTest.java: - >>> hostToSURT("www.archive.org") - 'org,archive,www' - """ # TODO: ensure we DONT reverse IP addresses! parts = host.split('.') parts.reverse() return ','.join(parts) -# main() -#_______________________________________________________________________________ -if __name__ == "__main__": - import doctest - doctest.testmod() - diff --git a/surt/handyurl.py b/surt/handyurl.py index 75cb92f..d2fb984 100755 --- a/surt/handyurl.py +++ b/surt/handyurl.py @@ -80,56 +80,6 @@ def parse(cls, url): u"""This method was in the java URLParser class, but we don't need a whole class to parse a url, when we can just use python's urlparse. - These doctests come from URLParserTest.java: - - >>> handyurl.parse("http://www.archive.org/index.html#foo").geturl() - 'http://www.archive.org/index.html#foo' - - >>> handyurl.parse("http://www.archive.org/").geturl() - 'http://www.archive.org/' - - >>> handyurl.parse("http://www.archive.org").geturl() - 'http://www.archive.org' - - >>> handyurl.parse("http://www.archive.org?").geturl() - 'http://www.archive.org?' 
- - >>> handyurl.parse("http://www.archive.org:8080/index.html?query#foo").geturl() - 'http://www.archive.org:8080/index.html?query#foo' - - >>> handyurl.parse("http://www.archive.org:8080/index.html?#foo").geturl() - 'http://www.archive.org:8080/index.html#foo' - - >>> handyurl.parse("http://www.archive.org:8080?#foo").geturl() - 'http://www.archive.org:8080/#foo' - - >>> handyurl.parse(u"http://bücher.ch:8080?#foo").geturl() - 'http://bücher.ch:8080/#foo' - - >>> handyurl.parse(u"dns:bücher.ch").geturl() - 'dns:bücher.ch' - - >>> print(handyurl.parse(u"http://bücher.ch:8080?#foo").geturl()) - http://b\xfccher.ch:8080/#foo - - >>> print(handyurl.parse(u"dns:bücher.ch").geturl()) - dns:b\xfccher.ch - - ###From Tymm: - >>> handyurl.parse("http:////////////////www.vikings.com").geturl() - 'http://www.vikings.com/' - >>> handyurl.parse("http://https://order.1and1.com").geturl() - 'https://order.1and1.com' - - ###From Common Crawl, host ends with ':' without a port number - >>> handyurl.parse("http://mineral.galleries.com:/minerals/silicate/chabazit/chabazit.htm").geturl() - 'http://mineral.galleries.com/minerals/silicate/chabazit/chabazit.htm' - - >>> handyurl.parse("mailto:bot@archive.org").scheme - 'mailto' - - >>> handyurl.parse("mailto:bot@archive.org").geturl() - 'mailto:bot@archive.org' """ # Note RE_SPACES does not match regular space (0x20). That is, # regular spaces are removed at head and tail, but not in the middle. @@ -273,26 +223,7 @@ def getURLString(self, def getPublicSuffix(self): """Uses the tldextract module to get the public suffix via the Public Suffix List. 
- - These doctests are based off the ones found in HandyURLTest.java: - - >>> h = handyurl(host='www.fool.com') - >>> h.getPublicSuffix() - 'fool.com' - - >>> h = handyurl(host='www.amazon.co.uk') - >>> h.getPublicSuffix() - 'amazon.co.uk' - - >>> h = handyurl(host='www.images.amazon.co.uk') - >>> h.getPublicSuffix() - 'amazon.co.uk' - - >>> h = handyurl(host='funky-images.fancy.co.jp') - >>> h.getPublicSuffix() - 'fancy.co.jp' """ - r = tldextract.extract(self.host) return "%s.%s" % (r.domain, r.tld) @@ -302,24 +233,6 @@ def getPublicSuffix(self): def getPublicPrefix(self): """Uses the tldextract module to get the subdomain, using the Public Suffix List. - - These doctests are based off the ones found in HandyURLTest.java: - - >>> h = handyurl(host='www.fool.com') - >>> h.getPublicPrefix() - 'www' - - >>> h = handyurl(host='www.amazon.co.uk') - >>> h.getPublicPrefix() - 'www' - - >>> h = handyurl(host='www.images.amazon.co.uk') - >>> h.getPublicPrefix() - 'www.images' - - >>> h = handyurl(host='funky-images.fancy.co.jp') - >>> h.getPublicPrefix() - 'funky-images' """ return tldextract.extract(self.host).subdomain @@ -330,10 +243,3 @@ def getPublicPrefix(self): #def __repr__(self): # return u"""handyurl(scheme=%s, authUser=%s, authPass=%s, host=%s, port=%s, path=%s, query=%s, hash=%s)""".encode('utf-8') % (self.scheme, self.authUser, self.authPass, self.host, self.port, self.path, self.query, self.hash) - - -# main() -#_______________________________________________________________________________ -if __name__ == "__main__": - import doctest - doctest.testmod() diff --git a/surt/surt.py b/surt/surt.py index 9774ca9..8a937bd 100755 --- a/surt/surt.py +++ b/surt/surt.py @@ -52,102 +52,6 @@ def _normalize(canonicalizer): # surt() #_______________________________________________________________________________ def surt(url, canonicalizer=None, **options): - """ - These doctests are from WaybackURLKeyMakerTest.java - - >>> surt(None) - '-' - >>> surt('') - '-' - >>> 
surt("filedesc:foo.arc.gz") - 'filedesc:foo.arc.gz' - >>> surt("filedesc:/foo.arc.gz") - 'filedesc:/foo.arc.gz' - >>> surt("filedesc://foo.arc.gz") - 'filedesc://foo.arc.gz' - >>> surt("warcinfo:foo.warc.gz") - 'warcinfo:foo.warc.gz' - >>> surt("dns:alexa.com") - 'com,alexa)' - >>> surt("dns:archive.org") - 'org,archive)' - - >>> surt("http://www.archive.org/") - 'org,archive)/' - >>> surt("http://archive.org/") - 'org,archive)/' - >>> surt("http://archive.org/goo/") - 'org,archive)/goo' - >>> surt("http://archive.org/goo/?") - 'org,archive)/goo' - >>> surt("http://archive.org/goo/?b&a") - 'org,archive)/goo?a&b' - >>> surt("http://archive.org/goo/?a=2&b&a=1") - 'org,archive)/goo?a=1&a=2&b' - - # trailing comma mode - >>> surt("http://archive.org/goo/?a=2&b&a=1", trailing_comma=True) - 'org,archive,)/goo?a=1&a=2&b' - - >>> surt("dns:archive.org", trailing_comma=True) - 'org,archive,)' - - >>> surt("warcinfo:foo.warc.gz", trailing_comma=True) - 'warcinfo:foo.warc.gz' - - PHP session id: - >>> surt("http://archive.org/index.php?PHPSESSID=0123456789abcdefghijklemopqrstuv&action=profile;u=4221") - 'org,archive)/index.php?action=profile;u=4221' - - WHOIS url: - >>> surt("whois://whois.isoc.org.il/shaveh.co.il") - 'whois://whois.isoc.org.il/shaveh.co.il' - - Yahoo web bug. 
See https://github.com/internetarchive/surt/issues/1 - >>> surt('http://visit.webhosting.yahoo.com/visit.gif?&r=http%3A//web.archive.org/web/20090517140029/http%3A//anthonystewarthead.electric-chi.com/&b=Netscape%205.0%20%28Windows%3B%20en-US%29&s=1366x768&o=Win32&c=24&j=true&v=1.2') - 'com,yahoo,webhosting,visit)/visit.gif?&b=netscape%205.0%20(windows;%20en-us)&c=24&j=true&o=win32&r=http://web.archive.org/web/20090517140029/http://anthonystewarthead.electric-chi.com/&s=1366x768&v=1.2' - - Simple customization: - >>> surt("http://www.example.com/", canonicalizer=lambda x, **opts: x) - 'com,example,www)/' - - >>> surt("mailto:foo@example.com") - 'mailto:foo@example.com' - - >>> surt("http://www.example.com/", with_scheme=True) - 'http://(com,example)/' - - >>> surt("http://www.example.com/", with_scheme=True, host_massage=True) - 'http://(com,example)/' - - >>> surt("http://www.example.com/", with_scheme=False) - 'com,example)/' - - >>> surt("http://www.example.com/", with_scheme=True, trailing_comma=True) - 'http://(com,example,)/' - - >>> surt("https://www.example.com/", with_scheme=True, trailing_comma=True) - 'https://(com,example,)/' - - >>> surt("ftp://www.example.com/", with_scheme=True, trailing_comma=True) - 'ftp://(com,example,)/' - - >>> surt("http://www.example.com/", with_scheme=True, host_massage=False) - 'http://(com,example,www)/' - - >>> surt("http://www.example.com/", with_scheme=False, host_massage=False) - 'com,example,www)/' - - >>> surt("http://www.example.com/", with_scheme=True, trailing_comma=True, host_massage=False) - 'http://(com,example,www,)/' - - >>> surt("https://www.example.com/", with_scheme=True, trailing_comma=True, host_massage=False) - 'https://(com,example,www,)/' - - >>> surt("ftp://www.example.com/", with_scheme=True, trailing_comma=True, host_massage=False) - 'ftp://(com,example,www,)/' - """ - if not url: return "-" @@ -190,9 +94,3 @@ def surt(url, canonicalizer=None, **options): else: return key - -# main() 
-#_______________________________________________________________________________ -if __name__ == "__main__": - import doctest - doctest.testmod() diff --git a/tests/test_surt.py b/tests/test_surt.py new file mode 100644 index 0000000..5e8461e --- /dev/null +++ b/tests/test_surt.py @@ -0,0 +1,308 @@ +# -*- coding: utf-8 -*- + +from __future__ import absolute_import + +import surt +from surt import handyurl + +def test_handyurl_parse(): + # These tests come from URLParserTest.java + assert handyurl.parse("http://www.archive.org/index.html#foo").geturl() == 'http://www.archive.org/index.html#foo' + assert handyurl.parse("http://www.archive.org/").geturl() == 'http://www.archive.org/' + assert handyurl.parse("http://www.archive.org").geturl() == 'http://www.archive.org' + assert handyurl.parse("http://www.archive.org?").geturl() == 'http://www.archive.org?' + assert handyurl.parse("http://www.archive.org:8080/index.html?query#foo").geturl() == 'http://www.archive.org:8080/index.html?query#foo' + assert handyurl.parse("http://www.archive.org:8080/index.html?#foo").geturl() == 'http://www.archive.org:8080/index.html#foo' + assert handyurl.parse("http://www.archive.org:8080?#foo").geturl() == 'http://www.archive.org:8080/#foo' + assert handyurl.parse(u"http://bücher.ch:8080?#foo").geturl() == u'http://bücher.ch:8080/#foo' + assert handyurl.parse(u"dns:bücher.ch").geturl() == u'dns:bücher.ch' + # XXX assert print(handyurl.parse(u"http://bücher.ch:8080?#foo").geturl()) == http://b\xfccher.ch:8080/#foo + # XXX assert print(handyurl.parse(u"dns:bücher.ch").geturl()) == dns:b\xfccher.ch + assert handyurl.parse(u"http://bücher.ch:8080?#foo").geturl() == u"http://b\xfccher.ch:8080/#foo" + assert handyurl.parse(u"dns:bücher.ch").geturl() == u"dns:b\xfccher.ch" + + ###From Tymm: + assert handyurl.parse("http:////////////////www.vikings.com").geturl() == 'http://www.vikings.com/' + assert handyurl.parse("http://https://order.1and1.com").geturl() == 'https://order.1and1.com' + + 
###From Common Crawl, host ends with ':' without a port number + assert handyurl.parse("http://mineral.galleries.com:/minerals/silicate/chabazit/chabazit.htm").geturl() == 'http://mineral.galleries.com/minerals/silicate/chabazit/chabazit.htm' + + assert handyurl.parse("mailto:bot@archive.org").scheme == 'mailto' + assert handyurl.parse("mailto:bot@archive.org").geturl() == 'mailto:bot@archive.org' + +def test_getPublicSuffix(): + # These tests are based off the ones found in HandyURLTest.java + assert handyurl(host='www.fool.com').getPublicSuffix() == 'fool.com' + assert handyurl(host='www.amazon.co.uk').getPublicSuffix() == 'amazon.co.uk' + assert handyurl(host='www.images.amazon.co.uk').getPublicSuffix() == 'amazon.co.uk' + assert handyurl(host='funky-images.fancy.co.jp').getPublicSuffix() == 'fancy.co.jp' + +def test_getPublicPrefix(): + # These tests are based off the ones found in HandyURLTest.java + assert handyurl(host='www.fool.com').getPublicPrefix() == 'www' + assert handyurl(host='www.amazon.co.uk').getPublicPrefix() == 'www' + assert handyurl(host='www.images.amazon.co.uk').getPublicPrefix() == 'www.images' + assert handyurl(host='funky-images.fancy.co.jp').getPublicPrefix() == 'funky-images' + +def test_DefaultIAURLCanonicalizer(): + # These tests are from DefaultIAURLCanonicalizerTest.java + assert surt.DefaultIAURLCanonicalizer.canonicalize(handyurl.parse("http://www.alexa.com/")).getURLString() == 'http://alexa.com/' + assert surt.DefaultIAURLCanonicalizer.canonicalize(handyurl.parse("http://archive.org/index.html")).getURLString() == 'http://archive.org/index.html' + assert surt.DefaultIAURLCanonicalizer.canonicalize(handyurl.parse("http://archive.org/index.html?")).getURLString() == 'http://archive.org/index.html' + assert surt.DefaultIAURLCanonicalizer.canonicalize(handyurl.parse("http://archive.org/index.html?a=b")).getURLString() == 'http://archive.org/index.html?a=b' + assert 
surt.DefaultIAURLCanonicalizer.canonicalize(handyurl.parse("http://archive.org/index.html?b=b&a=b")).getURLString() == 'http://archive.org/index.html?a=b&b=b' + assert surt.DefaultIAURLCanonicalizer.canonicalize(handyurl.parse("http://archive.org/index.html?b=a&b=b&a=b")).getURLString() == 'http://archive.org/index.html?a=b&b=a&b=b' + assert surt.DefaultIAURLCanonicalizer.canonicalize(handyurl.parse("http://www34.archive.org/index.html?b=a&b=b&a=b")).getURLString() == 'http://archive.org/index.html?a=b&b=a&b=b' + +def test_GoogleURLCanonicalizer(): + # The tests are copied from GoogleURLCanonicalizerTest.java + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://host/%25%32%35")).getURLString() == 'http://host/%25' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://host/%25%32%35%25%32%35")).getURLString() == 'http://host/%25%25' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://host/%2525252525252525")).getURLString() == 'http://host/%25' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://host/asdf%25%32%35asd")).getURLString() == 'http://host/asdf%25asd' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://host/%%%25%32%35asd%%")).getURLString() == 'http://host/%25%25%25asd%25%25' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://www.google.com/")).getURLString() == 'http://www.google.com/' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://%31%36%38%2e%31%38%38%2e%39%39%2e%32%36/%2E%73%65%63%75%72%65/%77%77%77%2E%65%62%61%79%2E%63%6F%6D/")).getURLString() == 'http://168.188.99.26/.secure/www.ebay.com/' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://195.127.0.11/uploads/%20%20%20%20/.verify/.eBaysecure=updateuserdataxplimnbqmn-xplmvalidateinfoswqpcmlx=hgplmcx/")).getURLString() == 
'http://195.127.0.11/uploads/%20%20%20%20/.verify/.eBaysecure=updateuserdataxplimnbqmn-xplmvalidateinfoswqpcmlx=hgplmcx/' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://host%23.com/%257Ea%2521b%2540c%2523d%2524e%25f%255E00%252611%252A22%252833%252944_55%252B")).getURLString() == 'http://host%23.com/~a!b@c%23d$e%25f^00&11*22(33)44_55+' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://3279880203/blah")).getURLString() == 'http://195.127.0.11/blah' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://www.google.com/blah/..")).getURLString() == 'http://www.google.com/' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("www.google.com/")).getURLString() == 'http://www.google.com/' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("www.google.com")).getURLString() == 'http://www.google.com/' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://www.evil.com/blah#frag")).getURLString() == 'http://www.evil.com/blah' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://www.GOOgle.com/")).getURLString() == 'http://www.google.com/' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://www.google.com.../")).getURLString() == 'http://www.google.com/' + + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://www.google.com/foo\tbar\rbaz\n2")).getURLString() == 'http://www.google.com/foobarbaz2' + + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://www.google.com/q?")).getURLString() == 'http://www.google.com/q?' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://www.google.com/q?r?")).getURLString() == 'http://www.google.com/q?r?' 
+ assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://www.google.com/q?r?s")).getURLString() == 'http://www.google.com/q?r?s' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://evil.com/foo#bar#baz")).getURLString() == 'http://evil.com/foo' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://evil.com/foo;")).getURLString() == 'http://evil.com/foo;' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://evil.com/foo?bar;")).getURLString() == 'http://evil.com/foo?bar;' + + #This test case differs from the Java version. The Java version returns + #'http://%01%80.com/' for this case. If idna/punycode encoding of a hostname + #is not possible, the python version encodes unicode domains as utf-8 before + #percent encoding, so we get 'http://%01%C2%80.com/' + # assert print(canonicalize(handyurl.parse(u"http://\u0001\u0080.com/")).getURLString()) http://%01%C2%80.com/ + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse(u"http://\u0001\u0080.com/")).getURLString() == 'http://%01%C2%80.com/' + + #Add these unicode tests: + # assert print(canonicalize(handyurl.parse(u'B\xfccher.ch:8080')).getURLString()) http://xn--bcher-kva.ch:8080/ + # assert print(canonicalize(handyurl.parse('☃.com')).getURLString()) == http://xn--n3h.com/ + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse(u'B\xfccher.ch:8080')).getURLString() == 'http://xn--bcher-kva.ch:8080/' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse('☃.com')).getURLString() == 'http://xn--n3h.com/' + + #Add these percent-encoded unicode tests + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://www.t%EF%BF%BD%04.82.net/")).getURLString() == 'http://www.t%EF%BF%BD%04.82.net/' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://notrailingslash.com")).getURLString() == 'http://notrailingslash.com/' + assert 
surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://www.gotaport.com:1234/")).getURLString() == 'http://www.gotaport.com:1234/' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse(" http://www.google.com/ ")).getURLString() == 'http://www.google.com/' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http:// leadingspace.com/")).getURLString() == 'http://%20leadingspace.com/' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://%20leadingspace.com/")).getURLString() == 'http://%20leadingspace.com/' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("%20leadingspace.com/")).getURLString() == 'http://%20leadingspace.com/' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("https://www.securesite.com/")).getURLString() == 'https://www.securesite.com/' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://host.com/ab%23cd")).getURLString() == 'http://host.com/ab%23cd' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://host.com//twoslashes?more//slashes")).getURLString() == 'http://host.com/twoslashes?more//slashes' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("mailto:foo@example.com")).getURLString() == 'mailto:foo@example.com' + +def test_attemptIPFormats(): + # The tests are copied from GoogleURLCanonicalizerTest.java + assert surt.GoogleURLCanonicalizer.attemptIPFormats(None) is None + assert surt.GoogleURLCanonicalizer.attemptIPFormats("www.foo.com") is None + assert surt.GoogleURLCanonicalizer.attemptIPFormats("127.0.0.1") == '127.0.0.1' + assert surt.GoogleURLCanonicalizer.attemptIPFormats("017.0.0.1") == '15.0.0.1' + assert surt.GoogleURLCanonicalizer.attemptIPFormats("168.188.99.26") == '168.188.99.26' + #java version returns null, ours returns the correct ipv4 + assert surt.GoogleURLCanonicalizer.attemptIPFormats("10.0.258") == '10.0.1.2' + assert surt.GoogleURLCanonicalizer.attemptIPFormats("1.2.3.256") is None 
#returns None + + # ARC files from the wayback machine's liveweb proxy contain numeric + # hostnames > 2^32 for some reason. We'll copy the behavior of the java code. + assert surt.GoogleURLCanonicalizer.attemptIPFormats("39024579298") == '22.11.210.226' + +def test_unescapeRepeatedly(): + # The tests are copied from GoogleURLCanonicalizerTest.java + assert surt.GoogleURLCanonicalizer.unescapeRepeatedly("%!A%21%21%25") == '%!A!!%' + assert surt.GoogleURLCanonicalizer.unescapeRepeatedly("%") == '%' + assert surt.GoogleURLCanonicalizer.unescapeRepeatedly("%2") == '%2' + assert surt.GoogleURLCanonicalizer.unescapeRepeatedly("%25") == '%' + assert surt.GoogleURLCanonicalizer.unescapeRepeatedly("%25%") == '%%' + assert surt.GoogleURLCanonicalizer.unescapeRepeatedly("%2525") == '%' + assert surt.GoogleURLCanonicalizer.unescapeRepeatedly("%252525") == '%' + assert surt.GoogleURLCanonicalizer.unescapeRepeatedly("%25%32%35") == '%' + +def test_IAURLCanonicalizer(): + # These tests are from IAURLCanonicalizerTest.java + assert surt.IAURLCanonicalizer.canonicalize(handyurl.parse("http://ARCHIVE.ORG/")).getURLString() == 'http://archive.org/' + assert surt.IAURLCanonicalizer.canonicalize(handyurl.parse("http://www.archive.org:80/")).getURLString() == 'http://archive.org/' + assert surt.IAURLCanonicalizer.canonicalize(handyurl.parse("https://www.archive.org:80/")).getURLString() == 'https://archive.org:80/' + assert surt.IAURLCanonicalizer.canonicalize(handyurl.parse("http://www.archive.org:443/")).getURLString() == 'http://archive.org:443/' + assert surt.IAURLCanonicalizer.canonicalize(handyurl.parse("https://www.archive.org:443/")).getURLString() == 'https://archive.org/' + assert surt.IAURLCanonicalizer.canonicalize(handyurl.parse("http://www.archive.org/big/")).getURLString() == 'http://archive.org/big' + assert surt.IAURLCanonicalizer.canonicalize(handyurl.parse("dns:www.archive.org")).getURLString() == 'dns:www.archive.org' + +def test_alphaReorderQuery(): + # These tests 
are from IAURLCanonicalizerTest.java + assert surt.IAURLCanonicalizer.alphaReorderQuery(None) is None + assert surt.IAURLCanonicalizer.alphaReorderQuery("") == '' + assert surt.IAURLCanonicalizer.alphaReorderQuery("") == '' + assert surt.IAURLCanonicalizer.alphaReorderQuery("a") == 'a' + assert surt.IAURLCanonicalizer.alphaReorderQuery("ab") == 'ab' + assert surt.IAURLCanonicalizer.alphaReorderQuery("a=1") == 'a=1' + assert surt.IAURLCanonicalizer.alphaReorderQuery("ab=1") == 'ab=1' + assert surt.IAURLCanonicalizer.alphaReorderQuery("a=1&") == '&a=1' + assert surt.IAURLCanonicalizer.alphaReorderQuery("a=1&b=1") == 'a=1&b=1' + assert surt.IAURLCanonicalizer.alphaReorderQuery("b=1&a=1") == 'a=1&b=1' + assert surt.IAURLCanonicalizer.alphaReorderQuery("a=a&a=a") == 'a=a&a=a' + assert surt.IAURLCanonicalizer.alphaReorderQuery("a=b&a=a") == 'a=a&a=b' + assert surt.IAURLCanonicalizer.alphaReorderQuery("b=b&a=b&b=a&a=a") == 'a=a&a=b&b=a&b=b' + +def test_massageHost(): + # These tests are from IAURLCanonicalizerTest.java + assert surt.IAURLCanonicalizer.massageHost("foo.com") == 'foo.com' + assert surt.IAURLCanonicalizer.massageHost("www.foo.com") == 'foo.com' + assert surt.IAURLCanonicalizer.massageHost("www12.foo.com") == 'foo.com' + + assert surt.IAURLCanonicalizer.massageHost("www2foo.com") == 'www2foo.com' + assert surt.IAURLCanonicalizer.massageHost("www2.www2foo.com") == 'www2foo.com' + +def test_getDefaultPort(): + # These tests are from IAURLCanonicalizerTest.java + assert surt.IAURLCanonicalizer.getDefaultPort("foo") == 0 + assert surt.IAURLCanonicalizer.getDefaultPort("http") == 80 + assert surt.IAURLCanonicalizer.getDefaultPort("https") == 443 + +def test_stripPathSessionID(): + # These tests are from IAURLCanonicalizerTest.java + # Check ASP_SESSIONID2: + assert surt.URLRegexTransformer.stripPathSessionID("/(S(4hqa0555fwsecu455xqckv45))/mileg.aspx") == '/mileg.aspx' + + # Check ASP_SESSIONID2 (again): + assert 
surt.URLRegexTransformer.stripPathSessionID("/(4hqa0555fwsecu455xqckv45)/mileg.aspx") == '/mileg.aspx' + + # Check ASP_SESSIONID3: + assert surt.URLRegexTransformer.stripPathSessionID("/(a(4hqa0555fwsecu455xqckv45)S(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=sessionschedules") == '/mileg.aspx?page=sessionschedules' + + # '@' in path: + assert surt.URLRegexTransformer.stripPathSessionID("/photos/36050182@N05/") == '/photos/36050182@N05/' + + +def test_stripQuerySessionID(): + #base = "http://www.archive.org/index.html" + base = "" + str32id = "0123456789abcdefghijklemopqrstuv" + url = base + "?jsessionid=" + str32id + assert surt.URLRegexTransformer.stripQuerySessionID(url) == '?' + + # Test that we don't strip if not 32 chars only. + url = base + "?jsessionid=" + str32id + '0' + assert surt.URLRegexTransformer.stripQuerySessionID(url) == '?jsessionid=0123456789abcdefghijklemopqrstuv0' + + # Test what happens when followed by another key/value pair. + url = base + "?jsessionid=" + str32id + "&x=y" + assert surt.URLRegexTransformer.stripQuerySessionID(url) == '?x=y' + + # Test what happens when followed by another key/value pair and + # prefixed by a key/value pair. + url = base + "?one=two&jsessionid=" + str32id + "&x=y" + assert surt.URLRegexTransformer.stripQuerySessionID(url) == '?one=two&x=y' + + # Test what happens when prefixed by a key/value pair. + url = base + "?one=two&jsessionid=" + str32id + assert surt.URLRegexTransformer.stripQuerySessionID(url) == '?one=two&' + + # Test aspsession. + url = base + "?aspsessionidABCDEFGH=" + "ABCDEFGHIJKLMNOPQRSTUVWX" + "&x=y" + assert surt.URLRegexTransformer.stripQuerySessionID(url) == '?x=y' + + # Test archive phpsession. + url = base + "?phpsessid=" + str32id + "&x=y" + assert surt.URLRegexTransformer.stripQuerySessionID(url) == '?x=y' + + # With prefix too. 
+ url = base + "?one=two&phpsessid=" + str32id + "&x=y" + assert surt.URLRegexTransformer.stripQuerySessionID(url) == '?one=two&x=y' + + # With only prefix + url = base + "?one=two&phpsessid=" + str32id + assert surt.URLRegexTransformer.stripQuerySessionID(url) == '?one=two&' + + # Test sid. + url = base + "?" + "sid=9682993c8daa2c5497996114facdc805" + "&x=y"; + assert surt.URLRegexTransformer.stripQuerySessionID(url) == '?x=y' + + # Igor test. + url = base + "?" + "sid=9682993c8daa2c5497996114facdc805" + "&" + "jsessionid=" + str32id + assert surt.URLRegexTransformer.stripQuerySessionID(url) == '?' + + url = "?CFID=1169580&CFTOKEN=48630702&dtstamp=22%2F08%2F2006%7C06%3A58%3A11" + assert surt.URLRegexTransformer.stripQuerySessionID(url) == '?dtstamp=22%2F08%2F2006%7C06%3A58%3A11' + + url = "?CFID=12412453&CFTOKEN=15501799&dt=19_08_2006_22_39_28" + assert surt.URLRegexTransformer.stripQuerySessionID(url) == '?dt=19_08_2006_22_39_28' + + url = "?CFID=14475712&CFTOKEN=2D89F5AF-3048-2957-DA4EE4B6B13661AB&r=468710288378&m=forgotten" + assert surt.URLRegexTransformer.stripQuerySessionID(url) == '?r=468710288378&m=forgotten' + + url = "?CFID=16603925&CFTOKEN=2AE13EEE-3048-85B0-56CEDAAB0ACA44B8" + assert surt.URLRegexTransformer.stripQuerySessionID(url) == '?' 
+ + url = "?CFID=4308017&CFTOKEN=63914124&requestID=200608200458360%2E39414378" + assert surt.URLRegexTransformer.stripQuerySessionID(url) == '?requestID=200608200458360%2E39414378' + +def test_hostToSURT(): + assert surt.URLRegexTransformer.hostToSURT("www.archive.org") == 'org,archive,www' + + +def test_surt(): + # These tests are from WaybackURLKeyMakerTest.java + + assert surt.surt(None) == '-' + assert surt.surt('') == '-' + assert surt.surt("filedesc:foo.arc.gz") == 'filedesc:foo.arc.gz' + assert surt.surt("filedesc:/foo.arc.gz") == 'filedesc:/foo.arc.gz' + assert surt.surt("filedesc://foo.arc.gz") == 'filedesc://foo.arc.gz' + assert surt.surt("warcinfo:foo.warc.gz") == 'warcinfo:foo.warc.gz' + assert surt.surt("dns:alexa.com") == 'com,alexa)' + assert surt.surt("dns:archive.org") == 'org,archive)' + + assert surt.surt("http://www.archive.org/") == 'org,archive)/' + assert surt.surt("http://archive.org/") == 'org,archive)/' + assert surt.surt("http://archive.org/goo/") == 'org,archive)/goo' + assert surt.surt("http://archive.org/goo/?") == 'org,archive)/goo' + assert surt.surt("http://archive.org/goo/?b&a") == 'org,archive)/goo?a&b' + assert surt.surt("http://archive.org/goo/?a=2&b&a=1") == 'org,archive)/goo?a=1&a=2&b' + + # trailing comma mode + assert surt.surt("http://archive.org/goo/?a=2&b&a=1", trailing_comma=True) == 'org,archive,)/goo?a=1&a=2&b' + assert surt.surt("dns:archive.org", trailing_comma=True) == 'org,archive,)' + assert surt.surt("warcinfo:foo.warc.gz", trailing_comma=True) == 'warcinfo:foo.warc.gz' + + # PHP session id: + assert surt.surt("http://archive.org/index.php?PHPSESSID=0123456789abcdefghijklemopqrstuv&action=profile;u=4221") == 'org,archive)/index.php?action=profile;u=4221' + + # WHOIS url: + assert surt.surt("whois://whois.isoc.org.il/shaveh.co.il") == 'whois://whois.isoc.org.il/shaveh.co.il' + + # Yahoo web bug. 
See https://github.com/internetarchive/surt/issues/1 + assert surt.surt('http://visit.webhosting.yahoo.com/visit.gif?&r=http%3A//web.archive.org/web/20090517140029/http%3A//anthonystewarthead.electric-chi.com/&b=Netscape%205.0%20%28Windows%3B%20en-US%29&s=1366x768&o=Win32&c=24&j=true&v=1.2') == 'com,yahoo,webhosting,visit)/visit.gif?&b=netscape%205.0%20(windows;%20en-us)&c=24&j=true&o=win32&r=http://web.archive.org/web/20090517140029/http://anthonystewarthead.electric-chi.com/&s=1366x768&v=1.2' + + # Simple customization: + assert surt.surt("http://www.example.com/", canonicalizer=lambda x, **opts: x) == 'com,example,www)/' + assert surt.surt("mailto:foo@example.com") == 'mailto:foo@example.com' + assert surt.surt("http://www.example.com/", with_scheme=True) == 'http://(com,example)/' + assert surt.surt("http://www.example.com/", with_scheme=True, host_massage=True) == 'http://(com,example)/' + assert surt.surt("http://www.example.com/", with_scheme=False) == 'com,example)/' + assert surt.surt("http://www.example.com/", with_scheme=True, trailing_comma=True) == 'http://(com,example,)/' + assert surt.surt("https://www.example.com/", with_scheme=True, trailing_comma=True) == 'https://(com,example,)/' + assert surt.surt("ftp://www.example.com/", with_scheme=True, trailing_comma=True) == 'ftp://(com,example,)/' + assert surt.surt("http://www.example.com/", with_scheme=True, host_massage=False) == 'http://(com,example,www)/' + assert surt.surt("http://www.example.com/", with_scheme=False, host_massage=False) == 'com,example,www)/' + assert surt.surt("http://www.example.com/", with_scheme=True, trailing_comma=True, host_massage=False) == 'http://(com,example,www,)/' + assert surt.surt("https://www.example.com/", with_scheme=True, trailing_comma=True, host_massage=False) == 'https://(com,example,www,)/' + assert surt.surt("ftp://www.example.com/", with_scheme=True, trailing_comma=True, host_massage=False) == 'ftp://(com,example,www,)/' + From 
f1aac724bb6f1602e92f3aeb35181713bc9f4b1e Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 29 Oct 2015 19:06:35 +0000 Subject: [PATCH 17/32] remove python 3.2 test which fails due to https://github.com/menegazzo/travispy/issues/20 --- .travis.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index d6fb14a..db0b8e6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,7 +9,6 @@ language: python env: - TOXENV=py26 - TOXENV=py27 - - TOXENV=py32 - TOXENV=py33 - TOXENV=py34 From c16d673b7a463bc6209a818839eea6820210bbfc Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 29 Oct 2015 19:10:13 +0000 Subject: [PATCH 18/32] let's see if tests pass on travis-ci for all these versions of python... --- .travis.yml | 4 ++++ tox.ini | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index db0b8e6..db0d07c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,6 +11,10 @@ env: - TOXENV=py27 - TOXENV=py33 - TOXENV=py34 + - TOXENV=py35 + - TOXENV=jython + - TOXENV=pypy + - TOXENV=pypy3 before_install: - sudo apt-get update diff --git a/tox.ini b/tox.ini index 8b6ba6d..99aefa0 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. 
[tox] -envlist = py26, py27, py33, py34 +envlist = py26, py27, py33, py34, py35, jython, pypy, pypy3 [testenv] commands = python setup.py test From dc74830ba008388a9624e84cca123697dac50b61 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 29 Oct 2015 19:19:35 +0000 Subject: [PATCH 19/32] fiddling with how tests run --- .travis.yml | 27 +++++++++++---------------- setup.py | 2 +- tox.ini | 4 ++-- 3 files changed, 14 insertions(+), 19 deletions(-) diff --git a/.travis.yml b/.travis.yml index db0d07c..1922529 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,20 +1,15 @@ -# vim: set sw=4 et: -# -# tox approach stolen from -# https://github.com/pypa/pip/blob/abdb597dbfb51b21cc76c1cff068b72c80f3a77d/.travis.yml -# - language: python -env: - - TOXENV=py26 - - TOXENV=py27 - - TOXENV=py33 - - TOXENV=py34 - - TOXENV=py35 - - TOXENV=jython - - TOXENV=pypy - - TOXENV=pypy3 +python: + - 2.6 + - 2.7 + - 3.3 + - 3.4 + - 3.5 + - 3.5-dev # 3.5 development branch + - nightly # currently points to 3.6-dev + - pypy # currently points to 3.6-dev + - pypy3 # currently points to 3.6-dev before_install: - sudo apt-get update @@ -23,7 +18,7 @@ before_install: before_script: - pip install tox -script: tox +script: py.test -v --cov surt tests/ #after_success: #coveralls diff --git a/setup.py b/setup.py index 4031299..4993c29 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ def finalize_options(self): def run_tests(self): import pytest import sys - cmdline = ' -v --doctest-modules --cov surt tests/' + cmdline = ' -v --cov surt tests/' errcode = pytest.main(cmdline) sys.exit(errcode) diff --git a/tox.ini b/tox.ini index 99aefa0..fe18a1c 100644 --- a/tox.ini +++ b/tox.ini @@ -4,10 +4,10 @@ # and then run "tox" from this directory. 
[tox] -envlist = py26, py27, py33, py34, py35, jython, pypy, pypy3 +envlist = py26, py27, py33, py34, py35, pypy, pypy3 [testenv] -commands = python setup.py test +commands = py.test -v --cov surt tests/ deps = pytest pytest-cov From e4ef467e369207d50a4fe3909eac5b0db500cef6 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 29 Oct 2015 20:13:41 +0000 Subject: [PATCH 20/32] try again travis --- .travis.yml | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/.travis.yml b/.travis.yml index 1922529..71851af 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,14 +11,9 @@ python: - pypy # currently points to 3.6-dev - pypy3 # currently points to 3.6-dev -before_install: - - sudo apt-get update - - pip install coveralls --use-mirrors +script: + - py.test -v --cov surt tests/ -before_script: - - pip install tox +install: + - pip install . pytest coveralls -script: py.test -v --cov surt tests/ - -#after_success: - #coveralls From a2073fac829214f311eae9c278c31a096a668407 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 29 Oct 2015 20:25:28 +0000 Subject: [PATCH 21/32] fix coveralls dependency --- .travis.yml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index 71851af..7c38dd1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,9 +11,6 @@ python: - pypy # currently points to 3.6-dev - pypy3 # currently points to 3.6-dev -script: - - py.test -v --cov surt tests/ - -install: - - pip install . pytest coveralls +install: pip install . 
pytest pytest-cov +script: py.test -v --cov=surt tests/ From 64e992997e16074a87ed43e75e634715082c4506 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 30 Oct 2015 00:37:24 +0000 Subject: [PATCH 22/32] remove incorrect comments --- .travis.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index 7c38dd1..2e51b89 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,10 +6,10 @@ python: - 3.3 - 3.4 - 3.5 - - 3.5-dev # 3.5 development branch - - nightly # currently points to 3.6-dev - - pypy # currently points to 3.6-dev - - pypy3 # currently points to 3.6-dev + - 3.5-dev + - nightly + - pypy + - pypy3 install: pip install . pytest pytest-cov script: py.test -v --cov=surt tests/ From 524677944c5cf9259757da23909448323bc471b4 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 2 Nov 2015 21:46:28 +0000 Subject: [PATCH 23/32] update links --- README.md | 8 ++++---- setup.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 772bbd1..afd1665 100644 --- a/README.md +++ b/README.md @@ -12,17 +12,17 @@ Installation: Or install the dev version from git: - pip install git+git://github.com/rajbot/surt#egg=surt + pip install git+https://github.com/internetarchive/surt.git#egg=surt More information about SURTs: http://crawler.archive.org/articles/user_manual/glossary.html#surt -This is mostly a python port of the archive-commons org.archive.url package. +This is mostly a python port of the webarchive-commons org.archive.url package. The original java version of the org.archive.url package is here: -http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/ +https://github.com/iipc/webarchive-commons/tree/master/src/main/java/org/archive/url This module depends on the `tldextract` module to query the Public Suffix List. 
`tldextract` can be installed via `pip` -[![Build Status](https://secure.travis-ci.org/rajbot/surt.png?branch=master)](http://travis-ci.org/rajbot/surt) +[![Build Status](https://travis-ci.org/internetarchive/surt.svg)](https://travis-ci.org/internetarchive/surt) diff --git a/setup.py b/setup.py index 4993c29..d8ae970 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ def run_tests(self): ], description='Sort-friendly URI Reordering Transform (SURT) python package.', long_description=open('README.md').read(), - url='https://github.com/rajbot/surt', + url='https://github.com/internetarchive/surt', zip_safe=True, install_requires=[ 'six', From 9348d098bfd743623f77a1b8fc399300bbe902fa Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 2 Nov 2015 21:50:18 +0000 Subject: [PATCH 24/32] switch to restructuredtext for pypi --- README.md => README.rst | 23 ++++++++++++++++------- setup.py | 2 +- 2 files changed, 17 insertions(+), 8 deletions(-) rename README.md => README.rst (54%) diff --git a/README.md b/README.rst similarity index 54% rename from README.md rename to README.rst index afd1665..251337c 100644 --- a/README.md +++ b/README.rst @@ -2,27 +2,36 @@ Sort-friendly URI Reordering Transform (SURT) python package. Usage: +:: + >>> from surt import surt >>> surt("http://archive.org/goo/?a=2&b&a=1") 'org,archive)/goo?a=1&a=2&b' Installation: +:: + pip install surt Or install the dev version from git: - pip install git+https://github.com/internetarchive/surt.git#egg=surt +:: + pip install git+https://github.com/internetarchive/surt.git#egg=surt More information about SURTs: -http://crawler.archive.org/articles/user_manual/glossary.html#surt +http://crawler.archive.org/articles/user\_manual/glossary.html#surt -This is mostly a python port of the webarchive-commons org.archive.url package. -The original java version of the org.archive.url package is here: +This is mostly a python port of the webarchive-commons org.archive.url +package. 
The original java version of the org.archive.url package is +here: https://github.com/iipc/webarchive-commons/tree/master/src/main/java/org/archive/url -This module depends on the `tldextract` module to query the Public Suffix -List. `tldextract` can be installed via `pip` +This module depends on the ``tldextract`` module to query the Public +Suffix List. ``tldextract`` can be installed via ``pip`` + +|Build Status| -[![Build Status](https://travis-ci.org/internetarchive/surt.svg)](https://travis-ci.org/internetarchive/surt) +.. |Build Status| image:: https://travis-ci.org/internetarchive/surt.svg + :target: https://travis-ci.org/internetarchive/surt diff --git a/setup.py b/setup.py index d8ae970..5755168 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ def run_tests(self): 'License :: OSI Approved :: GNU Affero General Public License v3', ], description='Sort-friendly URI Reordering Transform (SURT) python package.', - long_description=open('README.md').read(), + long_description=open('README.rst').read(), url='https://github.com/internetarchive/surt', zip_safe=True, install_requires=[ From 732ddd3e195062fa93e54a7cf251daa1995634ee Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 2 Nov 2015 21:52:57 +0000 Subject: [PATCH 25/32] switch to new travis-ci docker based build --- .travis.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 2e51b89..41411eb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,7 @@ -language: python +# http://docs.travis-ci.com/user/migrating-from-legacy/ +sudo: false +language: python python: - 2.6 - 2.7 From 94cf1414e8746671be6545c9bd5c79c8ad308e34 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 16 Dec 2015 00:04:15 +0000 Subject: [PATCH 26/32] remove special handling of dns: and warcinfo: urls, which have no authority section --- setup.py | 2 +- surt/handyurl.py | 13 ++----------- surt/surt.py | 13 ------------- tests/test_surt.py | 22 ++++++++++++++++++---- 4 files 
changed, 21 insertions(+), 29 deletions(-) diff --git a/setup.py b/setup.py index 5755168..85d06ef 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ def run_tests(self): setup(name='surt', - version='0.3b2', + version='0.3b3', author='rajbot', author_email='raj@archive.org', classifiers=[ diff --git a/surt/handyurl.py b/surt/handyurl.py index d2fb984..866268c 100755 --- a/surt/handyurl.py +++ b/surt/handyurl.py @@ -107,17 +107,8 @@ def parse(cls, url): o = o._replace(netloc=o.netloc.rstrip(':')) port = o.port or None - """One more special-case for dns urls or broken http urls. From the docs: - Following the syntax specifications in RFC 1808, urlparse recognizes - a netloc only if it is properly introduced by ‘//’. Otherwise the input - is presumed to be a relative URL and thus to start with a path component. - """ - if 'dns' == scheme: - hostname = o.path or None - path = None - else: - hostname = o.hostname or None - path = o.path or None + hostname = o.hostname or None + path = o.path or None if scheme.startswith('http'): #deal with "http:////////////////www.vikings.com" diff --git a/surt/surt.py b/surt/surt.py index 8a937bd..829a27c 100755 --- a/surt/surt.py +++ b/surt/surt.py @@ -58,19 +58,6 @@ def surt(url, canonicalizer=None, **options): if url.startswith("filedesc"): return url - if url.startswith("warcinfo"): - return url - - if url.startswith("dns:"): - res = hostToSURT(url[4:]) - if options.get('trailing_comma'): - res += ',' - res += ')' - return res - - if url.startswith("whois://"): - return url - if canonicalizer is None: canonicalizer = DefaultIAURLCanonicalizer.canonicalize else: diff --git a/tests/test_surt.py b/tests/test_surt.py index 5e8461e..39aa211 100644 --- a/tests/test_surt.py +++ b/tests/test_surt.py @@ -267,8 +267,8 @@ def test_surt(): assert surt.surt("filedesc:/foo.arc.gz") == 'filedesc:/foo.arc.gz' assert surt.surt("filedesc://foo.arc.gz") == 'filedesc://foo.arc.gz' assert surt.surt("warcinfo:foo.warc.gz") == 
'warcinfo:foo.warc.gz' - assert surt.surt("dns:alexa.com") == 'com,alexa)' - assert surt.surt("dns:archive.org") == 'org,archive)' + assert surt.surt("dns:alexa.com") == 'dns:alexa.com' + assert surt.surt("dns:archive.org") == 'dns:archive.org' assert surt.surt("http://www.archive.org/") == 'org,archive)/' assert surt.surt("http://archive.org/") == 'org,archive)/' @@ -279,14 +279,14 @@ def test_surt(): # trailing comma mode assert surt.surt("http://archive.org/goo/?a=2&b&a=1", trailing_comma=True) == 'org,archive,)/goo?a=1&a=2&b' - assert surt.surt("dns:archive.org", trailing_comma=True) == 'org,archive,)' + assert surt.surt("dns:archive.org", trailing_comma=True) == 'dns:archive.org' assert surt.surt("warcinfo:foo.warc.gz", trailing_comma=True) == 'warcinfo:foo.warc.gz' # PHP session id: assert surt.surt("http://archive.org/index.php?PHPSESSID=0123456789abcdefghijklemopqrstuv&action=profile;u=4221") == 'org,archive)/index.php?action=profile;u=4221' # WHOIS url: - assert surt.surt("whois://whois.isoc.org.il/shaveh.co.il") == 'whois://whois.isoc.org.il/shaveh.co.il' + assert surt.surt("whois://whois.isoc.org.il/shaveh.co.il") == 'il,org,isoc,whois)/shaveh.co.il' # Yahoo web bug. 
See https://github.com/internetarchive/surt/issues/1 assert surt.surt('http://visit.webhosting.yahoo.com/visit.gif?&r=http%3A//web.archive.org/web/20090517140029/http%3A//anthonystewarthead.electric-chi.com/&b=Netscape%205.0%20%28Windows%3B%20en-US%29&s=1366x768&o=Win32&c=24&j=true&v=1.2') == 'com,yahoo,webhosting,visit)/visit.gif?&b=netscape%205.0%20(windows;%20en-us)&c=24&j=true&o=win32&r=http://web.archive.org/web/20090517140029/http://anthonystewarthead.electric-chi.com/&s=1366x768&v=1.2' @@ -299,6 +299,8 @@ def test_surt(): assert surt.surt("http://www.example.com/", with_scheme=False) == 'com,example)/' assert surt.surt("http://www.example.com/", with_scheme=True, trailing_comma=True) == 'http://(com,example,)/' assert surt.surt("https://www.example.com/", with_scheme=True, trailing_comma=True) == 'https://(com,example,)/' + assert surt.surt("ftp://www.example.com/", with_scheme=False, trailing_comma=True) == 'com,example,)/' + assert surt.surt("ftp://www.example.com/", with_scheme=False, trailing_comma=False) == 'com,example)/' assert surt.surt("ftp://www.example.com/", with_scheme=True, trailing_comma=True) == 'ftp://(com,example,)/' assert surt.surt("http://www.example.com/", with_scheme=True, host_massage=False) == 'http://(com,example,www)/' assert surt.surt("http://www.example.com/", with_scheme=False, host_massage=False) == 'com,example,www)/' @@ -306,3 +308,15 @@ def test_surt(): assert surt.surt("https://www.example.com/", with_scheme=True, trailing_comma=True, host_massage=False) == 'https://(com,example,www,)/' assert surt.surt("ftp://www.example.com/", with_scheme=True, trailing_comma=True, host_massage=False) == 'ftp://(com,example,www,)/' + assert surt.surt("mailto:foo@example.com", with_scheme=True) == 'mailto:foo@example.com' + assert surt.surt("mailto:foo@example.com", trailing_comma=True) == 'mailto:foo@example.com' + assert surt.surt("mailto:foo@example.com", with_scheme=True, trailing_comma=True) == 'mailto:foo@example.com' + assert 
surt.surt("dns:archive.org", with_scheme=True) == 'dns:archive.org' + assert surt.surt("dns:archive.org", trailing_comma=True) == 'dns:archive.org' + assert surt.surt("dns:archive.org", with_scheme=True, trailing_comma=True) == 'dns:archive.org' + assert surt.surt("whois://whois.isoc.org.il/shaveh.co.il", with_scheme=True) == 'whois://(il,org,isoc,whois)/shaveh.co.il' + assert surt.surt("whois://whois.isoc.org.il/shaveh.co.il", trailing_comma=True) == 'il,org,isoc,whois,)/shaveh.co.il' + assert surt.surt("whois://whois.isoc.org.il/shaveh.co.il", trailing_comma=True, with_scheme=True) == 'whois://(il,org,isoc,whois,)/shaveh.co.il' + assert surt.surt("warcinfo:foo.warc.gz", trailing_comma=True) == 'warcinfo:foo.warc.gz' + assert surt.surt("warcinfo:foo.warc.gz", with_scheme=True) == 'warcinfo:foo.warc.gz' + assert surt.surt("warcinfo:foo.warc.gz", with_scheme=True, trailing_comma=True) == 'warcinfo:foo.warc.gz' From 4b7d4eda1215c64d087e7df7c70efcc27eb8716c Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 18 Dec 2015 23:36:20 +0000 Subject: [PATCH 27/32] bump version to 0.3b4 to work around pypi strangeness -- https://pypi.python.org/pypi?:action=display&name=surt&version=0.3b3 exists but missing from https://pypi.python.org/simple/surt/ --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 85d06ef..72400f8 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ def run_tests(self): setup(name='surt', - version='0.3b3', + version='0.3b4', author='rajbot', author_email='raj@archive.org', classifiers=[ From e97fd14f10e26cf30ddbf5be1fb390313f2ec862 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 7 Jan 2016 19:58:55 +0000 Subject: [PATCH 28/32] chmod a-x surt/*.py --- surt/DefaultIAURLCanonicalizer.py | 0 surt/GoogleURLCanonicalizer.py | 0 surt/IAURLCanonicalizer.py | 0 surt/URLRegexTransformer.py | 0 surt/handyurl.py | 0 surt/surt.py | 0 6 files changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 
100644 surt/DefaultIAURLCanonicalizer.py mode change 100755 => 100644 surt/GoogleURLCanonicalizer.py mode change 100755 => 100644 surt/IAURLCanonicalizer.py mode change 100755 => 100644 surt/URLRegexTransformer.py mode change 100755 => 100644 surt/handyurl.py mode change 100755 => 100644 surt/surt.py diff --git a/surt/DefaultIAURLCanonicalizer.py b/surt/DefaultIAURLCanonicalizer.py old mode 100755 new mode 100644 diff --git a/surt/GoogleURLCanonicalizer.py b/surt/GoogleURLCanonicalizer.py old mode 100755 new mode 100644 diff --git a/surt/IAURLCanonicalizer.py b/surt/IAURLCanonicalizer.py old mode 100755 new mode 100644 diff --git a/surt/URLRegexTransformer.py b/surt/URLRegexTransformer.py old mode 100755 new mode 100644 diff --git a/surt/handyurl.py b/surt/handyurl.py old mode 100755 new mode 100644 diff --git a/surt/surt.py b/surt/surt.py old mode 100755 new mode 100644 From 53af06c17d73126ae8f2ed7bd4066fd07888b1e7 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 18 Feb 2016 20:15:09 +0000 Subject: [PATCH 29/32] some more tests --- tests/test_surt.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/test_surt.py b/tests/test_surt.py index 39aa211..cd694b6 100644 --- a/tests/test_surt.py +++ b/tests/test_surt.py @@ -257,7 +257,6 @@ def test_stripQuerySessionID(): def test_hostToSURT(): assert surt.URLRegexTransformer.hostToSURT("www.archive.org") == 'org,archive,www' - def test_surt(): # These tests are from WaybackURLKeyMakerTest.java @@ -320,3 +319,9 @@ def test_surt(): assert surt.surt("warcinfo:foo.warc.gz", trailing_comma=True) == 'warcinfo:foo.warc.gz' assert surt.surt("warcinfo:foo.warc.gz", with_scheme=True) == 'warcinfo:foo.warc.gz' assert surt.surt("warcinfo:foo.warc.gz", with_scheme=True, trailing_comma=True) == 'warcinfo:foo.warc.gz' + +def test_options(): + assert surt.IAURLCanonicalizer.canonicalize(handyurl.parse('http://example.com/foo?X=Y')).getURLString() == 'http://example.com/foo?x=y' + assert 
surt.IAURLCanonicalizer.canonicalize(handyurl.parse('http://example.com/foo?X=Y'), query_lowercase=False).getURLString() == 'http://example.com/foo?X=Y' + assert surt.DefaultIAURLCanonicalizer.canonicalize(handyurl.parse('http://example.com/foo?X=Y')).getURLString() == 'http://example.com/foo?x=y' + assert surt.DefaultIAURLCanonicalizer.canonicalize(handyurl.parse('http://example.com/foo?X=Y'), query_lowercase=False).getURLString() == 'http://example.com/foo?X=Y' From 3bcf8ffaea8edf067bb595a109efac582456d47e Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 22 Mar 2016 01:10:04 +0000 Subject: [PATCH 30/32] port new fix: https://github.com/internetarchive/webarchive-commons/pull/17/commits/0cc72a1c3d1db464d27a3fe8d89e4138e28be171 -- Make canonicalizer be able to strip session id params even if they are the first params in the query string. And add session id strip test. --- surt/IAURLCanonicalizer.py | 31 +++++++++++++++++++++++++------ surt/URLRegexTransformer.py | 20 ++++++++++---------- 2 files changed, 35 insertions(+), 16 deletions(-) diff --git a/surt/IAURLCanonicalizer.py b/surt/IAURLCanonicalizer.py index 0603d6a..da721e7 100644 --- a/surt/IAURLCanonicalizer.py +++ b/surt/IAURLCanonicalizer.py @@ -41,6 +41,27 @@ def canonicalize(url, host_lowercase=True, host_massage=True, query_strip_empty=True, query_alpha_reorder=True, hash_strip=True, **_ignored): """The input url is a handyurl instance + + These doctests are from IAURLCanonicalizerTest.java: + + >>> canonicalize(handyurl.parse("http://ARCHIVE.ORG/")).getURLString() + 'http://archive.org/' + >>> canonicalize(handyurl.parse("http://www.archive.org:80/")).getURLString() + 'http://archive.org/' + >>> canonicalize(handyurl.parse("https://www.archive.org:80/")).getURLString() + 'https://archive.org:80/' + >>> canonicalize(handyurl.parse("http://www.archive.org:443/")).getURLString() + 'http://archive.org:443/' + >>> canonicalize(handyurl.parse("https://www.archive.org:443/")).getURLString() + 
'https://archive.org/' + >>> canonicalize(handyurl.parse("http://www.archive.org/big/")).getURLString() + 'http://archive.org/big' + >>> canonicalize(handyurl.parse("dns:www.archive.org")).getURLString() + 'dns:www.archive.org' + >>> canonicalize(handyurl.parse("http://www.nsf.gov/statistics/sed/2009/SED_2009.zip?CFID=14387305&CFTOKEN=72942008&jsessionid=f030eacc7e49c4ca0b077922347418418766")).getURLString() + 'http://nsf.gov/statistics/sed/2009/sed_2009.zip?jsessionid=f030eacc7e49c4ca0b077922347418418766' + >>> canonicalize(handyurl.parse("http://www.nsf.gov/statistics/sed/2009/SED_2009.zip?CFID=14387305&CFTOKEN=72942008")).getURLString() + 'http://nsf.gov/statistics/sed/2009/sed_2009.zip' """ if host_lowercase and url.host: url.host = url.host.lower() @@ -77,17 +98,15 @@ def canonicalize(url, host_lowercase=True, host_massage=True, query = url.query if query: - if '' == query and query_strip_empty: - query = None - elif len(query) > 0: + if len(query) > 0: if query_strip_session_id: - #This function expects the query to start with a '?' - query = stripQuerySessionID('?'+query) - query = query[1:] #now strip off '?' 
that we just added + query = stripQuerySessionID(query) if query_lowercase: query = query.lower() if query_alpha_reorder: query = alphaReorderQuery(query) + if '' == query and query_strip_empty: + query = None url.query = query else: if query_strip_empty: diff --git a/surt/URLRegexTransformer.py b/surt/URLRegexTransformer.py index 04f829b..a36fbf4 100644 --- a/surt/URLRegexTransformer.py +++ b/surt/URLRegexTransformer.py @@ -47,23 +47,23 @@ def stripPathSessionID(path): # stripQuerySessionID #_______________________________________________________________________________ _RES_QUERY_SESSIONID = [ - re.compile("^(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I), - re.compile("^(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I), - re.compile("^(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I), - re.compile("^(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", re.I), - re.compile("^(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", re.I), + re.compile("^(.*)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I), + re.compile("^(.*)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I), + re.compile("^(.*)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I), + re.compile("^(.*)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", re.I), + re.compile("^(.*)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", re.I), ] -def stripQuerySessionID(path): +def stripQuerySessionID(query): for pattern in _RES_QUERY_SESSIONID: - m = pattern.match(path) + m = pattern.match(query) if m: if m.group(2): - path = m.group(1) + m.group(2) + query = m.group(1) + m.group(2) else: - path = m.group(1) + query = m.group(1) - return path + return query # hostToSURT From 639735168dfccfc3343ccbbf608ebf177a12dcdf Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 24 May 2016 13:39:17 -0700 Subject: [PATCH 31/32] update for tldextract 2.0 --- setup.py | 2 +- surt/handyurl.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 72400f8..a0fb691 100644 --- a/setup.py +++ 
b/setup.py @@ -27,7 +27,7 @@ def run_tests(self): zip_safe=True, install_requires=[ 'six', - 'tldextract', + 'tldextract>=2.0', ], provides=[ 'surt' ], packages=[ 'surt' ], diff --git a/surt/handyurl.py b/surt/handyurl.py index 866268c..cfc01b7 100644 --- a/surt/handyurl.py +++ b/surt/handyurl.py @@ -215,9 +215,7 @@ def getPublicSuffix(self): """Uses the tldextract module to get the public suffix via the Public Suffix List. """ - r = tldextract.extract(self.host) - return "%s.%s" % (r.domain, r.tld) - + return tldextract.extract(self.host).registered_domain # getPublicPrefix #___________________________________________________________________________ From 571ab7592b355a6a11c0ee3f6a8f9178d35dbdcb Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 24 May 2016 13:39:41 -0700 Subject: [PATCH 32/32] version 0.3.0 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index a0fb691..e737058 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ def run_tests(self): setup(name='surt', - version='0.3b4', + version='0.3.0', author='rajbot', author_email='raj@archive.org', classifiers=[