From d5e1775a4632f13d54db0f2dbb8be4adf427ddfd Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 17 Mar 2015 16:26:21 -0700 Subject: [PATCH 01/32] setup.py: make 'python setup.py test' run py.test with coverage --- setup.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/setup.py b/setup.py index 03df6fa..e7ccf09 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,19 @@ from setuptools import setup +from setuptools.command.test import test as TestCommand + +class PyTest(TestCommand): + def finalize_options(self): + TestCommand.finalize_options(self) + self.test_suite = True + + def run_tests(self): + import pytest + import sys + cmdline = ' -v --doctest-module --cov surt surt/' + errcode = pytest.main(cmdline) + sys.exit(errcode) + + setup(name='surt', version='0.2', author='rajbot', @@ -15,4 +30,8 @@ provides=[ 'surt' ], packages=[ 'surt' ], scripts=[], + # Tests + tests_require=[ 'pytest' ], + test_suite='', + cmdclass={'test': PyTest}, ) From e00716f7877e1290cb19e800bd4bd5f34de3d039 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 17 Mar 2015 16:42:38 -0700 Subject: [PATCH 02/32] add support for options to surt() command add 'trailing_comma' option, which will yield a surt with trailing comma restored: com,example,)/ instead of (current default) com,example)/ --- surt/handyurl.py | 8 +++++++- surt/surt.py | 29 +++++++++++++++++++++++------ 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/surt/handyurl.py b/surt/handyurl.py index 06f355f..449c228 100755 --- a/surt/handyurl.py +++ b/surt/handyurl.py @@ -200,7 +200,11 @@ def geturl(self): # getURLString() #___________________________________________________________________________ - def getURLString(self, surt=False, public_suffix=False): + def getURLString(self, + surt=False, + public_suffix=False, + trailing_comma=False, + **options): if None != self.opaque: return self.opaque @@ -229,6 +233,8 @@ def getURLString(self, surt=False, public_suffix=False): s += ":%d" % 
self.port if surt: + if trailing_comma: + s += ',' s += ')' hasPath = (None != self.path) and (len(self.path) > 0) diff --git a/surt/surt.py b/surt/surt.py index 99c289c..8b47e61 100755 --- a/surt/surt.py +++ b/surt/surt.py @@ -48,7 +48,7 @@ def _normalize(canonicalizer): # surt() #_______________________________________________________________________________ -def surt(url, canonicalizer=None): +def surt(url, canonicalizer=None, **options): """ These doctests are from WaybackURLKeyMakerTest.java @@ -82,6 +82,16 @@ def surt(url, canonicalizer=None): >>> surt("http://archive.org/goo/?a=2&b&a=1") 'org,archive)/goo?a=1&a=2&b' + # trailing comma mode + >>> surt("http://archive.org/goo/?a=2&b&a=1", trailing_comma=True) + 'org,archive,)/goo?a=1&a=2&b' + + >>> surt("dns:archive.org", trailing_comma=True) + 'org,archive,)' + + >>> surt("warcinfo:foo.warc.gz", trailing_comma=True) + 'warcinfo:foo.warc.gz' + PHP session id: >>> surt("http://archive.org/index.php?PHPSESSID=0123456789abcdefghijklemopqrstuv&action=profile;u=4221") 'org,archive)/index.php?action=profile;u=4221' @@ -95,7 +105,7 @@ def surt(url, canonicalizer=None): 'com,yahoo,webhosting,visit)/visit.gif?&b=netscape%205.0%20(windows;%20en-us)&c=24&j=true&o=win32&r=http://web.archive.org/web/20090517140029/http://anthonystewarthead.electric-chi.com/&s=1366x768&v=1.2' Simple customization: - >>> surt("http://www.example.com/", canonicalizer=lambda x: x) + >>> surt("http://www.example.com/", canonicalizer=lambda x, **opts: x) 'com,example,www)/' """ @@ -109,7 +119,11 @@ def surt(url, canonicalizer=None): return url if url.startswith("dns:"): - return hostToSURT(url[4:]) + ')' + res = hostToSURT(url[4:]) + if options.get('trailing_comma'): + res += ',' + res += ')' + return res if url.startswith("whois://"): return url @@ -122,9 +136,12 @@ def surt(url, canonicalizer=None): elif (not hasattr(canonicalizer, '__call__') and hasattr(canonicalizer, 'canonicalize')): canonicalizer = canonicalizer.canonicalize - - hurl = 
canonicalizer(handyurl.parse(url)) - key = hurl.getURLString(surt=True) + + if 'surt' not in options: + options['surt'] = True + + hurl = canonicalizer(handyurl.parse(url), **options) + key = hurl.getURLString(**options) parenIdx = key.find('(') if -1 == parenIdx: From df00f11e5c559eeb76d96326501a6d55f034f8b6 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 10 Apr 2015 19:42:50 -0700 Subject: [PATCH 03/32] make surt py3 compatible --- setup.py | 1 + surt/DefaultIAURLCanonicalizer.py | 11 ++++++++--- surt/GoogleURLCanonicalizer.py | 23 ++++++++++++++--------- surt/IAURLCanonicalizer.py | 9 +++++++-- surt/__init__.py | 9 +++++++-- surt/handyurl.py | 13 +++++++++---- surt/surt.py | 13 ++++++++++--- 7 files changed, 56 insertions(+), 23 deletions(-) diff --git a/setup.py b/setup.py index e7ccf09..ca688ef 100644 --- a/setup.py +++ b/setup.py @@ -25,6 +25,7 @@ def run_tests(self): long_description=open('README.md').read(), url='https://github.com/rajbot/surt', install_requires=[ + 'six', 'tldextract', ], provides=[ 'surt' ], diff --git a/surt/DefaultIAURLCanonicalizer.py b/surt/DefaultIAURLCanonicalizer.py index a50c991..e352f92 100755 --- a/surt/DefaultIAURLCanonicalizer.py +++ b/surt/DefaultIAURLCanonicalizer.py @@ -26,8 +26,13 @@ http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/test/java/org/archive/url/DefaultIAURLCanonicalizerTest.java?view=markup """ -import GoogleURLCanonicalizer -import IAURLCanonicalizer +try: #pragma: no cover + import GoogleURLCanonicalizer + import IAURLCanonicalizer + +except ImportError: #pragma: no cover + import surt.GoogleURLCanonicalizer as GoogleURLCanonicalizer + import surt.IAURLCanonicalizer as IAURLCanonicalizer # canonicalize() #_______________________________________________________________________________ @@ -36,7 +41,7 @@ def canonicalize(url, **options): These doctests are from DefaultIAURLCanonicalizerTest.java: - >>> from handyurl import handyurl + >>> 
from .handyurl import handyurl >>> canonicalize(handyurl.parse("http://www.alexa.com/")).getURLString() 'http://alexa.com/' >>> canonicalize(handyurl.parse("http://archive.org/index.html")).getURLString() diff --git a/surt/GoogleURLCanonicalizer.py b/surt/GoogleURLCanonicalizer.py index 7c7be91..02877ef 100755 --- a/surt/GoogleURLCanonicalizer.py +++ b/surt/GoogleURLCanonicalizer.py @@ -31,9 +31,14 @@ import struct import socket import encodings.idna -from handyurl import handyurl -from urllib import quote, unquote +import six +try: #pragma: no cover + from handyurl import handyurl +except ImportError: #pragma: no cover + from surt.handyurl import handyurl + +from six.moves.urllib.parse import quote, unquote # unescapeRepeatedly() #_______________________________________________________________________________ @@ -93,14 +98,14 @@ def canonicalize(url, **_ignored): #'http://%01%80.com/' for this case. If idna/punycode encoding of a hostname #is not possible, the python version encodes unicode domains as utf-8 before #percent encoding, so we get 'http://%01%C2%80.com/' - >>> print canonicalize(handyurl.parse(u"http://\u0001\u0080.com/")).getURLString() + >>> print(canonicalize(handyurl.parse(u"http://\u0001\u0080.com/")).getURLString()) http://%01%C2%80.com/ #Add these unicode tests: - >>> print canonicalize(handyurl.parse(u'B\xfccher.ch:8080')).getURLString() + >>> print(canonicalize(handyurl.parse(u'B\xfccher.ch:8080')).getURLString()) http://xn--bcher-kva.ch:8080/ - >>> url = '☃.com'.decode('utf-8') #doctest has trouble with utf-8 encoding - >>> print canonicalize(handyurl.parse(url)).getURLString() + >>> url = '☃.com' #doctest has trouble with utf-8 encoding + >>> print(canonicalize(handyurl.parse(url)).getURLString()) http://xn--n3h.com/ #Add these percent-encoded unicode tests @@ -140,7 +145,7 @@ def canonicalize(url, **_ignored): # if the host was an ascii string of percent-encoded bytes that represent # non-ascii unicode chars, then promote hostE from str to 
unicode. # e.g. "http://www.t%EF%BF%BD%04.82.net/", which contains the unicode replacement char - if isinstance(hostE, str): + if isinstance(hostE, six.binary_type): try: hostE.decode('ascii') except UnicodeDecodeError: @@ -155,7 +160,7 @@ def canonicalize(url, **_ignored): # be encodeced separately, and doesn't work correctly with # full hostnames. So use 'idna' encoding instead. #host = encodings.idna.ToASCII(hostE) - host = hostE.encode('idna') + host = hostE.encode('idna').decode('utf-8') except ValueError: host = hostE @@ -284,7 +289,7 @@ def escapeOnce(input): # percent encoding, since different encodings of the same unicode # characters will result in different surts. # We will use utf-8 for consistency. - if isinstance(input, unicode): + if isinstance(input, six.text_type): input = input.encode('utf-8') return quote(input, """!"$&'()*+,-./:;<=>?@[\]^_`{|}~""") else: diff --git a/surt/IAURLCanonicalizer.py b/surt/IAURLCanonicalizer.py index 2bf7746..aee2d36 100755 --- a/surt/IAURLCanonicalizer.py +++ b/surt/IAURLCanonicalizer.py @@ -27,8 +27,13 @@ """ import re -from handyurl import handyurl -from URLRegexTransformer import stripPathSessionID, stripQuerySessionID + +try: #pragma: no cover + from handyurl import handyurl + from URLRegexTransformer import stripPathSessionID, stripQuerySessionID +except ImportError: #pragma: no cover + from surt.handyurl import handyurl + from surt.URLRegexTransformer import stripPathSessionID, stripQuerySessionID # canonicalize() #_______________________________________________________________________________ diff --git a/surt/__init__.py b/surt/__init__.py index f0bfc83..f67faa4 100644 --- a/surt/__init__.py +++ b/surt/__init__.py @@ -25,8 +25,13 @@ http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/ """ -from .handyurl import handyurl -from .surt import surt +try: #pragma: no cover + from handyurl import handyurl + from surt import surt 
+except ImportError: #pragma: no cover + from surt.handyurl import handyurl + from surt.surt import surt + __all__= [ 'handyurl', diff --git a/surt/handyurl.py b/surt/handyurl.py index 449c228..260518d 100755 --- a/surt/handyurl.py +++ b/surt/handyurl.py @@ -22,8 +22,13 @@ import re import tldextract -from urlparse import urlsplit -from URLRegexTransformer import hostToSURT + +from six.moves.urllib.parse import urlsplit + +try: #pragma: no cover + from URLRegexTransformer import hostToSURT +except ImportError: #pragma: no cover + from surt.URLRegexTransformer import hostToSURT class handyurl(object): """A python port of the archive-commons org.archive.url HandyURL class @@ -96,10 +101,10 @@ def parse(cls, url): >>> handyurl.parse("http://www.archive.org:8080?#foo").geturl() 'http://www.archive.org:8080/#foo' - >>> print handyurl.parse(u"http://bücher.ch:8080?#foo").geturl() + >>> print(handyurl.parse(u"http://bücher.ch:8080?#foo").geturl()) http://b\xfccher.ch:8080/#foo - >>> print handyurl.parse(u"dns:bücher.ch").geturl() + >>> print(handyurl.parse(u"dns:bücher.ch").geturl()) dns:b\xfccher.ch ###From Tymm: diff --git a/surt/surt.py b/surt/surt.py index 8b47e61..ba6f0b8 100755 --- a/surt/surt.py +++ b/surt/surt.py @@ -24,9 +24,16 @@ http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/WaybackURLKeyMaker.java?view=markup """ -from handyurl import handyurl -from URLRegexTransformer import hostToSURT -import DefaultIAURLCanonicalizer +try: #pragma: no cover + from handyurl import handyurl + from URLRegexTransformer import hostToSURT + + import DefaultIAURLCanonicalizer +except ImportError: #pragma: no cover + from surt.handyurl import handyurl + from surt.URLRegexTransformer import hostToSURT + + import surt.DefaultIAURLCanonicalizer as DefaultIAURLCanonicalizer class CompositeCanonicalizer(object): def __init__(self, canonicalizers): From e1e3878dd9e9b25d7495b7a8487dbb4573e9c967 
Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 11 Apr 2015 11:45:02 -0700 Subject: [PATCH 04/32] use from __future__ import absolute_import add tox.ini and travis-ci support --- .travis.yml | 32 ++++++++++++++++++++++++------- surt/DefaultIAURLCanonicalizer.py | 13 +++++-------- surt/GoogleURLCanonicalizer.py | 7 +++---- surt/IAURLCanonicalizer.py | 10 ++++------ surt/__init__.py | 10 ++++------ surt/handyurl.py | 7 +++---- surt/surt.py | 12 ++++-------- tox.ini | 13 +++++++++++++ 8 files changed, 61 insertions(+), 43 deletions(-) create mode 100644 tox.ini diff --git a/.travis.yml b/.travis.yml index 8db6574..d6fb14a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,8 +1,26 @@ +# vim: set sw=4 et: +# +# tox approach stolen from +# https://github.com/pypa/pip/blob/abdb597dbfb51b21cc76c1cff068b72c80f3a77d/.travis.yml +# + language: python -python: - - "2.6" - - "2.7" -install: pip install -r requirements.txt -script: - - py.test --doctest-modules -v surt/ - - pylint --disable=all --enable=W0312 --reports=n surt/ + +env: + - TOXENV=py26 + - TOXENV=py27 + - TOXENV=py32 + - TOXENV=py33 + - TOXENV=py34 + +before_install: + - sudo apt-get update + - pip install coveralls --use-mirrors + +before_script: + - pip install tox + +script: tox + +#after_success: + #coveralls diff --git a/surt/DefaultIAURLCanonicalizer.py b/surt/DefaultIAURLCanonicalizer.py index e352f92..11ae90e 100755 --- a/surt/DefaultIAURLCanonicalizer.py +++ b/surt/DefaultIAURLCanonicalizer.py @@ -25,14 +25,11 @@ The doctests are copied from DefaultIAURLCanonicalizerTest.java: http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/test/java/org/archive/url/DefaultIAURLCanonicalizerTest.java?view=markup """ +from __future__ import absolute_import -try: #pragma: no cover - import GoogleURLCanonicalizer - import IAURLCanonicalizer +import surt.GoogleURLCanonicalizer +import surt.IAURLCanonicalizer -except ImportError: #pragma: no cover - import 
surt.GoogleURLCanonicalizer as GoogleURLCanonicalizer - import surt.IAURLCanonicalizer as IAURLCanonicalizer # canonicalize() #_______________________________________________________________________________ @@ -58,8 +55,8 @@ def canonicalize(url, **options): 'http://archive.org/index.html?a=b&b=a&b=b' """ - url = GoogleURLCanonicalizer.canonicalize(url, **options) - url = IAURLCanonicalizer.canonicalize(url, **options) + url = surt.GoogleURLCanonicalizer.canonicalize(url, **options) + url = surt.IAURLCanonicalizer.canonicalize(url, **options) return url diff --git a/surt/GoogleURLCanonicalizer.py b/surt/GoogleURLCanonicalizer.py index 02877ef..51b9c07 100755 --- a/surt/GoogleURLCanonicalizer.py +++ b/surt/GoogleURLCanonicalizer.py @@ -27,16 +27,15 @@ http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/test/java/org/archive/url/GoogleURLCanonicalizerTest.java?view=markup """ +from __future__ import absolute_import + import re import struct import socket import encodings.idna import six -try: #pragma: no cover - from handyurl import handyurl -except ImportError: #pragma: no cover - from surt.handyurl import handyurl +from surt.handyurl import handyurl from six.moves.urllib.parse import quote, unquote diff --git a/surt/IAURLCanonicalizer.py b/surt/IAURLCanonicalizer.py index aee2d36..bbfe0a2 100755 --- a/surt/IAURLCanonicalizer.py +++ b/surt/IAURLCanonicalizer.py @@ -26,14 +26,12 @@ http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/test/java/org/archive/url/IAURLCanonicalizerTest.java?view=markup """ +from __future__ import absolute_import + import re -try: #pragma: no cover - from handyurl import handyurl - from URLRegexTransformer import stripPathSessionID, stripQuerySessionID -except ImportError: #pragma: no cover - from surt.handyurl import handyurl - from surt.URLRegexTransformer import stripPathSessionID, stripQuerySessionID +from 
surt.handyurl import handyurl +from surt.URLRegexTransformer import stripPathSessionID, stripQuerySessionID # canonicalize() #_______________________________________________________________________________ diff --git a/surt/__init__.py b/surt/__init__.py index f67faa4..1af4c40 100644 --- a/surt/__init__.py +++ b/surt/__init__.py @@ -25,12 +25,10 @@ http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/ """ -try: #pragma: no cover - from handyurl import handyurl - from surt import surt -except ImportError: #pragma: no cover - from surt.handyurl import handyurl - from surt.surt import surt +from __future__ import absolute_import + +from surt.handyurl import handyurl +from surt.surt import surt __all__= [ diff --git a/surt/handyurl.py b/surt/handyurl.py index 260518d..f6ae5bc 100755 --- a/surt/handyurl.py +++ b/surt/handyurl.py @@ -20,15 +20,14 @@ # # The surt source is hosted at https://github.com/internetarchive/surt +from __future__ import absolute_import + import re import tldextract from six.moves.urllib.parse import urlsplit -try: #pragma: no cover - from URLRegexTransformer import hostToSURT -except ImportError: #pragma: no cover - from surt.URLRegexTransformer import hostToSURT +from surt.URLRegexTransformer import hostToSURT class handyurl(object): """A python port of the archive-commons org.archive.url HandyURL class diff --git a/surt/surt.py b/surt/surt.py index ba6f0b8..6c10aac 100755 --- a/surt/surt.py +++ b/surt/surt.py @@ -24,16 +24,12 @@ http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/WaybackURLKeyMaker.java?view=markup """ -try: #pragma: no cover - from handyurl import handyurl - from URLRegexTransformer import hostToSURT +from __future__ import absolute_import - import DefaultIAURLCanonicalizer -except ImportError: #pragma: no cover - from surt.handyurl import handyurl - from 
surt.URLRegexTransformer import hostToSURT +from surt.handyurl import handyurl +from surt.URLRegexTransformer import hostToSURT - import surt.DefaultIAURLCanonicalizer as DefaultIAURLCanonicalizer +import surt.DefaultIAURLCanonicalizer as DefaultIAURLCanonicalizer class CompositeCanonicalizer(object): def __init__(self, canonicalizers): diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..697a8b3 --- /dev/null +++ b/tox.ini @@ -0,0 +1,13 @@ +# Tox (http://tox.testrun.org/) is a tool for running tests +# in multiple virtualenvs. This configuration file will run the +# test suite on all supported python versions. To use it, "pip install tox" +# and then run "tox" from this directory. + +[tox] +envlist = py26, py27, py34 + +[testenv] +commands = python setup.py test +deps = + pytest + pytest-cov From 07307962541173b5ac4793fe9cbdc637e34f3d4c Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 11 Apr 2015 11:46:32 -0700 Subject: [PATCH 05/32] add py32, py33 to tox --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 697a8b3..fcbf2a9 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. 
[tox] -envlist = py26, py27, py34 +envlist = py26, py27, py32, py33, py34 [testenv] commands = python setup.py test From 4a1ff4a15904470c141cd40d31cf0df4b114cdd4 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 11 Apr 2015 12:06:38 -0700 Subject: [PATCH 06/32] add zip_safe=True, use six.u() for unicode literals to support py32 --- setup.py | 1 + surt/GoogleURLCanonicalizer.py | 10 +++++----- surt/handyurl.py | 9 +++++---- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/setup.py b/setup.py index ca688ef..67613ef 100644 --- a/setup.py +++ b/setup.py @@ -24,6 +24,7 @@ def run_tests(self): description='Sort-friendly URI Reordering Transform (SURT) python package.', long_description=open('README.md').read(), url='https://github.com/rajbot/surt', + zip_safe=True, install_requires=[ 'six', 'tldextract', diff --git a/surt/GoogleURLCanonicalizer.py b/surt/GoogleURLCanonicalizer.py index 51b9c07..efc3ce3 100755 --- a/surt/GoogleURLCanonicalizer.py +++ b/surt/GoogleURLCanonicalizer.py @@ -33,11 +33,11 @@ import struct import socket import encodings.idna -import six from surt.handyurl import handyurl from six.moves.urllib.parse import quote, unquote +from six import u, text_type, binary_type # unescapeRepeatedly() #_______________________________________________________________________________ @@ -97,11 +97,11 @@ def canonicalize(url, **_ignored): #'http://%01%80.com/' for this case. 
If idna/punycode encoding of a hostname #is not possible, the python version encodes unicode domains as utf-8 before #percent encoding, so we get 'http://%01%C2%80.com/' - >>> print(canonicalize(handyurl.parse(u"http://\u0001\u0080.com/")).getURLString()) + >>> print(canonicalize(handyurl.parse(u("http://\u0001\u0080.com/"))).getURLString()) http://%01%C2%80.com/ #Add these unicode tests: - >>> print(canonicalize(handyurl.parse(u'B\xfccher.ch:8080')).getURLString()) + >>> print(canonicalize(handyurl.parse(u('B\xfccher.ch:8080'))).getURLString()) http://xn--bcher-kva.ch:8080/ >>> url = '☃.com' #doctest has trouble with utf-8 encoding >>> print(canonicalize(handyurl.parse(url)).getURLString()) @@ -144,7 +144,7 @@ def canonicalize(url, **_ignored): # if the host was an ascii string of percent-encoded bytes that represent # non-ascii unicode chars, then promote hostE from str to unicode. # e.g. "http://www.t%EF%BF%BD%04.82.net/", which contains the unicode replacement char - if isinstance(hostE, six.binary_type): + if isinstance(hostE, binary_type): try: hostE.decode('ascii') except UnicodeDecodeError: @@ -288,7 +288,7 @@ def escapeOnce(input): # percent encoding, since different encodings of the same unicode # characters will result in different surts. # We will use utf-8 for consistency. 
- if isinstance(input, six.text_type): + if isinstance(input, text_type): input = input.encode('utf-8') return quote(input, """!"$&'()*+,-./:;<=>?@[\]^_`{|}~""") else: diff --git a/surt/handyurl.py b/surt/handyurl.py index f6ae5bc..0f2f644 100755 --- a/surt/handyurl.py +++ b/surt/handyurl.py @@ -26,6 +26,7 @@ import tldextract from six.moves.urllib.parse import urlsplit +from six import u from surt.URLRegexTransformer import hostToSURT @@ -74,7 +75,7 @@ def __init__(self, scheme=None, authUser=None, authPass=None, #___________________________________________________________________________ @classmethod def parse(cls, url): - u"""This method was in the java URLParser class, but we don't need + u("""This method was in the java URLParser class, but we don't need a whole class to parse a url, when we can just use python's urlparse. These doctests come from URLParserTest.java: @@ -100,10 +101,10 @@ def parse(cls, url): >>> handyurl.parse("http://www.archive.org:8080?#foo").geturl() 'http://www.archive.org:8080/#foo' - >>> print(handyurl.parse(u"http://bücher.ch:8080?#foo").geturl()) + >>> print(handyurl.parse(u("http://bücher.ch:8080?#foo")).geturl()) http://b\xfccher.ch:8080/#foo - >>> print(handyurl.parse(u"dns:bücher.ch").geturl()) + >>> print(handyurl.parse(u("dns:bücher.ch")).geturl()) dns:b\xfccher.ch ###From Tymm: @@ -115,7 +116,7 @@ def parse(cls, url): ###From Common Crawl, host ends with ':' without a port number >>> handyurl.parse("http://mineral.galleries.com:/minerals/silicate/chabazit/chabazit.htm").geturl() 'http://mineral.galleries.com/minerals/silicate/chabazit/chabazit.htm' - """ + """) url = url.strip() url = re.sub('[\n\r\t]', '', url) From 031586fd7115ba137a36c9c612b3c37e1d6e711b Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 20 Aug 2015 21:11:26 +0000 Subject: [PATCH 07/32] - get rid of special "opaque" thing in handyurl, unnecessary with proper handling of url without authority - handyurl.parse doctests were not running because docstring 
was in six.u(), and on top of that it turns out six.u() doesn't work for non-ascii strings, and u"" was restored in python3.3+, so get rid of all use of six.u(). the idn tests are failing in python2 for unknown reasons --- setup.py | 2 +- surt/GoogleURLCanonicalizer.py | 8 ++-- surt/handyurl.py | 83 +++++++++++++++++----------------- tox.ini | 2 +- 4 files changed, 48 insertions(+), 47 deletions(-) diff --git a/setup.py b/setup.py index 67613ef..d001bcd 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ def finalize_options(self): def run_tests(self): import pytest import sys - cmdline = ' -v --doctest-module --cov surt surt/' + cmdline = ' -v --doctest-modules --cov surt surt/' errcode = pytest.main(cmdline) sys.exit(errcode) diff --git a/surt/GoogleURLCanonicalizer.py b/surt/GoogleURLCanonicalizer.py index efc3ce3..3b83bb7 100755 --- a/surt/GoogleURLCanonicalizer.py +++ b/surt/GoogleURLCanonicalizer.py @@ -37,12 +37,12 @@ from surt.handyurl import handyurl from six.moves.urllib.parse import quote, unquote -from six import u, text_type, binary_type +from six import text_type, binary_type # unescapeRepeatedly() #_______________________________________________________________________________ def canonicalize(url, **_ignored): - """ + u""" >>> canonicalize(handyurl.parse("http://host/%25%32%35")).getURLString() 'http://host/%25' >>> canonicalize(handyurl.parse("http://host/%25%32%35%25%32%35")).getURLString() @@ -97,11 +97,11 @@ def canonicalize(url, **_ignored): #'http://%01%80.com/' for this case. 
If idna/punycode encoding of a hostname #is not possible, the python version encodes unicode domains as utf-8 before #percent encoding, so we get 'http://%01%C2%80.com/' - >>> print(canonicalize(handyurl.parse(u("http://\u0001\u0080.com/"))).getURLString()) + >>> print(canonicalize(handyurl.parse(u"http://\u0001\u0080.com/")).getURLString()) http://%01%C2%80.com/ #Add these unicode tests: - >>> print(canonicalize(handyurl.parse(u('B\xfccher.ch:8080'))).getURLString()) + >>> print(canonicalize(handyurl.parse(u'B\xfccher.ch:8080')).getURLString()) http://xn--bcher-kva.ch:8080/ >>> url = '☃.com' #doctest has trouble with utf-8 encoding >>> print(canonicalize(handyurl.parse(url)).getURLString()) diff --git a/surt/handyurl.py b/surt/handyurl.py index 0f2f644..4c65e88 100755 --- a/surt/handyurl.py +++ b/surt/handyurl.py @@ -26,7 +26,6 @@ import tldextract from six.moves.urllib.parse import urlsplit -from six import u from surt.URLRegexTransformer import hostToSURT @@ -59,7 +58,7 @@ class strips empty queries. #___________________________________________________________________________ def __init__(self, scheme=None, authUser=None, authPass=None, host=None, port=DEFAULT_PORT, path=None, - query=None, hash=None, opaque=None, last_delimiter=None): + query=None, hash=None, last_delimiter=None): self.scheme = scheme self.authUser = authUser self.authPass = authPass @@ -68,14 +67,13 @@ def __init__(self, scheme=None, authUser=None, authPass=None, self.path = path self.query = query self.hash = hash - self.opaque = opaque self.last_delimiter = last_delimiter #added in python version # parse() classmethod #___________________________________________________________________________ @classmethod def parse(cls, url): - u("""This method was in the java URLParser class, but we don't need + u"""This method was in the java URLParser class, but we don't need a whole class to parse a url, when we can just use python's urlparse. 
These doctests come from URLParserTest.java: @@ -101,10 +99,16 @@ def parse(cls, url): >>> handyurl.parse("http://www.archive.org:8080?#foo").geturl() 'http://www.archive.org:8080/#foo' - >>> print(handyurl.parse(u("http://bücher.ch:8080?#foo")).geturl()) + >>> handyurl.parse(u"http://bücher.ch:8080?#foo").geturl() + 'http://bücher.ch:8080/#foo' + + >>> handyurl.parse(u"dns:bücher.ch").geturl() + 'dns:bücher.ch' + + >>> print(handyurl.parse(u"http://bücher.ch:8080?#foo").geturl()) http://b\xfccher.ch:8080/#foo - >>> print(handyurl.parse(u("dns:bücher.ch")).geturl()) + >>> print(handyurl.parse(u"dns:bücher.ch").geturl()) dns:b\xfccher.ch ###From Tymm: @@ -116,15 +120,16 @@ def parse(cls, url): ###From Common Crawl, host ends with ':' without a port number >>> handyurl.parse("http://mineral.galleries.com:/minerals/silicate/chabazit/chabazit.htm").geturl() 'http://mineral.galleries.com/minerals/silicate/chabazit/chabazit.htm' - """) + + >>> handyurl.parse("mailto:bot@archive.org").scheme + 'mailto' + + >>> handyurl.parse("mailto:bot@archive.org").geturl() + 'mailto:bot@archive.org' + """ url = url.strip() url = re.sub('[\n\r\t]', '', url) - ### DNS URLs are treated separately as opaque urls by URLParser.java - # However, we want to surtify dns urls as well. 
- if re.match("^(filedesc|warcinfo):.*", url): - return cls(opaque=url) - url = cls.addDefaultSchemeIfNeeded(url) #From Tymm: deal with http://https/order.1and1.com @@ -185,12 +190,8 @@ def addDefaultSchemeIfNeeded(cls, url): if not url: return url - ###raj: DNS URLs are treated separately as opaque urls by URLParser.java, - #but we want to surtify dns urls as well - if url.startswith('dns:'): - return url - - if re.match("^(http|https|ftp|mms|rtsp|wais)://.*", url): + ###noah: accept anything that looks like it starts with a scheme: + if re.match("^([a-zA-Z][a-zA-Z0-9\+\-\.]*):", url): return url else: return "http://"+url @@ -211,36 +212,36 @@ def getURLString(self, trailing_comma=False, **options): - if None != self.opaque: - return self.opaque - - if 'dns' == self.scheme: - s = self.scheme + ':' ###java version adds :// regardless of scheme - else: ###java version uses opaque type for dns urls, but this version supports dns urls - s = self.scheme + '://' - if surt: - s += "(" - - if self.authUser: - s += self.authUser - if self.authPass: - s += self.authPass - s += '@' + s = self.scheme + ':' hostSrc = self.host if public_suffix: hostSrc = self.getPublicSuffix() if surt: hostSrc = hostToSURT(hostSrc) - s += hostSrc - if self.port != self.DEFAULT_PORT: - s += ":%d" % self.port + if hostSrc: + if self.scheme != 'dns': + s += '//' - if surt: - if trailing_comma: - s += ',' - s += ')' + if surt: + s += "(" + + if self.authUser: + s += self.authUser + if self.authPass: + s += self.authPass + s += '@' + + s += hostSrc + + if self.port != self.DEFAULT_PORT: + s += ":%d" % self.port + + if surt: + if trailing_comma: + s += ',' + s += ')' hasPath = (None != self.path) and (len(self.path) > 0) if hasPath: @@ -320,7 +321,7 @@ def getPublicSuffix(self): # commented out because of http://bugs.python.org/issue5876 # "__repr__ returning unicode doesn't work when called implicitly" #def __repr__(self): - # return u"""handyurl(scheme=%s, authUser=%s, authPass=%s, host=%s, 
port=%s, path=%s, query=%s, hash=%s, opaque=%s)""".encode('utf-8') % (self.scheme, self.authUser, self.authPass, self.host, self.port, self.path, self.query, self.hash, self.opaque) + # return u"""handyurl(scheme=%s, authUser=%s, authPass=%s, host=%s, port=%s, path=%s, query=%s, hash=%s)""".encode('utf-8') % (self.scheme, self.authUser, self.authPass, self.host, self.port, self.path, self.query, self.hash) diff --git a/tox.ini b/tox.ini index fcbf2a9..8b6ba6d 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. [tox] -envlist = py26, py27, py32, py33, py34 +envlist = py26, py27, py33, py34 [testenv] commands = python setup.py test From 5c1e62318731d28d6675789f477ac11f1c1d316f Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 31 Aug 2015 19:12:33 +0000 Subject: [PATCH 08/32] bump version number so pip install --upgrade picks it up --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index d001bcd..e149dd7 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ def run_tests(self): setup(name='surt', - version='0.2', + version='0.3', author='rajbot', author_email='raj@archive.org', classifiers=[ From 8fea26bfe224e10e59626598e186b4381ac76208 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 7 Sep 2015 05:47:36 +0000 Subject: [PATCH 09/32] fix canonicalization of urls without authority --- surt/GoogleURLCanonicalizer.py | 69 +++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 31 deletions(-) diff --git a/surt/GoogleURLCanonicalizer.py b/surt/GoogleURLCanonicalizer.py index 3b83bb7..be8ad7d 100755 --- a/surt/GoogleURLCanonicalizer.py +++ b/surt/GoogleURLCanonicalizer.py @@ -129,6 +129,9 @@ def canonicalize(url, **_ignored): 'http://host.com/ab%23cd' >>> canonicalize(handyurl.parse("http://host.com//twoslashes?more//slashes")).getURLString() 'http://host.com/twoslashes?more//slashes' + + >>> canonicalize(handyurl.parse("mailto:foo@example.com")).getURLString() + 
'mailto:foo@example.com' """ url.hash = None @@ -139,42 +142,46 @@ def canonicalize(url, **_ignored): if url.query: url.query = minimalEscape(url.query) - hostE = unescapeRepeatedly(url.host) + if url.host: + hostE = unescapeRepeatedly(url.host) + + # if the host was an ascii string of percent-encoded bytes that represent + # non-ascii unicode chars, then promote hostE from str to unicode. + # e.g. "http://www.t%EF%BF%BD%04.82.net/", which contains the unicode replacement char + if isinstance(hostE, binary_type): + try: + hostE.decode('ascii') + except UnicodeDecodeError: + hostE = hostE.decode('utf-8', 'ignore') + - # if the host was an ascii string of percent-encoded bytes that represent - # non-ascii unicode chars, then promote hostE from str to unicode. - # e.g. "http://www.t%EF%BF%BD%04.82.net/", which contains the unicode replacement char - if isinstance(hostE, binary_type): + host = None try: - hostE.decode('ascii') - except UnicodeDecodeError: - hostE = hostE.decode('utf-8', 'ignore') - - - host = None - try: - # Note: I copied the use of the ToASCII(hostE) from - # the java code. This function implements RFC3490, which - # requires that each component of the hostname (i.e. each label) - # be encodeced separately, and doesn't work correctly with - # full hostnames. So use 'idna' encoding instead. - #host = encodings.idna.ToASCII(hostE) - host = hostE.encode('idna').decode('utf-8') - except ValueError: - host = hostE - - host = host.replace('..', '.').strip('.') - - ip = attemptIPFormats(host) - if ip: - host = ip; - else: - host = escapeOnce(host.lower()) + # Note: I copied the use of the ToASCII(hostE) from + # the java code. This function implements RFC3490, which + # requires that each component of the hostname (i.e. each label) + # be encodeced separately, and doesn't work correctly with + # full hostnames. So use 'idna' encoding instead. 
+ #host = encodings.idna.ToASCII(hostE) + host = hostE.encode('idna').decode('utf-8') + except ValueError: + host = hostE + + host = host.replace('..', '.').strip('.') + + ip = attemptIPFormats(host) + if ip: + host = ip; + else: + host = escapeOnce(host.lower()) - url.host = host + url.host = host path = unescapeRepeatedly(url.path) - url.path = escapeOnce(normalizePath(path)) + if url.host: + path = normalizePath(path) + # else path is free-form sort of thing, not /directory/thing + url.path = escapeOnce(path) return url From e2a2d796dbab84f5f4cbde9fbcffdb89ca13feb4 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 7 Sep 2015 06:11:50 +0000 Subject: [PATCH 10/32] another fix for urls without authority --- surt/handyurl.py | 9 +++++---- surt/surt.py | 3 +++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/surt/handyurl.py b/surt/handyurl.py index 4c65e88..3c13ce9 100755 --- a/surt/handyurl.py +++ b/surt/handyurl.py @@ -215,10 +215,11 @@ def getURLString(self, s = self.scheme + ':' hostSrc = self.host - if public_suffix: - hostSrc = self.getPublicSuffix() - if surt: - hostSrc = hostToSURT(hostSrc) + if hostSrc: + if public_suffix: + hostSrc = self.getPublicSuffix() + if surt: + hostSrc = hostToSURT(hostSrc) if hostSrc: if self.scheme != 'dns': diff --git a/surt/surt.py b/surt/surt.py index 6c10aac..d3798d6 100755 --- a/surt/surt.py +++ b/surt/surt.py @@ -110,6 +110,9 @@ def surt(url, canonicalizer=None, **options): Simple customization: >>> surt("http://www.example.com/", canonicalizer=lambda x, **opts: x) 'com,example,www)/' + + >>> surt("mailto:foo@example.com") + 'mailto:foo@example.com' """ if not url: From c8e1f94d4c388bbb0d1dc327e8c875abf6ecc9cf Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 15 Sep 2015 20:18:55 +0000 Subject: [PATCH 11/32] add option "with_scheme" to surt.surt() to produce surt with leading "scheme://(" --- surt/surt.py | 45 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 5 deletions(-) 
diff --git a/surt/surt.py b/surt/surt.py index d3798d6..9774ca9 100755 --- a/surt/surt.py +++ b/surt/surt.py @@ -113,6 +113,39 @@ def surt(url, canonicalizer=None, **options): >>> surt("mailto:foo@example.com") 'mailto:foo@example.com' + + >>> surt("http://www.example.com/", with_scheme=True) + 'http://(com,example)/' + + >>> surt("http://www.example.com/", with_scheme=True, host_massage=True) + 'http://(com,example)/' + + >>> surt("http://www.example.com/", with_scheme=False) + 'com,example)/' + + >>> surt("http://www.example.com/", with_scheme=True, trailing_comma=True) + 'http://(com,example,)/' + + >>> surt("https://www.example.com/", with_scheme=True, trailing_comma=True) + 'https://(com,example,)/' + + >>> surt("ftp://www.example.com/", with_scheme=True, trailing_comma=True) + 'ftp://(com,example,)/' + + >>> surt("http://www.example.com/", with_scheme=True, host_massage=False) + 'http://(com,example,www)/' + + >>> surt("http://www.example.com/", with_scheme=False, host_massage=False) + 'com,example,www)/' + + >>> surt("http://www.example.com/", with_scheme=True, trailing_comma=True, host_massage=False) + 'http://(com,example,www,)/' + + >>> surt("https://www.example.com/", with_scheme=True, trailing_comma=True, host_massage=False) + 'https://(com,example,www,)/' + + >>> surt("ftp://www.example.com/", with_scheme=True, trailing_comma=True, host_massage=False) + 'ftp://(com,example,www,)/' """ if not url: @@ -149,11 +182,13 @@ def surt(url, canonicalizer=None, **options): hurl = canonicalizer(handyurl.parse(url), **options) key = hurl.getURLString(**options) - parenIdx = key.find('(') - if -1 == parenIdx: - return url #something very wrong - - return key[parenIdx+1:] + if not options.get('with_scheme'): + parenIdx = key.find('(') + if -1 == parenIdx: + return url #something very wrong + return key[parenIdx+1:] + else: + return key # main() From d03b93284b39de6348b8ddf11b878f05b1b6ba7e Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 22 Sep 2015 23:34:04 
+0000 Subject: [PATCH 12/32] requirements are specified in setup.py --- requirements.txt | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 43349b8..0000000 --- a/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -tldextract==1.0 -pylint -pytest From 3ec34349c473331677f2d70ce9932054efb78597 Mon Sep 17 00:00:00 2001 From: Kenji Nagahashi Date: Fri, 23 Oct 2015 18:36:56 +0000 Subject: [PATCH 13/32] start 0.3b line --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 03df6fa..3bb52e7 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ from setuptools import setup setup(name='surt', - version='0.2', + version='0.3b1', author='rajbot', author_email='raj@archive.org', classifiers=[ From 38f6dfb260e10bccf566a9d3d55c774df03c2bde Mon Sep 17 00:00:00 2001 From: Kenji Nagahashi Date: Fri, 23 Oct 2015 18:38:12 +0000 Subject: [PATCH 14/32] Several performance improvements (~25% faster) - prepare regular expression outside of methods. - don't use RE if cheaper alternative exists.
- use simpler equivalent code, remove shortcut that in reality is slower - eliminate sub template compilation by passing a function --- surt/GoogleURLCanonicalizer.py | 9 ++++----- surt/IAURLCanonicalizer.py | 4 +++- surt/URLRegexTransformer.py | 33 +++++++++++++++------------------ surt/handyurl.py | 27 +++++++++++++++++---------- 4 files changed, 39 insertions(+), 34 deletions(-) diff --git a/surt/GoogleURLCanonicalizer.py b/surt/GoogleURLCanonicalizer.py index 7c7be91..d734f55 100755 --- a/surt/GoogleURLCanonicalizer.py +++ b/surt/GoogleURLCanonicalizer.py @@ -219,6 +219,9 @@ def normalizePath(path): return path +OCTAL_IP = re.compile(r"^(0[0-7]*)(\.[0-7]+)?(\.[0-7]+)?(\.[0-7]+)?$") +DECIMAL_IP = re.compile(r"^([1-9][0-9]*)(\.[0-9]+)?(\.[0-9]+)?(\.[0-9]+)?$") + # attemptIPFormats() #_______________________________________________________________________________ def attemptIPFormats(host): @@ -242,14 +245,10 @@ def attemptIPFormats(host): >>> attemptIPFormats("39024579298") '22.11.210.226' """ - - OCTAL_IP = re.compile("^(0[0-7]*)(\.[0-7]+)?(\.[0-7]+)?(\.[0-7]+)?$") - DECIMAL_IP = re.compile("^([1-9][0-9]*)(\.[0-9]+)?(\.[0-9]+)?(\.[0-9]+)?$") - if None == host: return None - if re.match("^\d+$", host): + if host.isdigit(): #mask hostname to lower four bytes to workaround issue with liveweb arc files return socket.inet_ntoa(struct.pack('>L', int(host) & 0xffffffff)) else: diff --git a/surt/IAURLCanonicalizer.py b/surt/IAURLCanonicalizer.py index 2bf7746..1613ae5 100755 --- a/surt/IAURLCanonicalizer.py +++ b/surt/IAURLCanonicalizer.py @@ -172,6 +172,8 @@ def alphaReorderQuery(orig): # massageHost() #_______________________________________________________________________________ +_RE_WWWDIGITS = re.compile('www\d*\.') + def massageHost(host): """These doctests are from IAURLCanonicalizerTest.java: @@ -188,7 +190,7 @@ def massageHost(host): 'www2foo.com' """ - m = re.match('www\d*\.', host) + m = _RE_WWWDIGITS.match(host) if m: return host[len(m.group(0)):] else: 
diff --git a/surt/URLRegexTransformer.py b/surt/URLRegexTransformer.py index 1949508..babe76e 100755 --- a/surt/URLRegexTransformer.py +++ b/surt/URLRegexTransformer.py @@ -30,6 +30,11 @@ # stripPathSessionID #_______________________________________________________________________________ +_RES_PATH_SESSIONID = [ + re.compile("^(.*/)(\((?:[a-z]\([0-9a-z]{24}\))+\)/)([^\?]+\.aspx.*)$", re.I), + re.compile("^(.*/)(\\([0-9a-z]{24}\\)/)([^\\?]+\\.aspx.*)$", re.I), + ] + def stripPathSessionID(path): """It looks like the java version returns a lowercased path.. So why does it uses a case-insensitive regex? We won't lowercase here. @@ -52,11 +57,7 @@ def stripPathSessionID(path): >>> stripPathSessionID("/photos/36050182@N05/") '/photos/36050182@N05/' """ - patterns = [re.compile("^(.*/)(\((?:[a-z]\([0-9a-z]{24}\))+\)/)([^\?]+\.aspx.*)$", re.I), - re.compile("^(.*/)(\\([0-9a-z]{24}\\)/)([^\\?]+\\.aspx.*)$", re.I), - ] - - for pattern in patterns: + for pattern in _RES_PATH_SESSIONID: m = pattern.match(path) if m: path = m.group(1) + m.group(3) @@ -66,6 +67,14 @@ def stripPathSessionID(path): # stripQuerySessionID #_______________________________________________________________________________ +_RES_QUERY_SESSIONID = [ + re.compile("^(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I), + re.compile("^(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I), + re.compile("^(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I), + re.compile("^(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", re.I), + re.compile("^(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", re.I), + ] + def stripQuerySessionID(path): """These doctests are from IAURLCanonicalizerTest.java: @@ -148,14 +157,7 @@ def stripQuerySessionID(path): '?requestID=200608200458360%2E39414378' """ - patterns = [re.compile("^(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I), - re.compile("^(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I), - re.compile("^(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I), - 
re.compile("^(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", re.I), - re.compile("^(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", re.I), - ] - - for pattern in patterns: + for pattern in _RES_QUERY_SESSIONID: m = pattern.match(path) if m: if m.group(2): @@ -175,12 +177,7 @@ def hostToSURT(host): """ # TODO: ensure we DONT reverse IP addresses! parts = host.split('.') - - if 1 == len(parts): - return host - parts.reverse() - return ','.join(parts) # main() diff --git a/surt/handyurl.py b/surt/handyurl.py index 06f355f..10b19be 100755 --- a/surt/handyurl.py +++ b/surt/handyurl.py @@ -25,6 +25,11 @@ from urlparse import urlsplit from URLRegexTransformer import hostToSURT +_RE_MULTIPLE_PROTOCOLS = re.compile(r'^(https?://)+') +_RE_HAS_PROTOCOL = re.compile("^(?:http|https|ftp|mms|rtsp|wais)://") +_RE_OPAQUE_URLS = re.compile("^(?:filedesc|warcinfo):") +_RE_SPACES = re.compile('[\n\r\t]') + class handyurl(object): """A python port of the archive-commons org.archive.url HandyURL class @@ -112,18 +117,22 @@ def parse(cls, url): >>> handyurl.parse("http://mineral.galleries.com:/minerals/silicate/chabazit/chabazit.htm").geturl() 'http://mineral.galleries.com/minerals/silicate/chabazit/chabazit.htm' """ + # Note RE_SPACES does not match regular space (0x20). That is, + # regular spaces are removed at head and tail, but not in the middle. + # There's a test case for GoogleURLCanonicalizer.canonicalize that + # asserts this behavior. url = url.strip() - url = re.sub('[\n\r\t]', '', url) + url = _RE_SPACES.sub('', url) ### DNS URLs are treated separately as opaque urls by URLParser.java # However, we want to surtify dns urls as well. 
- if re.match("^(filedesc|warcinfo):.*", url): + if _RE_OPAQUE_URLS.match(url): return cls(opaque=url) url = cls.addDefaultSchemeIfNeeded(url) #From Tymm: deal with http://https/order.1and1.com - url = re.sub('^(https?://)+', r'\1', url) + url = _RE_MULTIPLE_PROTOCOLS.sub(lambda m: m.group(1), url) """The java code seems to use this regex: re.compile("^(([a-zA-Z][a-zA-Z0-9\+\-\.]*):)?((//([^/?#]*))?([^?#]*)(\?([^#]*))?)?(#(.*))?") @@ -185,7 +194,7 @@ def addDefaultSchemeIfNeeded(cls, url): if url.startswith('dns:'): return url - if re.match("^(http|https|ftp|mms|rtsp|wais)://.*", url): + if _RE_HAS_PROTOCOL.match(url): return url else: return "http://"+url @@ -231,13 +240,11 @@ def getURLString(self, surt=False, public_suffix=False): if surt: s += ')' - hasPath = (None != self.path) and (len(self.path) > 0) - if hasPath: + if self.path: s += self.path - else: - if (None != self.query) or (None != self.hash): - #must have '/' with query or hash: - s += '/' + elif self.query is not None or self.hash is not None: + #must have '/' with query or hash: + s += '/' if None != self.query: s += '?' + self.query From 640a75664a1665818071ca45ef28e898ce0ab9b5 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 29 Oct 2015 18:04:49 +0000 Subject: [PATCH 15/32] fix double definition of getPublicSuffix, rename second one to correct getPublicPrefix (!!!) --- surt/handyurl.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/surt/handyurl.py b/surt/handyurl.py index 4969a26..75cb92f 100755 --- a/surt/handyurl.py +++ b/surt/handyurl.py @@ -299,26 +299,26 @@ def getPublicSuffix(self): # getPublicPrefix #___________________________________________________________________________ - def getPublicSuffix(self): + def getPublicPrefix(self): """Uses the tldextract module to get the subdomain, using the Public Suffix List. 
These doctests are based off the ones found in HandyURLTest.java: >>> h = handyurl(host='www.fool.com') - >>> h.getPublicSuffix() + >>> h.getPublicPrefix() 'www' >>> h = handyurl(host='www.amazon.co.uk') - >>> h.getPublicSuffix() + >>> h.getPublicPrefix() 'www' >>> h = handyurl(host='www.images.amazon.co.uk') - >>> h.getPublicSuffix() + >>> h.getPublicPrefix() 'www.images' >>> h = handyurl(host='funky-images.fancy.co.jp') - >>> h.getPublicSuffix() + >>> h.getPublicPrefix() 'funky-images' """ return tldextract.extract(self.host).subdomain From ffdcf38ed73d456b15c1ad3e5d282079fa0fde05 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 29 Oct 2015 19:00:18 +0000 Subject: [PATCH 16/32] replace doctests with regular tests to avoid character set encoding issues --- setup.py | 4 +- surt/DefaultIAURLCanonicalizer.py | 27 --- surt/GoogleURLCanonicalizer.py | 140 -------------- surt/IAURLCanonicalizer.py | 78 -------- surt/URLRegexTransformer.py | 112 ----------- surt/handyurl.py | 94 --------- surt/surt.py | 102 ---------- tests/test_surt.py | 308 ++++++++++++++++++++++++++++++ 8 files changed, 310 insertions(+), 555 deletions(-) create mode 100644 tests/test_surt.py diff --git a/setup.py b/setup.py index f653dc7..4031299 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ def finalize_options(self): def run_tests(self): import pytest import sys - cmdline = ' -v --doctest-modules --cov surt surt/' + cmdline = ' -v --doctest-modules --cov surt tests/' errcode = pytest.main(cmdline) sys.exit(errcode) @@ -33,7 +33,7 @@ def run_tests(self): packages=[ 'surt' ], scripts=[], # Tests - tests_require=[ 'pytest' ], + tests_require=[ 'pytest', 'pytest-cov' ], test_suite='', cmdclass={'test': PyTest}, ) diff --git a/surt/DefaultIAURLCanonicalizer.py b/surt/DefaultIAURLCanonicalizer.py index 11ae90e..5c0b862 100755 --- a/surt/DefaultIAURLCanonicalizer.py +++ b/surt/DefaultIAURLCanonicalizer.py @@ -21,9 +21,6 @@ """This is a python port of DefaultIAURLCanonicalizer.java: 
http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/DefaultIAURLCanonicalizer.java?view=markup - -The doctests are copied from DefaultIAURLCanonicalizerTest.java: -http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/test/java/org/archive/url/DefaultIAURLCanonicalizerTest.java?view=markup """ from __future__ import absolute_import @@ -35,33 +32,9 @@ #_______________________________________________________________________________ def canonicalize(url, **options): """The input url is a handyurl instance - - These doctests are from DefaultIAURLCanonicalizerTest.java: - - >>> from .handyurl import handyurl - >>> canonicalize(handyurl.parse("http://www.alexa.com/")).getURLString() - 'http://alexa.com/' - >>> canonicalize(handyurl.parse("http://archive.org/index.html")).getURLString() - 'http://archive.org/index.html' - >>> canonicalize(handyurl.parse("http://archive.org/index.html?")).getURLString() - 'http://archive.org/index.html' - >>> canonicalize(handyurl.parse("http://archive.org/index.html?a=b")).getURLString() - 'http://archive.org/index.html?a=b' - >>> canonicalize(handyurl.parse("http://archive.org/index.html?b=b&a=b")).getURLString() - 'http://archive.org/index.html?a=b&b=b' - >>> canonicalize(handyurl.parse("http://archive.org/index.html?b=a&b=b&a=b")).getURLString() - 'http://archive.org/index.html?a=b&b=a&b=b' - >>> canonicalize(handyurl.parse("http://www34.archive.org/index.html?b=a&b=b&a=b")).getURLString() - 'http://archive.org/index.html?a=b&b=a&b=b' """ url = surt.GoogleURLCanonicalizer.canonicalize(url, **options) url = surt.IAURLCanonicalizer.canonicalize(url, **options) return url - -# main() -#_______________________________________________________________________________ -if __name__ == "__main__": - import doctest - doctest.testmod() diff --git a/surt/GoogleURLCanonicalizer.py 
b/surt/GoogleURLCanonicalizer.py index 3f9049f..9757eb9 100755 --- a/surt/GoogleURLCanonicalizer.py +++ b/surt/GoogleURLCanonicalizer.py @@ -22,9 +22,6 @@ """This is a python port of GoogleURLCanonicalizer.java: http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/GoogleURLCanonicalizer.java?view=markup - -The doctests are copied from GoogleURLCanonicalizerTest.java: -http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/test/java/org/archive/url/GoogleURLCanonicalizerTest.java?view=markup """ from __future__ import absolute_import @@ -42,98 +39,6 @@ # unescapeRepeatedly() #_______________________________________________________________________________ def canonicalize(url, **_ignored): - u""" - >>> canonicalize(handyurl.parse("http://host/%25%32%35")).getURLString() - 'http://host/%25' - >>> canonicalize(handyurl.parse("http://host/%25%32%35%25%32%35")).getURLString() - 'http://host/%25%25' - >>> canonicalize(handyurl.parse("http://host/%2525252525252525")).getURLString() - 'http://host/%25' - >>> canonicalize(handyurl.parse("http://host/asdf%25%32%35asd")).getURLString() - 'http://host/asdf%25asd' - >>> canonicalize(handyurl.parse("http://host/%%%25%32%35asd%%")).getURLString() - 'http://host/%25%25%25asd%25%25' - >>> canonicalize(handyurl.parse("http://www.google.com/")).getURLString() - 'http://www.google.com/' - >>> canonicalize(handyurl.parse("http://%31%36%38%2e%31%38%38%2e%39%39%2e%32%36/%2E%73%65%63%75%72%65/%77%77%77%2E%65%62%61%79%2E%63%6F%6D/")).getURLString() - 'http://168.188.99.26/.secure/www.ebay.com/' - >>> canonicalize(handyurl.parse("http://195.127.0.11/uploads/%20%20%20%20/.verify/.eBaysecure=updateuserdataxplimnbqmn-xplmvalidateinfoswqpcmlx=hgplmcx/")).getURLString() - 'http://195.127.0.11/uploads/%20%20%20%20/.verify/.eBaysecure=updateuserdataxplimnbqmn-xplmvalidateinfoswqpcmlx=hgplmcx/' - >>> 
canonicalize(handyurl.parse("http://host%23.com/%257Ea%2521b%2540c%2523d%2524e%25f%255E00%252611%252A22%252833%252944_55%252B")).getURLString() - 'http://host%23.com/~a!b@c%23d$e%25f^00&11*22(33)44_55+' - >>> canonicalize(handyurl.parse("http://3279880203/blah")).getURLString() - 'http://195.127.0.11/blah' - >>> canonicalize(handyurl.parse("http://www.google.com/blah/..")).getURLString() - 'http://www.google.com/' - >>> canonicalize(handyurl.parse("www.google.com/")).getURLString() - 'http://www.google.com/' - >>> canonicalize(handyurl.parse("www.google.com")).getURLString() - 'http://www.google.com/' - >>> canonicalize(handyurl.parse("http://www.evil.com/blah#frag")).getURLString() - 'http://www.evil.com/blah' - >>> canonicalize(handyurl.parse("http://www.GOOgle.com/")).getURLString() - 'http://www.google.com/' - >>> canonicalize(handyurl.parse("http://www.google.com.../")).getURLString() - 'http://www.google.com/' - - #This works but the newline in the docstring messes up doctest - #>>> canonicalize(handyurl.parse("http://www.google.com/foo\tbar\rbaz\n2")).getURLString() - #'http://www.google.com/foobarbaz2' - - >>> canonicalize(handyurl.parse("http://www.google.com/q?")).getURLString() - 'http://www.google.com/q?' - >>> canonicalize(handyurl.parse("http://www.google.com/q?r?")).getURLString() - 'http://www.google.com/q?r?' - >>> canonicalize(handyurl.parse("http://www.google.com/q?r?s")).getURLString() - 'http://www.google.com/q?r?s' - >>> canonicalize(handyurl.parse("http://evil.com/foo#bar#baz")).getURLString() - 'http://evil.com/foo' - >>> canonicalize(handyurl.parse("http://evil.com/foo;")).getURLString() - 'http://evil.com/foo;' - >>> canonicalize(handyurl.parse("http://evil.com/foo?bar;")).getURLString() - 'http://evil.com/foo?bar;' - - #This test case differs from the Java version. The Java version returns - #'http://%01%80.com/' for this case. 
If idna/punycode encoding of a hostname - #is not possible, the python version encodes unicode domains as utf-8 before - #percent encoding, so we get 'http://%01%C2%80.com/' - >>> print(canonicalize(handyurl.parse(u"http://\u0001\u0080.com/")).getURLString()) - http://%01%C2%80.com/ - - #Add these unicode tests: - >>> print(canonicalize(handyurl.parse(u'B\xfccher.ch:8080')).getURLString()) - http://xn--bcher-kva.ch:8080/ - >>> url = '☃.com' #doctest has trouble with utf-8 encoding - >>> print(canonicalize(handyurl.parse(url)).getURLString()) - http://xn--n3h.com/ - - #Add these percent-encoded unicode tests - >>> canonicalize(handyurl.parse("http://www.t%EF%BF%BD%04.82.net/")).getURLString() - 'http://www.t%EF%BF%BD%04.82.net/' - - >>> canonicalize(handyurl.parse("http://notrailingslash.com")).getURLString() - 'http://notrailingslash.com/' - >>> canonicalize(handyurl.parse("http://www.gotaport.com:1234/")).getURLString() - 'http://www.gotaport.com:1234/' - >>> canonicalize(handyurl.parse(" http://www.google.com/ ")).getURLString() - 'http://www.google.com/' - >>> canonicalize(handyurl.parse("http:// leadingspace.com/")).getURLString() - 'http://%20leadingspace.com/' - >>> canonicalize(handyurl.parse("http://%20leadingspace.com/")).getURLString() - 'http://%20leadingspace.com/' - >>> canonicalize(handyurl.parse("%20leadingspace.com/")).getURLString() - 'http://%20leadingspace.com/' - >>> canonicalize(handyurl.parse("https://www.securesite.com/")).getURLString() - 'https://www.securesite.com/' - >>> canonicalize(handyurl.parse("http://host.com/ab%23cd")).getURLString() - 'http://host.com/ab%23cd' - >>> canonicalize(handyurl.parse("http://host.com//twoslashes?more//slashes")).getURLString() - 'http://host.com/twoslashes?more//slashes' - - >>> canonicalize(handyurl.parse("mailto:foo@example.com")).getURLString() - 'mailto:foo@example.com' - """ - url.hash = None if url.authUser: url.authUser = minimalEscape(url.authUser) @@ -236,26 +141,6 @@ def normalizePath(path): # 
attemptIPFormats() #_______________________________________________________________________________ def attemptIPFormats(host): - """ - The doctests are copied from GoogleURLCanonicalizerTest.java: - - >>> attemptIPFormats(None) - >>> attemptIPFormats("www.foo.com") #returns None - >>> attemptIPFormats("127.0.0.1") - '127.0.0.1' - >>> attemptIPFormats("017.0.0.1") - '15.0.0.1' - >>> attemptIPFormats("168.188.99.26") - '168.188.99.26' - >>> attemptIPFormats("10.0.258") #java version returns null, ours returns the correct ipv4 - '10.0.1.2' - >>> attemptIPFormats("1.2.3.256") #returns None - - ARC files from the wayback machine's liveweb proxy contain numeric - hostnames > 2^32 for some reason. We'll copy the behavior of the java code. - >>> attemptIPFormats("39024579298") - '22.11.210.226' - """ if None == host: return None @@ -304,26 +189,6 @@ def escapeOnce(input): # unescapeRepeatedly() #_______________________________________________________________________________ def unescapeRepeatedly(input): - """ - The doctests are copied from GoogleURLCanonicalizerTest.java: - - >>> unescapeRepeatedly("%!A%21%21%25") - '%!A!!%' - >>> unescapeRepeatedly("%") - '%' - >>> unescapeRepeatedly("%2") - '%2' - >>> unescapeRepeatedly("%25") - '%' - >>> unescapeRepeatedly("%25%") - '%%' - >>> unescapeRepeatedly("%2525") - '%' - >>> unescapeRepeatedly("%252525") - '%' - >>> unescapeRepeatedly("%25%32%35") - '%' - """ if None == input: return None @@ -333,8 +198,3 @@ def unescapeRepeatedly(input): return input input = un -# main() -#_______________________________________________________________________________ -if __name__ == "__main__": - import doctest - doctest.testmod() diff --git a/surt/IAURLCanonicalizer.py b/surt/IAURLCanonicalizer.py index 142e8fa..0603d6a 100755 --- a/surt/IAURLCanonicalizer.py +++ b/surt/IAURLCanonicalizer.py @@ -21,9 +21,6 @@ """This is a python port of IAURLCanonicalizer.java: 
http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/IAURLCanonicalizer.java?view=markup - -The doctests are copied from IAURLCanonicalizerTest.java: -http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/test/java/org/archive/url/IAURLCanonicalizerTest.java?view=markup """ from __future__ import absolute_import @@ -44,23 +41,6 @@ def canonicalize(url, host_lowercase=True, host_massage=True, query_strip_empty=True, query_alpha_reorder=True, hash_strip=True, **_ignored): """The input url is a handyurl instance - - These doctests are from IAURLCanonicalizerTest.java: - - >>> canonicalize(handyurl.parse("http://ARCHIVE.ORG/")).getURLString() - 'http://archive.org/' - >>> canonicalize(handyurl.parse("http://www.archive.org:80/")).getURLString() - 'http://archive.org/' - >>> canonicalize(handyurl.parse("https://www.archive.org:80/")).getURLString() - 'https://archive.org:80/' - >>> canonicalize(handyurl.parse("http://www.archive.org:443/")).getURLString() - 'http://archive.org:443/' - >>> canonicalize(handyurl.parse("https://www.archive.org:443/")).getURLString() - 'https://archive.org/' - >>> canonicalize(handyurl.parse("http://www.archive.org/big/")).getURLString() - 'http://archive.org/big' - >>> canonicalize(handyurl.parse("dns:www.archive.org")).getURLString() - 'dns:www.archive.org' """ if host_lowercase and url.host: url.host = url.host.lower() @@ -122,34 +102,6 @@ def alphaReorderQuery(orig): """It's a shame that we can't use urlparse.parse_qsl() for this, but this function does keeps the trailing '=' if there is a query arg with no value: "?foo" vs "?foo=", and we want to exactly match the java version - - These doctests are from IAURLCanonicalizerTest.java: - - >>> alphaReorderQuery(None) - >>> alphaReorderQuery("") - '' - >>> alphaReorderQuery("") - '' - >>> alphaReorderQuery("a") - 'a' - >>> 
alphaReorderQuery("ab") - 'ab' - >>> alphaReorderQuery("a=1") - 'a=1' - >>> alphaReorderQuery("ab=1") - 'ab=1' - >>> alphaReorderQuery("a=1&") - '&a=1' - >>> alphaReorderQuery("a=1&b=1") - 'a=1&b=1' - >>> alphaReorderQuery("b=1&a=1") - 'a=1&b=1' - >>> alphaReorderQuery("a=a&a=a") - 'a=a&a=a' - >>> alphaReorderQuery("a=b&a=a") - 'a=a&a=b' - >>> alphaReorderQuery("b=b&a=b&b=a&a=a") - 'a=a&a=b&b=a&b=b' """ @@ -178,21 +130,6 @@ def alphaReorderQuery(orig): _RE_WWWDIGITS = re.compile('www\d*\.') def massageHost(host): - """These doctests are from IAURLCanonicalizerTest.java: - - >>> massageHost("foo.com") - 'foo.com' - >>> massageHost("www.foo.com") - 'foo.com' - >>> massageHost("www12.foo.com") - 'foo.com' - - >>> massageHost("www2foo.com") - 'www2foo.com' - >>> massageHost("www2.www2foo.com") - 'www2foo.com' - """ - m = _RE_WWWDIGITS.match(host) if m: return host[len(m.group(0)):] @@ -202,15 +139,6 @@ def massageHost(host): # getDefaultPort() #_______________________________________________________________________________ def getDefaultPort(scheme): - """These doctests are from IAURLCanonicalizerTest.java: - - >>> getDefaultPort("foo") - 0 - >>> getDefaultPort("http") - 80 - >>> getDefaultPort("https") - 443 - """ scheme_lower = scheme.lower() if 'http' == scheme_lower: return 80 @@ -219,9 +147,3 @@ def getDefaultPort(scheme): else: return 0 -# main() -#_______________________________________________________________________________ -if __name__ == "__main__": - import doctest - doctest.testmod() - diff --git a/surt/URLRegexTransformer.py b/surt/URLRegexTransformer.py index babe76e..04f829b 100755 --- a/surt/URLRegexTransformer.py +++ b/surt/URLRegexTransformer.py @@ -21,9 +21,6 @@ """This is a python port of URLRegexTransformer.java: http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/URLRegexTransformer.java?view=markup - -The doctests are copied from URLRegexTransformerTest.java: 
-http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/test/java/org/archive/url/URLRegexTransformerTest.java?view=markup """ import re @@ -38,24 +35,6 @@ def stripPathSessionID(path): """It looks like the java version returns a lowercased path.. So why does it uses a case-insensitive regex? We won't lowercase here. - - These doctests are from IAURLCanonicalizerTest.java: - - Check ASP_SESSIONID2: - >>> stripPathSessionID("/(S(4hqa0555fwsecu455xqckv45))/mileg.aspx") - '/mileg.aspx' - - Check ASP_SESSIONID2 (again): - >>> stripPathSessionID("/(4hqa0555fwsecu455xqckv45)/mileg.aspx") - '/mileg.aspx' - - Check ASP_SESSIONID3: - >>> stripPathSessionID("/(a(4hqa0555fwsecu455xqckv45)S(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=sessionschedules") - '/mileg.aspx?page=sessionschedules' - - '@' in path: - >>> stripPathSessionID("/photos/36050182@N05/") - '/photos/36050182@N05/' """ for pattern in _RES_PATH_SESSIONID: m = pattern.match(path) @@ -76,87 +55,6 @@ def stripPathSessionID(path): ] def stripQuerySessionID(path): - """These doctests are from IAURLCanonicalizerTest.java: - - >>> #base = "http://www.archive.org/index.html" - >>> base = "" - >>> str32id = "0123456789abcdefghijklemopqrstuv" - >>> url = base + "?jsessionid=" + str32id - >>> stripQuerySessionID(url) - '?' - - Test that we don't strip if not 32 chars only. - >>> url = base + "?jsessionid=" + str32id + '0' - >>> stripQuerySessionID(url) - '?jsessionid=0123456789abcdefghijklemopqrstuv0' - - Test what happens when followed by another key/value pair. - >>> url = base + "?jsessionid=" + str32id + "&x=y" - >>> stripQuerySessionID(url) - '?x=y' - - Test what happens when followed by another key/value pair and - prefixed by a key/value pair. - >>> url = base + "?one=two&jsessionid=" + str32id + "&x=y" - >>> stripQuerySessionID(url) - '?one=two&x=y' - - Test what happens when prefixed by a key/value pair. 
- >>> url = base + "?one=two&jsessionid=" + str32id - >>> stripQuerySessionID(url) - '?one=two&' - - Test aspsession. - >>> url = base + "?aspsessionidABCDEFGH=" + "ABCDEFGHIJKLMNOPQRSTUVWX" + "&x=y" - >>> stripQuerySessionID(url) - '?x=y' - - Test archive phpsession. - >>> url = base + "?phpsessid=" + str32id + "&x=y" - >>> stripQuerySessionID(url) - '?x=y' - - With prefix too. - >>> url = base + "?one=two&phpsessid=" + str32id + "&x=y" - >>> stripQuerySessionID(url) - '?one=two&x=y' - - With only prefix - >>> url = base + "?one=two&phpsessid=" + str32id - >>> stripQuerySessionID(url) - '?one=two&' - - Test sid. - >>> url = base + "?" + "sid=9682993c8daa2c5497996114facdc805" + "&x=y"; - >>> stripQuerySessionID(url) - '?x=y' - - Igor test. - >>> url = base + "?" + "sid=9682993c8daa2c5497996114facdc805" + "&" + "jsessionid=" + str32id - >>> stripQuerySessionID(url) - '?' - - >>> url = "?CFID=1169580&CFTOKEN=48630702&dtstamp=22%2F08%2F2006%7C06%3A58%3A11" - >>> stripQuerySessionID(url) - '?dtstamp=22%2F08%2F2006%7C06%3A58%3A11' - - >>> url = "?CFID=12412453&CFTOKEN=15501799&dt=19_08_2006_22_39_28" - >>> stripQuerySessionID(url) - '?dt=19_08_2006_22_39_28' - - >>> url = "?CFID=14475712&CFTOKEN=2D89F5AF-3048-2957-DA4EE4B6B13661AB&r=468710288378&m=forgotten" - >>> stripQuerySessionID(url) - '?r=468710288378&m=forgotten' - - >>> url = "?CFID=16603925&CFTOKEN=2AE13EEE-3048-85B0-56CEDAAB0ACA44B8" - >>> stripQuerySessionID(url) - '?' 
- - >>> url = "?CFID=4308017&CFTOKEN=63914124&requestID=200608200458360%2E39414378" - >>> stripQuerySessionID(url) - '?requestID=200608200458360%2E39414378' - - """ for pattern in _RES_QUERY_SESSIONID: m = pattern.match(path) if m: @@ -171,18 +69,8 @@ def stripQuerySessionID(path): # hostToSURT #_______________________________________________________________________________ def hostToSURT(host): - """This doctest comes from IAURLCanonicalizerTest.java: - >>> hostToSURT("www.archive.org") - 'org,archive,www' - """ # TODO: ensure we DONT reverse IP addresses! parts = host.split('.') parts.reverse() return ','.join(parts) -# main() -#_______________________________________________________________________________ -if __name__ == "__main__": - import doctest - doctest.testmod() - diff --git a/surt/handyurl.py b/surt/handyurl.py index 75cb92f..d2fb984 100755 --- a/surt/handyurl.py +++ b/surt/handyurl.py @@ -80,56 +80,6 @@ def parse(cls, url): u"""This method was in the java URLParser class, but we don't need a whole class to parse a url, when we can just use python's urlparse. - These doctests come from URLParserTest.java: - - >>> handyurl.parse("http://www.archive.org/index.html#foo").geturl() - 'http://www.archive.org/index.html#foo' - - >>> handyurl.parse("http://www.archive.org/").geturl() - 'http://www.archive.org/' - - >>> handyurl.parse("http://www.archive.org").geturl() - 'http://www.archive.org' - - >>> handyurl.parse("http://www.archive.org?").geturl() - 'http://www.archive.org?' 
- - >>> handyurl.parse("http://www.archive.org:8080/index.html?query#foo").geturl() - 'http://www.archive.org:8080/index.html?query#foo' - - >>> handyurl.parse("http://www.archive.org:8080/index.html?#foo").geturl() - 'http://www.archive.org:8080/index.html#foo' - - >>> handyurl.parse("http://www.archive.org:8080?#foo").geturl() - 'http://www.archive.org:8080/#foo' - - >>> handyurl.parse(u"http://bücher.ch:8080?#foo").geturl() - 'http://bücher.ch:8080/#foo' - - >>> handyurl.parse(u"dns:bücher.ch").geturl() - 'dns:bücher.ch' - - >>> print(handyurl.parse(u"http://bücher.ch:8080?#foo").geturl()) - http://b\xfccher.ch:8080/#foo - - >>> print(handyurl.parse(u"dns:bücher.ch").geturl()) - dns:b\xfccher.ch - - ###From Tymm: - >>> handyurl.parse("http:////////////////www.vikings.com").geturl() - 'http://www.vikings.com/' - >>> handyurl.parse("http://https://order.1and1.com").geturl() - 'https://order.1and1.com' - - ###From Common Crawl, host ends with ':' without a port number - >>> handyurl.parse("http://mineral.galleries.com:/minerals/silicate/chabazit/chabazit.htm").geturl() - 'http://mineral.galleries.com/minerals/silicate/chabazit/chabazit.htm' - - >>> handyurl.parse("mailto:bot@archive.org").scheme - 'mailto' - - >>> handyurl.parse("mailto:bot@archive.org").geturl() - 'mailto:bot@archive.org' """ # Note RE_SPACES does not match regular space (0x20). That is, # regular spaces are removed at head and tail, but not in the middle. @@ -273,26 +223,7 @@ def getURLString(self, def getPublicSuffix(self): """Uses the tldextract module to get the public suffix via the Public Suffix List. 
- - These doctests are based off the ones found in HandyURLTest.java: - - >>> h = handyurl(host='www.fool.com') - >>> h.getPublicSuffix() - 'fool.com' - - >>> h = handyurl(host='www.amazon.co.uk') - >>> h.getPublicSuffix() - 'amazon.co.uk' - - >>> h = handyurl(host='www.images.amazon.co.uk') - >>> h.getPublicSuffix() - 'amazon.co.uk' - - >>> h = handyurl(host='funky-images.fancy.co.jp') - >>> h.getPublicSuffix() - 'fancy.co.jp' """ - r = tldextract.extract(self.host) return "%s.%s" % (r.domain, r.tld) @@ -302,24 +233,6 @@ def getPublicSuffix(self): def getPublicPrefix(self): """Uses the tldextract module to get the subdomain, using the Public Suffix List. - - These doctests are based off the ones found in HandyURLTest.java: - - >>> h = handyurl(host='www.fool.com') - >>> h.getPublicPrefix() - 'www' - - >>> h = handyurl(host='www.amazon.co.uk') - >>> h.getPublicPrefix() - 'www' - - >>> h = handyurl(host='www.images.amazon.co.uk') - >>> h.getPublicPrefix() - 'www.images' - - >>> h = handyurl(host='funky-images.fancy.co.jp') - >>> h.getPublicPrefix() - 'funky-images' """ return tldextract.extract(self.host).subdomain @@ -330,10 +243,3 @@ def getPublicPrefix(self): #def __repr__(self): # return u"""handyurl(scheme=%s, authUser=%s, authPass=%s, host=%s, port=%s, path=%s, query=%s, hash=%s)""".encode('utf-8') % (self.scheme, self.authUser, self.authPass, self.host, self.port, self.path, self.query, self.hash) - - -# main() -#_______________________________________________________________________________ -if __name__ == "__main__": - import doctest - doctest.testmod() diff --git a/surt/surt.py b/surt/surt.py index 9774ca9..8a937bd 100755 --- a/surt/surt.py +++ b/surt/surt.py @@ -52,102 +52,6 @@ def _normalize(canonicalizer): # surt() #_______________________________________________________________________________ def surt(url, canonicalizer=None, **options): - """ - These doctests are from WaybackURLKeyMakerTest.java - - >>> surt(None) - '-' - >>> surt('') - '-' - >>> 
surt("filedesc:foo.arc.gz") - 'filedesc:foo.arc.gz' - >>> surt("filedesc:/foo.arc.gz") - 'filedesc:/foo.arc.gz' - >>> surt("filedesc://foo.arc.gz") - 'filedesc://foo.arc.gz' - >>> surt("warcinfo:foo.warc.gz") - 'warcinfo:foo.warc.gz' - >>> surt("dns:alexa.com") - 'com,alexa)' - >>> surt("dns:archive.org") - 'org,archive)' - - >>> surt("http://www.archive.org/") - 'org,archive)/' - >>> surt("http://archive.org/") - 'org,archive)/' - >>> surt("http://archive.org/goo/") - 'org,archive)/goo' - >>> surt("http://archive.org/goo/?") - 'org,archive)/goo' - >>> surt("http://archive.org/goo/?b&a") - 'org,archive)/goo?a&b' - >>> surt("http://archive.org/goo/?a=2&b&a=1") - 'org,archive)/goo?a=1&a=2&b' - - # trailing comma mode - >>> surt("http://archive.org/goo/?a=2&b&a=1", trailing_comma=True) - 'org,archive,)/goo?a=1&a=2&b' - - >>> surt("dns:archive.org", trailing_comma=True) - 'org,archive,)' - - >>> surt("warcinfo:foo.warc.gz", trailing_comma=True) - 'warcinfo:foo.warc.gz' - - PHP session id: - >>> surt("http://archive.org/index.php?PHPSESSID=0123456789abcdefghijklemopqrstuv&action=profile;u=4221") - 'org,archive)/index.php?action=profile;u=4221' - - WHOIS url: - >>> surt("whois://whois.isoc.org.il/shaveh.co.il") - 'whois://whois.isoc.org.il/shaveh.co.il' - - Yahoo web bug. 
See https://github.com/internetarchive/surt/issues/1 - >>> surt('http://visit.webhosting.yahoo.com/visit.gif?&r=http%3A//web.archive.org/web/20090517140029/http%3A//anthonystewarthead.electric-chi.com/&b=Netscape%205.0%20%28Windows%3B%20en-US%29&s=1366x768&o=Win32&c=24&j=true&v=1.2') - 'com,yahoo,webhosting,visit)/visit.gif?&b=netscape%205.0%20(windows;%20en-us)&c=24&j=true&o=win32&r=http://web.archive.org/web/20090517140029/http://anthonystewarthead.electric-chi.com/&s=1366x768&v=1.2' - - Simple customization: - >>> surt("http://www.example.com/", canonicalizer=lambda x, **opts: x) - 'com,example,www)/' - - >>> surt("mailto:foo@example.com") - 'mailto:foo@example.com' - - >>> surt("http://www.example.com/", with_scheme=True) - 'http://(com,example)/' - - >>> surt("http://www.example.com/", with_scheme=True, host_massage=True) - 'http://(com,example)/' - - >>> surt("http://www.example.com/", with_scheme=False) - 'com,example)/' - - >>> surt("http://www.example.com/", with_scheme=True, trailing_comma=True) - 'http://(com,example,)/' - - >>> surt("https://www.example.com/", with_scheme=True, trailing_comma=True) - 'https://(com,example,)/' - - >>> surt("ftp://www.example.com/", with_scheme=True, trailing_comma=True) - 'ftp://(com,example,)/' - - >>> surt("http://www.example.com/", with_scheme=True, host_massage=False) - 'http://(com,example,www)/' - - >>> surt("http://www.example.com/", with_scheme=False, host_massage=False) - 'com,example,www)/' - - >>> surt("http://www.example.com/", with_scheme=True, trailing_comma=True, host_massage=False) - 'http://(com,example,www,)/' - - >>> surt("https://www.example.com/", with_scheme=True, trailing_comma=True, host_massage=False) - 'https://(com,example,www,)/' - - >>> surt("ftp://www.example.com/", with_scheme=True, trailing_comma=True, host_massage=False) - 'ftp://(com,example,www,)/' - """ - if not url: return "-" @@ -190,9 +94,3 @@ def surt(url, canonicalizer=None, **options): else: return key - -# main() 
-#_______________________________________________________________________________ -if __name__ == "__main__": - import doctest - doctest.testmod() diff --git a/tests/test_surt.py b/tests/test_surt.py new file mode 100644 index 0000000..5e8461e --- /dev/null +++ b/tests/test_surt.py @@ -0,0 +1,308 @@ +# -*- coding: utf-8 -*- + +from __future__ import absolute_import + +import surt +from surt import handyurl + +def test_handyurl_parse(): + # These tests come from URLParserTest.java + assert handyurl.parse("http://www.archive.org/index.html#foo").geturl() == 'http://www.archive.org/index.html#foo' + assert handyurl.parse("http://www.archive.org/").geturl() == 'http://www.archive.org/' + assert handyurl.parse("http://www.archive.org").geturl() == 'http://www.archive.org' + assert handyurl.parse("http://www.archive.org?").geturl() == 'http://www.archive.org?' + assert handyurl.parse("http://www.archive.org:8080/index.html?query#foo").geturl() == 'http://www.archive.org:8080/index.html?query#foo' + assert handyurl.parse("http://www.archive.org:8080/index.html?#foo").geturl() == 'http://www.archive.org:8080/index.html#foo' + assert handyurl.parse("http://www.archive.org:8080?#foo").geturl() == 'http://www.archive.org:8080/#foo' + assert handyurl.parse(u"http://bücher.ch:8080?#foo").geturl() == u'http://bücher.ch:8080/#foo' + assert handyurl.parse(u"dns:bücher.ch").geturl() == u'dns:bücher.ch' + # XXX assert print(handyurl.parse(u"http://bücher.ch:8080?#foo").geturl()) == http://b\xfccher.ch:8080/#foo + # XXX assert print(handyurl.parse(u"dns:bücher.ch").geturl()) == dns:b\xfccher.ch + assert handyurl.parse(u"http://bücher.ch:8080?#foo").geturl() == u"http://b\xfccher.ch:8080/#foo" + assert handyurl.parse(u"dns:bücher.ch").geturl() == u"dns:b\xfccher.ch" + + ###From Tymm: + assert handyurl.parse("http:////////////////www.vikings.com").geturl() == 'http://www.vikings.com/' + assert handyurl.parse("http://https://order.1and1.com").geturl() == 'https://order.1and1.com' + + 
###From Common Crawl, host ends with ':' without a port number + assert handyurl.parse("http://mineral.galleries.com:/minerals/silicate/chabazit/chabazit.htm").geturl() == 'http://mineral.galleries.com/minerals/silicate/chabazit/chabazit.htm' + + assert handyurl.parse("mailto:bot@archive.org").scheme == 'mailto' + assert handyurl.parse("mailto:bot@archive.org").geturl() == 'mailto:bot@archive.org' + +def test_getPublicSuffix(): + # These tests are based off the ones found in HandyURLTest.java + assert handyurl(host='www.fool.com').getPublicSuffix() == 'fool.com' + assert handyurl(host='www.amazon.co.uk').getPublicSuffix() == 'amazon.co.uk' + assert handyurl(host='www.images.amazon.co.uk').getPublicSuffix() == 'amazon.co.uk' + assert handyurl(host='funky-images.fancy.co.jp').getPublicSuffix() == 'fancy.co.jp' + +def test_getPublicPrefix(): + # These tests are based off the ones found in HandyURLTest.java + assert handyurl(host='www.fool.com').getPublicPrefix() == 'www' + assert handyurl(host='www.amazon.co.uk').getPublicPrefix() == 'www' + assert handyurl(host='www.images.amazon.co.uk').getPublicPrefix() == 'www.images' + assert handyurl(host='funky-images.fancy.co.jp').getPublicPrefix() == 'funky-images' + +def test_DefaultIAURLCanonicalizer(): + # These tests are from DefaultIAURLCanonicalizerTest.java + assert surt.DefaultIAURLCanonicalizer.canonicalize(handyurl.parse("http://www.alexa.com/")).getURLString() == 'http://alexa.com/' + assert surt.DefaultIAURLCanonicalizer.canonicalize(handyurl.parse("http://archive.org/index.html")).getURLString() == 'http://archive.org/index.html' + assert surt.DefaultIAURLCanonicalizer.canonicalize(handyurl.parse("http://archive.org/index.html?")).getURLString() == 'http://archive.org/index.html' + assert surt.DefaultIAURLCanonicalizer.canonicalize(handyurl.parse("http://archive.org/index.html?a=b")).getURLString() == 'http://archive.org/index.html?a=b' + assert 
surt.DefaultIAURLCanonicalizer.canonicalize(handyurl.parse("http://archive.org/index.html?b=b&a=b")).getURLString() == 'http://archive.org/index.html?a=b&b=b' + assert surt.DefaultIAURLCanonicalizer.canonicalize(handyurl.parse("http://archive.org/index.html?b=a&b=b&a=b")).getURLString() == 'http://archive.org/index.html?a=b&b=a&b=b' + assert surt.DefaultIAURLCanonicalizer.canonicalize(handyurl.parse("http://www34.archive.org/index.html?b=a&b=b&a=b")).getURLString() == 'http://archive.org/index.html?a=b&b=a&b=b' + +def test_GoogleURLCanonicalizer(): + # The tests are copied from GoogleURLCanonicalizerTest.java + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://host/%25%32%35")).getURLString() == 'http://host/%25' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://host/%25%32%35%25%32%35")).getURLString() == 'http://host/%25%25' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://host/%2525252525252525")).getURLString() == 'http://host/%25' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://host/asdf%25%32%35asd")).getURLString() == 'http://host/asdf%25asd' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://host/%%%25%32%35asd%%")).getURLString() == 'http://host/%25%25%25asd%25%25' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://www.google.com/")).getURLString() == 'http://www.google.com/' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://%31%36%38%2e%31%38%38%2e%39%39%2e%32%36/%2E%73%65%63%75%72%65/%77%77%77%2E%65%62%61%79%2E%63%6F%6D/")).getURLString() == 'http://168.188.99.26/.secure/www.ebay.com/' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://195.127.0.11/uploads/%20%20%20%20/.verify/.eBaysecure=updateuserdataxplimnbqmn-xplmvalidateinfoswqpcmlx=hgplmcx/")).getURLString() == 
'http://195.127.0.11/uploads/%20%20%20%20/.verify/.eBaysecure=updateuserdataxplimnbqmn-xplmvalidateinfoswqpcmlx=hgplmcx/' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://host%23.com/%257Ea%2521b%2540c%2523d%2524e%25f%255E00%252611%252A22%252833%252944_55%252B")).getURLString() == 'http://host%23.com/~a!b@c%23d$e%25f^00&11*22(33)44_55+' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://3279880203/blah")).getURLString() == 'http://195.127.0.11/blah' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://www.google.com/blah/..")).getURLString() == 'http://www.google.com/' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("www.google.com/")).getURLString() == 'http://www.google.com/' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("www.google.com")).getURLString() == 'http://www.google.com/' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://www.evil.com/blah#frag")).getURLString() == 'http://www.evil.com/blah' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://www.GOOgle.com/")).getURLString() == 'http://www.google.com/' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://www.google.com.../")).getURLString() == 'http://www.google.com/' + + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://www.google.com/foo\tbar\rbaz\n2")).getURLString() == 'http://www.google.com/foobarbaz2' + + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://www.google.com/q?")).getURLString() == 'http://www.google.com/q?' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://www.google.com/q?r?")).getURLString() == 'http://www.google.com/q?r?' 
+ assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://www.google.com/q?r?s")).getURLString() == 'http://www.google.com/q?r?s' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://evil.com/foo#bar#baz")).getURLString() == 'http://evil.com/foo' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://evil.com/foo;")).getURLString() == 'http://evil.com/foo;' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://evil.com/foo?bar;")).getURLString() == 'http://evil.com/foo?bar;' + + #This test case differs from the Java version. The Java version returns + #'http://%01%80.com/' for this case. If idna/punycode encoding of a hostname + #is not possible, the python version encodes unicode domains as utf-8 before + #percent encoding, so we get 'http://%01%C2%80.com/' + # assert print(canonicalize(handyurl.parse(u"http://\u0001\u0080.com/")).getURLString()) http://%01%C2%80.com/ + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse(u"http://\u0001\u0080.com/")).getURLString() == 'http://%01%C2%80.com/' + + #Add these unicode tests: + # assert print(canonicalize(handyurl.parse(u'B\xfccher.ch:8080')).getURLString()) http://xn--bcher-kva.ch:8080/ + # assert print(canonicalize(handyurl.parse('☃.com')).getURLString()) == http://xn--n3h.com/ + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse(u'B\xfccher.ch:8080')).getURLString() == 'http://xn--bcher-kva.ch:8080/' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse('☃.com')).getURLString() == 'http://xn--n3h.com/' + + #Add these percent-encoded unicode tests + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://www.t%EF%BF%BD%04.82.net/")).getURLString() == 'http://www.t%EF%BF%BD%04.82.net/' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://notrailingslash.com")).getURLString() == 'http://notrailingslash.com/' + assert 
surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://www.gotaport.com:1234/")).getURLString() == 'http://www.gotaport.com:1234/' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse(" http://www.google.com/ ")).getURLString() == 'http://www.google.com/' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http:// leadingspace.com/")).getURLString() == 'http://%20leadingspace.com/' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://%20leadingspace.com/")).getURLString() == 'http://%20leadingspace.com/' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("%20leadingspace.com/")).getURLString() == 'http://%20leadingspace.com/' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("https://www.securesite.com/")).getURLString() == 'https://www.securesite.com/' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://host.com/ab%23cd")).getURLString() == 'http://host.com/ab%23cd' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://host.com//twoslashes?more//slashes")).getURLString() == 'http://host.com/twoslashes?more//slashes' + assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("mailto:foo@example.com")).getURLString() == 'mailto:foo@example.com' + +def test_attemptIPFormats(): + # The tests are copied from GoogleURLCanonicalizerTest.java + assert surt.GoogleURLCanonicalizer.attemptIPFormats(None) is None + assert surt.GoogleURLCanonicalizer.attemptIPFormats("www.foo.com") is None + assert surt.GoogleURLCanonicalizer.attemptIPFormats("127.0.0.1") == '127.0.0.1' + assert surt.GoogleURLCanonicalizer.attemptIPFormats("017.0.0.1") == '15.0.0.1' + assert surt.GoogleURLCanonicalizer.attemptIPFormats("168.188.99.26") == '168.188.99.26' + #java version returns null, ours returns the correct ipv4 + assert surt.GoogleURLCanonicalizer.attemptIPFormats("10.0.258") == '10.0.1.2' + assert surt.GoogleURLCanonicalizer.attemptIPFormats("1.2.3.256") is None 
#returns None + + # ARC files from the wayback machine's liveweb proxy contain numeric + # hostnames > 2^32 for some reason. We'll copy the behavior of the java code. + assert surt.GoogleURLCanonicalizer.attemptIPFormats("39024579298") == '22.11.210.226' + +def test_unescapeRepeatedly(): + # The tests are copied from GoogleURLCanonicalizerTest.java + assert surt.GoogleURLCanonicalizer.unescapeRepeatedly("%!A%21%21%25") == '%!A!!%' + assert surt.GoogleURLCanonicalizer.unescapeRepeatedly("%") == '%' + assert surt.GoogleURLCanonicalizer.unescapeRepeatedly("%2") == '%2' + assert surt.GoogleURLCanonicalizer.unescapeRepeatedly("%25") == '%' + assert surt.GoogleURLCanonicalizer.unescapeRepeatedly("%25%") == '%%' + assert surt.GoogleURLCanonicalizer.unescapeRepeatedly("%2525") == '%' + assert surt.GoogleURLCanonicalizer.unescapeRepeatedly("%252525") == '%' + assert surt.GoogleURLCanonicalizer.unescapeRepeatedly("%25%32%35") == '%' + +def test_IAURLCanonicalizer(): + # These tests are from IAURLCanonicalizerTest.java + assert surt.IAURLCanonicalizer.canonicalize(handyurl.parse("http://ARCHIVE.ORG/")).getURLString() == 'http://archive.org/' + assert surt.IAURLCanonicalizer.canonicalize(handyurl.parse("http://www.archive.org:80/")).getURLString() == 'http://archive.org/' + assert surt.IAURLCanonicalizer.canonicalize(handyurl.parse("https://www.archive.org:80/")).getURLString() == 'https://archive.org:80/' + assert surt.IAURLCanonicalizer.canonicalize(handyurl.parse("http://www.archive.org:443/")).getURLString() == 'http://archive.org:443/' + assert surt.IAURLCanonicalizer.canonicalize(handyurl.parse("https://www.archive.org:443/")).getURLString() == 'https://archive.org/' + assert surt.IAURLCanonicalizer.canonicalize(handyurl.parse("http://www.archive.org/big/")).getURLString() == 'http://archive.org/big' + assert surt.IAURLCanonicalizer.canonicalize(handyurl.parse("dns:www.archive.org")).getURLString() == 'dns:www.archive.org' + +def test_alphaReorderQuery(): + # These tests 
are from IAURLCanonicalizerTest.java + assert surt.IAURLCanonicalizer.alphaReorderQuery(None) is None + assert surt.IAURLCanonicalizer.alphaReorderQuery("") == '' + assert surt.IAURLCanonicalizer.alphaReorderQuery("") == '' + assert surt.IAURLCanonicalizer.alphaReorderQuery("a") == 'a' + assert surt.IAURLCanonicalizer.alphaReorderQuery("ab") == 'ab' + assert surt.IAURLCanonicalizer.alphaReorderQuery("a=1") == 'a=1' + assert surt.IAURLCanonicalizer.alphaReorderQuery("ab=1") == 'ab=1' + assert surt.IAURLCanonicalizer.alphaReorderQuery("a=1&") == '&a=1' + assert surt.IAURLCanonicalizer.alphaReorderQuery("a=1&b=1") == 'a=1&b=1' + assert surt.IAURLCanonicalizer.alphaReorderQuery("b=1&a=1") == 'a=1&b=1' + assert surt.IAURLCanonicalizer.alphaReorderQuery("a=a&a=a") == 'a=a&a=a' + assert surt.IAURLCanonicalizer.alphaReorderQuery("a=b&a=a") == 'a=a&a=b' + assert surt.IAURLCanonicalizer.alphaReorderQuery("b=b&a=b&b=a&a=a") == 'a=a&a=b&b=a&b=b' + +def test_massageHost(): + # These tests are from IAURLCanonicalizerTest.java + assert surt.IAURLCanonicalizer.massageHost("foo.com") == 'foo.com' + assert surt.IAURLCanonicalizer.massageHost("www.foo.com") == 'foo.com' + assert surt.IAURLCanonicalizer.massageHost("www12.foo.com") == 'foo.com' + + assert surt.IAURLCanonicalizer.massageHost("www2foo.com") == 'www2foo.com' + assert surt.IAURLCanonicalizer.massageHost("www2.www2foo.com") == 'www2foo.com' + +def test_getDefaultPort(): + # These tests are from IAURLCanonicalizerTest.java + assert surt.IAURLCanonicalizer.getDefaultPort("foo") == 0 + assert surt.IAURLCanonicalizer.getDefaultPort("http") == 80 + assert surt.IAURLCanonicalizer.getDefaultPort("https") == 443 + +def test_stripPathSessionID(): + # These tests are from IAURLCanonicalizerTest.java + # Check ASP_SESSIONID2: + assert surt.URLRegexTransformer.stripPathSessionID("/(S(4hqa0555fwsecu455xqckv45))/mileg.aspx") == '/mileg.aspx' + + # Check ASP_SESSIONID2 (again): + assert 
surt.URLRegexTransformer.stripPathSessionID("/(4hqa0555fwsecu455xqckv45)/mileg.aspx") == '/mileg.aspx' + + # Check ASP_SESSIONID3: + assert surt.URLRegexTransformer.stripPathSessionID("/(a(4hqa0555fwsecu455xqckv45)S(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=sessionschedules") == '/mileg.aspx?page=sessionschedules' + + # '@' in path: + assert surt.URLRegexTransformer.stripPathSessionID("/photos/36050182@N05/") == '/photos/36050182@N05/' + + +def test_stripQuerySessionID(): + #base = "http://www.archive.org/index.html" + base = "" + str32id = "0123456789abcdefghijklemopqrstuv" + url = base + "?jsessionid=" + str32id + assert surt.URLRegexTransformer.stripQuerySessionID(url) == '?' + + # Test that we don't strip if not 32 chars only. + url = base + "?jsessionid=" + str32id + '0' + assert surt.URLRegexTransformer.stripQuerySessionID(url) == '?jsessionid=0123456789abcdefghijklemopqrstuv0' + + # Test what happens when followed by another key/value pair. + url = base + "?jsessionid=" + str32id + "&x=y" + assert surt.URLRegexTransformer.stripQuerySessionID(url) == '?x=y' + + # Test what happens when followed by another key/value pair and + # prefixed by a key/value pair. + url = base + "?one=two&jsessionid=" + str32id + "&x=y" + assert surt.URLRegexTransformer.stripQuerySessionID(url) == '?one=two&x=y' + + # Test what happens when prefixed by a key/value pair. + url = base + "?one=two&jsessionid=" + str32id + assert surt.URLRegexTransformer.stripQuerySessionID(url) == '?one=two&' + + # Test aspsession. + url = base + "?aspsessionidABCDEFGH=" + "ABCDEFGHIJKLMNOPQRSTUVWX" + "&x=y" + assert surt.URLRegexTransformer.stripQuerySessionID(url) == '?x=y' + + # Test archive phpsession. + url = base + "?phpsessid=" + str32id + "&x=y" + assert surt.URLRegexTransformer.stripQuerySessionID(url) == '?x=y' + + # With prefix too. 
+ url = base + "?one=two&phpsessid=" + str32id + "&x=y" + assert surt.URLRegexTransformer.stripQuerySessionID(url) == '?one=two&x=y' + + # With only prefix + url = base + "?one=two&phpsessid=" + str32id + assert surt.URLRegexTransformer.stripQuerySessionID(url) == '?one=two&' + + # Test sid. + url = base + "?" + "sid=9682993c8daa2c5497996114facdc805" + "&x=y"; + assert surt.URLRegexTransformer.stripQuerySessionID(url) == '?x=y' + + # Igor test. + url = base + "?" + "sid=9682993c8daa2c5497996114facdc805" + "&" + "jsessionid=" + str32id + assert surt.URLRegexTransformer.stripQuerySessionID(url) == '?' + + url = "?CFID=1169580&CFTOKEN=48630702&dtstamp=22%2F08%2F2006%7C06%3A58%3A11" + assert surt.URLRegexTransformer.stripQuerySessionID(url) == '?dtstamp=22%2F08%2F2006%7C06%3A58%3A11' + + url = "?CFID=12412453&CFTOKEN=15501799&dt=19_08_2006_22_39_28" + assert surt.URLRegexTransformer.stripQuerySessionID(url) == '?dt=19_08_2006_22_39_28' + + url = "?CFID=14475712&CFTOKEN=2D89F5AF-3048-2957-DA4EE4B6B13661AB&r=468710288378&m=forgotten" + assert surt.URLRegexTransformer.stripQuerySessionID(url) == '?r=468710288378&m=forgotten' + + url = "?CFID=16603925&CFTOKEN=2AE13EEE-3048-85B0-56CEDAAB0ACA44B8" + assert surt.URLRegexTransformer.stripQuerySessionID(url) == '?' 
+ + url = "?CFID=4308017&CFTOKEN=63914124&requestID=200608200458360%2E39414378" + assert surt.URLRegexTransformer.stripQuerySessionID(url) == '?requestID=200608200458360%2E39414378' + +def test_hostToSURT(): + assert surt.URLRegexTransformer.hostToSURT("www.archive.org") == 'org,archive,www' + + +def test_surt(): + # These tests are from WaybackURLKeyMakerTest.java + + assert surt.surt(None) == '-' + assert surt.surt('') == '-' + assert surt.surt("filedesc:foo.arc.gz") == 'filedesc:foo.arc.gz' + assert surt.surt("filedesc:/foo.arc.gz") == 'filedesc:/foo.arc.gz' + assert surt.surt("filedesc://foo.arc.gz") == 'filedesc://foo.arc.gz' + assert surt.surt("warcinfo:foo.warc.gz") == 'warcinfo:foo.warc.gz' + assert surt.surt("dns:alexa.com") == 'com,alexa)' + assert surt.surt("dns:archive.org") == 'org,archive)' + + assert surt.surt("http://www.archive.org/") == 'org,archive)/' + assert surt.surt("http://archive.org/") == 'org,archive)/' + assert surt.surt("http://archive.org/goo/") == 'org,archive)/goo' + assert surt.surt("http://archive.org/goo/?") == 'org,archive)/goo' + assert surt.surt("http://archive.org/goo/?b&a") == 'org,archive)/goo?a&b' + assert surt.surt("http://archive.org/goo/?a=2&b&a=1") == 'org,archive)/goo?a=1&a=2&b' + + # trailing comma mode + assert surt.surt("http://archive.org/goo/?a=2&b&a=1", trailing_comma=True) == 'org,archive,)/goo?a=1&a=2&b' + assert surt.surt("dns:archive.org", trailing_comma=True) == 'org,archive,)' + assert surt.surt("warcinfo:foo.warc.gz", trailing_comma=True) == 'warcinfo:foo.warc.gz' + + # PHP session id: + assert surt.surt("http://archive.org/index.php?PHPSESSID=0123456789abcdefghijklemopqrstuv&action=profile;u=4221") == 'org,archive)/index.php?action=profile;u=4221' + + # WHOIS url: + assert surt.surt("whois://whois.isoc.org.il/shaveh.co.il") == 'whois://whois.isoc.org.il/shaveh.co.il' + + # Yahoo web bug. 
See https://github.com/internetarchive/surt/issues/1 + assert surt.surt('http://visit.webhosting.yahoo.com/visit.gif?&r=http%3A//web.archive.org/web/20090517140029/http%3A//anthonystewarthead.electric-chi.com/&b=Netscape%205.0%20%28Windows%3B%20en-US%29&s=1366x768&o=Win32&c=24&j=true&v=1.2') == 'com,yahoo,webhosting,visit)/visit.gif?&b=netscape%205.0%20(windows;%20en-us)&c=24&j=true&o=win32&r=http://web.archive.org/web/20090517140029/http://anthonystewarthead.electric-chi.com/&s=1366x768&v=1.2' + + # Simple customization: + assert surt.surt("http://www.example.com/", canonicalizer=lambda x, **opts: x) == 'com,example,www)/' + assert surt.surt("mailto:foo@example.com") == 'mailto:foo@example.com' + assert surt.surt("http://www.example.com/", with_scheme=True) == 'http://(com,example)/' + assert surt.surt("http://www.example.com/", with_scheme=True, host_massage=True) == 'http://(com,example)/' + assert surt.surt("http://www.example.com/", with_scheme=False) == 'com,example)/' + assert surt.surt("http://www.example.com/", with_scheme=True, trailing_comma=True) == 'http://(com,example,)/' + assert surt.surt("https://www.example.com/", with_scheme=True, trailing_comma=True) == 'https://(com,example,)/' + assert surt.surt("ftp://www.example.com/", with_scheme=True, trailing_comma=True) == 'ftp://(com,example,)/' + assert surt.surt("http://www.example.com/", with_scheme=True, host_massage=False) == 'http://(com,example,www)/' + assert surt.surt("http://www.example.com/", with_scheme=False, host_massage=False) == 'com,example,www)/' + assert surt.surt("http://www.example.com/", with_scheme=True, trailing_comma=True, host_massage=False) == 'http://(com,example,www,)/' + assert surt.surt("https://www.example.com/", with_scheme=True, trailing_comma=True, host_massage=False) == 'https://(com,example,www,)/' + assert surt.surt("ftp://www.example.com/", with_scheme=True, trailing_comma=True, host_massage=False) == 'ftp://(com,example,www,)/' + From 
f1aac724bb6f1602e92f3aeb35181713bc9f4b1e Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 29 Oct 2015 19:06:35 +0000 Subject: [PATCH 17/32] remove python 3.2 test which fails due to https://github.com/menegazzo/travispy/issues/20 --- .travis.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index d6fb14a..db0b8e6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,7 +9,6 @@ language: python env: - TOXENV=py26 - TOXENV=py27 - - TOXENV=py32 - TOXENV=py33 - TOXENV=py34 From c16d673b7a463bc6209a818839eea6820210bbfc Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 29 Oct 2015 19:10:13 +0000 Subject: [PATCH 18/32] let's see if tests pass on travis-ci for all these versions of python... --- .travis.yml | 4 ++++ tox.ini | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index db0b8e6..db0d07c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,6 +11,10 @@ env: - TOXENV=py27 - TOXENV=py33 - TOXENV=py34 + - TOXENV=py35 + - TOXENV=jython + - TOXENV=pypy + - TOXENV=pypy3 before_install: - sudo apt-get update diff --git a/tox.ini b/tox.ini index 8b6ba6d..99aefa0 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. 
[tox] -envlist = py26, py27, py33, py34 +envlist = py26, py27, py33, py34, py35, jython, pypy, pypy3 [testenv] commands = python setup.py test From dc74830ba008388a9624e84cca123697dac50b61 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 29 Oct 2015 19:19:35 +0000 Subject: [PATCH 19/32] fiddling with how tests run --- .travis.yml | 27 +++++++++++---------------- setup.py | 2 +- tox.ini | 4 ++-- 3 files changed, 14 insertions(+), 19 deletions(-) diff --git a/.travis.yml b/.travis.yml index db0d07c..1922529 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,20 +1,15 @@ -# vim: set sw=4 et: -# -# tox approach stolen from -# https://github.com/pypa/pip/blob/abdb597dbfb51b21cc76c1cff068b72c80f3a77d/.travis.yml -# - language: python -env: - - TOXENV=py26 - - TOXENV=py27 - - TOXENV=py33 - - TOXENV=py34 - - TOXENV=py35 - - TOXENV=jython - - TOXENV=pypy - - TOXENV=pypy3 +python: + - 2.6 + - 2.7 + - 3.3 + - 3.4 + - 3.5 + - 3.5-dev # 3.5 development branch + - nightly # currently points to 3.6-dev + - pypy # currently points to 3.6-dev + - pypy3 # currently points to 3.6-dev before_install: - sudo apt-get update @@ -23,7 +18,7 @@ before_install: before_script: - pip install tox -script: tox +script: py.test -v --cov surt tests/ #after_success: #coveralls diff --git a/setup.py b/setup.py index 4031299..4993c29 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ def finalize_options(self): def run_tests(self): import pytest import sys - cmdline = ' -v --doctest-modules --cov surt tests/' + cmdline = ' -v --cov surt tests/' errcode = pytest.main(cmdline) sys.exit(errcode) diff --git a/tox.ini b/tox.ini index 99aefa0..fe18a1c 100644 --- a/tox.ini +++ b/tox.ini @@ -4,10 +4,10 @@ # and then run "tox" from this directory. 
[tox] -envlist = py26, py27, py33, py34, py35, jython, pypy, pypy3 +envlist = py26, py27, py33, py34, py35, pypy, pypy3 [testenv] -commands = python setup.py test +commands = py.test -v --cov surt tests/ deps = pytest pytest-cov From e4ef467e369207d50a4fe3909eac5b0db500cef6 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 29 Oct 2015 20:13:41 +0000 Subject: [PATCH 20/32] try again travis --- .travis.yml | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/.travis.yml b/.travis.yml index 1922529..71851af 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,14 +11,9 @@ python: - pypy # currently points to 3.6-dev - pypy3 # currently points to 3.6-dev -before_install: - - sudo apt-get update - - pip install coveralls --use-mirrors +script: + - py.test -v --cov surt tests/ -before_script: - - pip install tox +install: + - pip install . pytest coveralls -script: py.test -v --cov surt tests/ - -#after_success: - #coveralls From a2073fac829214f311eae9c278c31a096a668407 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 29 Oct 2015 20:25:28 +0000 Subject: [PATCH 21/32] fix coveralls dependency --- .travis.yml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index 71851af..7c38dd1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,9 +11,6 @@ python: - pypy # currently points to 3.6-dev - pypy3 # currently points to 3.6-dev -script: - - py.test -v --cov surt tests/ - -install: - - pip install . pytest coveralls +install: pip install . 
pytest pytest-cov +script: py.test -v --cov=surt tests/ From 64e992997e16074a87ed43e75e634715082c4506 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 30 Oct 2015 00:37:24 +0000 Subject: [PATCH 22/32] remove incorrect comments --- .travis.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index 7c38dd1..2e51b89 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,10 +6,10 @@ python: - 3.3 - 3.4 - 3.5 - - 3.5-dev # 3.5 development branch - - nightly # currently points to 3.6-dev - - pypy # currently points to 3.6-dev - - pypy3 # currently points to 3.6-dev + - 3.5-dev + - nightly + - pypy + - pypy3 install: pip install . pytest pytest-cov script: py.test -v --cov=surt tests/ From 524677944c5cf9259757da23909448323bc471b4 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 2 Nov 2015 21:46:28 +0000 Subject: [PATCH 23/32] update links --- README.md | 8 ++++---- setup.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 772bbd1..afd1665 100644 --- a/README.md +++ b/README.md @@ -12,17 +12,17 @@ Installation: Or install the dev version from git: - pip install git+git://github.com/rajbot/surt#egg=surt + pip install git+https://github.com/internetarchive/surt.git#egg=surt More information about SURTs: http://crawler.archive.org/articles/user_manual/glossary.html#surt -This is mostly a python port of the archive-commons org.archive.url package. +This is mostly a python port of the webarchive-commons org.archive.url package. The original java version of the org.archive.url package is here: -http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/ +https://github.com/iipc/webarchive-commons/tree/master/src/main/java/org/archive/url This module depends on the `tldextract` module to query the Public Suffix List. 
`tldextract` can be installed via `pip` -[![Build Status](https://secure.travis-ci.org/rajbot/surt.png?branch=master)](http://travis-ci.org/rajbot/surt) +[![Build Status](https://travis-ci.org/internetarchive/surt.svg)](https://travis-ci.org/internetarchive/surt) diff --git a/setup.py b/setup.py index 4993c29..d8ae970 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ def run_tests(self): ], description='Sort-friendly URI Reordering Transform (SURT) python package.', long_description=open('README.md').read(), - url='https://github.com/rajbot/surt', + url='https://github.com/internetarchive/surt', zip_safe=True, install_requires=[ 'six', From 9348d098bfd743623f77a1b8fc399300bbe902fa Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 2 Nov 2015 21:50:18 +0000 Subject: [PATCH 24/32] switch to restructuredtext for pypi --- README.md => README.rst | 23 ++++++++++++++++------- setup.py | 2 +- 2 files changed, 17 insertions(+), 8 deletions(-) rename README.md => README.rst (54%) diff --git a/README.md b/README.rst similarity index 54% rename from README.md rename to README.rst index afd1665..251337c 100644 --- a/README.md +++ b/README.rst @@ -2,27 +2,36 @@ Sort-friendly URI Reordering Transform (SURT) python package. Usage: +:: + >>> from surt import surt >>> surt("http://archive.org/goo/?a=2&b&a=1") 'org,archive)/goo?a=1&a=2&b' Installation: +:: + pip install surt Or install the dev version from git: - pip install git+https://github.com/internetarchive/surt.git#egg=surt +:: + pip install git+https://github.com/internetarchive/surt.git#egg=surt More information about SURTs: -http://crawler.archive.org/articles/user_manual/glossary.html#surt +http://crawler.archive.org/articles/user\_manual/glossary.html#surt -This is mostly a python port of the webarchive-commons org.archive.url package. -The original java version of the org.archive.url package is here: +This is mostly a python port of the webarchive-commons org.archive.url +package. 
The original java version of the org.archive.url package is +here: https://github.com/iipc/webarchive-commons/tree/master/src/main/java/org/archive/url -This module depends on the `tldextract` module to query the Public Suffix -List. `tldextract` can be installed via `pip` +This module depends on the ``tldextract`` module to query the Public +Suffix List. ``tldextract`` can be installed via ``pip`` + +|Build Status| -[![Build Status](https://travis-ci.org/internetarchive/surt.svg)](https://travis-ci.org/internetarchive/surt) +.. |Build Status| image:: https://travis-ci.org/internetarchive/surt.svg + :target: https://travis-ci.org/internetarchive/surt diff --git a/setup.py b/setup.py index d8ae970..5755168 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ def run_tests(self): 'License :: OSI Approved :: GNU Affero General Public License v3', ], description='Sort-friendly URI Reordering Transform (SURT) python package.', - long_description=open('README.md').read(), + long_description=open('README.rst').read(), url='https://github.com/internetarchive/surt', zip_safe=True, install_requires=[ From 732ddd3e195062fa93e54a7cf251daa1995634ee Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 2 Nov 2015 21:52:57 +0000 Subject: [PATCH 25/32] switch to new travis-ci docker based build --- .travis.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 2e51b89..41411eb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,7 @@ -language: python +# http://docs.travis-ci.com/user/migrating-from-legacy/ +sudo: false +language: python python: - 2.6 - 2.7 From 94cf1414e8746671be6545c9bd5c79c8ad308e34 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 16 Dec 2015 00:04:15 +0000 Subject: [PATCH 26/32] remove special handling of dns: and warcinfo: urls, which have no authority section --- setup.py | 2 +- surt/handyurl.py | 13 ++----------- surt/surt.py | 13 ------------- tests/test_surt.py | 22 ++++++++++++++++++---- 4 files 
changed, 21 insertions(+), 29 deletions(-) diff --git a/setup.py b/setup.py index 5755168..85d06ef 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ def run_tests(self): setup(name='surt', - version='0.3b2', + version='0.3b3', author='rajbot', author_email='raj@archive.org', classifiers=[ diff --git a/surt/handyurl.py b/surt/handyurl.py index d2fb984..866268c 100755 --- a/surt/handyurl.py +++ b/surt/handyurl.py @@ -107,17 +107,8 @@ def parse(cls, url): o = o._replace(netloc=o.netloc.rstrip(':')) port = o.port or None - """One more special-case for dns urls or broken http urls. From the docs: - Following the syntax specifications in RFC 1808, urlparse recognizes - a netloc only if it is properly introduced by ‘//’. Otherwise the input - is presumed to be a relative URL and thus to start with a path component. - """ - if 'dns' == scheme: - hostname = o.path or None - path = None - else: - hostname = o.hostname or None - path = o.path or None + hostname = o.hostname or None + path = o.path or None if scheme.startswith('http'): #deal with "http:////////////////www.vikings.com" diff --git a/surt/surt.py b/surt/surt.py index 8a937bd..829a27c 100755 --- a/surt/surt.py +++ b/surt/surt.py @@ -58,19 +58,6 @@ def surt(url, canonicalizer=None, **options): if url.startswith("filedesc"): return url - if url.startswith("warcinfo"): - return url - - if url.startswith("dns:"): - res = hostToSURT(url[4:]) - if options.get('trailing_comma'): - res += ',' - res += ')' - return res - - if url.startswith("whois://"): - return url - if canonicalizer is None: canonicalizer = DefaultIAURLCanonicalizer.canonicalize else: diff --git a/tests/test_surt.py b/tests/test_surt.py index 5e8461e..39aa211 100644 --- a/tests/test_surt.py +++ b/tests/test_surt.py @@ -267,8 +267,8 @@ def test_surt(): assert surt.surt("filedesc:/foo.arc.gz") == 'filedesc:/foo.arc.gz' assert surt.surt("filedesc://foo.arc.gz") == 'filedesc://foo.arc.gz' assert surt.surt("warcinfo:foo.warc.gz") == 
'warcinfo:foo.warc.gz' - assert surt.surt("dns:alexa.com") == 'com,alexa)' - assert surt.surt("dns:archive.org") == 'org,archive)' + assert surt.surt("dns:alexa.com") == 'dns:alexa.com' + assert surt.surt("dns:archive.org") == 'dns:archive.org' assert surt.surt("http://www.archive.org/") == 'org,archive)/' assert surt.surt("http://archive.org/") == 'org,archive)/' @@ -279,14 +279,14 @@ def test_surt(): # trailing comma mode assert surt.surt("http://archive.org/goo/?a=2&b&a=1", trailing_comma=True) == 'org,archive,)/goo?a=1&a=2&b' - assert surt.surt("dns:archive.org", trailing_comma=True) == 'org,archive,)' + assert surt.surt("dns:archive.org", trailing_comma=True) == 'dns:archive.org' assert surt.surt("warcinfo:foo.warc.gz", trailing_comma=True) == 'warcinfo:foo.warc.gz' # PHP session id: assert surt.surt("http://archive.org/index.php?PHPSESSID=0123456789abcdefghijklemopqrstuv&action=profile;u=4221") == 'org,archive)/index.php?action=profile;u=4221' # WHOIS url: - assert surt.surt("whois://whois.isoc.org.il/shaveh.co.il") == 'whois://whois.isoc.org.il/shaveh.co.il' + assert surt.surt("whois://whois.isoc.org.il/shaveh.co.il") == 'il,org,isoc,whois)/shaveh.co.il' # Yahoo web bug. 
See https://github.com/internetarchive/surt/issues/1 assert surt.surt('http://visit.webhosting.yahoo.com/visit.gif?&r=http%3A//web.archive.org/web/20090517140029/http%3A//anthonystewarthead.electric-chi.com/&b=Netscape%205.0%20%28Windows%3B%20en-US%29&s=1366x768&o=Win32&c=24&j=true&v=1.2') == 'com,yahoo,webhosting,visit)/visit.gif?&b=netscape%205.0%20(windows;%20en-us)&c=24&j=true&o=win32&r=http://web.archive.org/web/20090517140029/http://anthonystewarthead.electric-chi.com/&s=1366x768&v=1.2' @@ -299,6 +299,8 @@ def test_surt(): assert surt.surt("http://www.example.com/", with_scheme=False) == 'com,example)/' assert surt.surt("http://www.example.com/", with_scheme=True, trailing_comma=True) == 'http://(com,example,)/' assert surt.surt("https://www.example.com/", with_scheme=True, trailing_comma=True) == 'https://(com,example,)/' + assert surt.surt("ftp://www.example.com/", with_scheme=False, trailing_comma=True) == 'com,example,)/' + assert surt.surt("ftp://www.example.com/", with_scheme=False, trailing_comma=False) == 'com,example)/' assert surt.surt("ftp://www.example.com/", with_scheme=True, trailing_comma=True) == 'ftp://(com,example,)/' assert surt.surt("http://www.example.com/", with_scheme=True, host_massage=False) == 'http://(com,example,www)/' assert surt.surt("http://www.example.com/", with_scheme=False, host_massage=False) == 'com,example,www)/' @@ -306,3 +308,15 @@ def test_surt(): assert surt.surt("https://www.example.com/", with_scheme=True, trailing_comma=True, host_massage=False) == 'https://(com,example,www,)/' assert surt.surt("ftp://www.example.com/", with_scheme=True, trailing_comma=True, host_massage=False) == 'ftp://(com,example,www,)/' + assert surt.surt("mailto:foo@example.com", with_scheme=True) == 'mailto:foo@example.com' + assert surt.surt("mailto:foo@example.com", trailing_comma=True) == 'mailto:foo@example.com' + assert surt.surt("mailto:foo@example.com", with_scheme=True, trailing_comma=True) == 'mailto:foo@example.com' + assert 
surt.surt("dns:archive.org", with_scheme=True) == 'dns:archive.org' + assert surt.surt("dns:archive.org", trailing_comma=True) == 'dns:archive.org' + assert surt.surt("dns:archive.org", with_scheme=True, trailing_comma=True) == 'dns:archive.org' + assert surt.surt("whois://whois.isoc.org.il/shaveh.co.il", with_scheme=True) == 'whois://(il,org,isoc,whois)/shaveh.co.il' + assert surt.surt("whois://whois.isoc.org.il/shaveh.co.il", trailing_comma=True) == 'il,org,isoc,whois,)/shaveh.co.il' + assert surt.surt("whois://whois.isoc.org.il/shaveh.co.il", trailing_comma=True, with_scheme=True) == 'whois://(il,org,isoc,whois,)/shaveh.co.il' + assert surt.surt("warcinfo:foo.warc.gz", trailing_comma=True) == 'warcinfo:foo.warc.gz' + assert surt.surt("warcinfo:foo.warc.gz", with_scheme=True) == 'warcinfo:foo.warc.gz' + assert surt.surt("warcinfo:foo.warc.gz", with_scheme=True, trailing_comma=True) == 'warcinfo:foo.warc.gz' From 4b7d4eda1215c64d087e7df7c70efcc27eb8716c Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 18 Dec 2015 23:36:20 +0000 Subject: [PATCH 27/32] bump version to 0.3b4 to work around pypi strangeness -- https://pypi.python.org/pypi?:action=display&name=surt&version=0.3b3 exists but missing from https://pypi.python.org/simple/surt/ --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 85d06ef..72400f8 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ def run_tests(self): setup(name='surt', - version='0.3b3', + version='0.3b4', author='rajbot', author_email='raj@archive.org', classifiers=[ From e97fd14f10e26cf30ddbf5be1fb390313f2ec862 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 7 Jan 2016 19:58:55 +0000 Subject: [PATCH 28/32] chmod a-x surt/*.py --- surt/DefaultIAURLCanonicalizer.py | 0 surt/GoogleURLCanonicalizer.py | 0 surt/IAURLCanonicalizer.py | 0 surt/URLRegexTransformer.py | 0 surt/handyurl.py | 0 surt/surt.py | 0 6 files changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 
100644 surt/DefaultIAURLCanonicalizer.py mode change 100755 => 100644 surt/GoogleURLCanonicalizer.py mode change 100755 => 100644 surt/IAURLCanonicalizer.py mode change 100755 => 100644 surt/URLRegexTransformer.py mode change 100755 => 100644 surt/handyurl.py mode change 100755 => 100644 surt/surt.py diff --git a/surt/DefaultIAURLCanonicalizer.py b/surt/DefaultIAURLCanonicalizer.py old mode 100755 new mode 100644 diff --git a/surt/GoogleURLCanonicalizer.py b/surt/GoogleURLCanonicalizer.py old mode 100755 new mode 100644 diff --git a/surt/IAURLCanonicalizer.py b/surt/IAURLCanonicalizer.py old mode 100755 new mode 100644 diff --git a/surt/URLRegexTransformer.py b/surt/URLRegexTransformer.py old mode 100755 new mode 100644 diff --git a/surt/handyurl.py b/surt/handyurl.py old mode 100755 new mode 100644 diff --git a/surt/surt.py b/surt/surt.py old mode 100755 new mode 100644 From 53af06c17d73126ae8f2ed7bd4066fd07888b1e7 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 18 Feb 2016 20:15:09 +0000 Subject: [PATCH 29/32] some more tests --- tests/test_surt.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/test_surt.py b/tests/test_surt.py index 39aa211..cd694b6 100644 --- a/tests/test_surt.py +++ b/tests/test_surt.py @@ -257,7 +257,6 @@ def test_stripQuerySessionID(): def test_hostToSURT(): assert surt.URLRegexTransformer.hostToSURT("www.archive.org") == 'org,archive,www' - def test_surt(): # These tests are from WaybackURLKeyMakerTest.java @@ -320,3 +319,9 @@ def test_surt(): assert surt.surt("warcinfo:foo.warc.gz", trailing_comma=True) == 'warcinfo:foo.warc.gz' assert surt.surt("warcinfo:foo.warc.gz", with_scheme=True) == 'warcinfo:foo.warc.gz' assert surt.surt("warcinfo:foo.warc.gz", with_scheme=True, trailing_comma=True) == 'warcinfo:foo.warc.gz' + +def test_options(): + assert surt.IAURLCanonicalizer.canonicalize(handyurl.parse('http://example.com/foo?X=Y')).getURLString() == 'http://example.com/foo?x=y' + assert 
surt.IAURLCanonicalizer.canonicalize(handyurl.parse('http://example.com/foo?X=Y'), query_lowercase=False).getURLString() == 'http://example.com/foo?X=Y' + assert surt.DefaultIAURLCanonicalizer.canonicalize(handyurl.parse('http://example.com/foo?X=Y')).getURLString() == 'http://example.com/foo?x=y' + assert surt.DefaultIAURLCanonicalizer.canonicalize(handyurl.parse('http://example.com/foo?X=Y'), query_lowercase=False).getURLString() == 'http://example.com/foo?X=Y' From 3bcf8ffaea8edf067bb595a109efac582456d47e Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 22 Mar 2016 01:10:04 +0000 Subject: [PATCH 30/32] port new fix: https://github.com/internetarchive/webarchive-commons/pull/17/commits/0cc72a1c3d1db464d27a3fe8d89e4138e28be171 -- Make canonicalizer be able to strip session id params even if they are the first params in the query string. And add session id strip test. --- surt/IAURLCanonicalizer.py | 31 +++++++++++++++++++++++++------ surt/URLRegexTransformer.py | 20 ++++++++++---------- 2 files changed, 35 insertions(+), 16 deletions(-) diff --git a/surt/IAURLCanonicalizer.py b/surt/IAURLCanonicalizer.py index 0603d6a..da721e7 100644 --- a/surt/IAURLCanonicalizer.py +++ b/surt/IAURLCanonicalizer.py @@ -41,6 +41,27 @@ def canonicalize(url, host_lowercase=True, host_massage=True, query_strip_empty=True, query_alpha_reorder=True, hash_strip=True, **_ignored): """The input url is a handyurl instance + + These doctests are from IAURLCanonicalizerTest.java: + + >>> canonicalize(handyurl.parse("http://ARCHIVE.ORG/")).getURLString() + 'http://archive.org/' + >>> canonicalize(handyurl.parse("http://www.archive.org:80/")).getURLString() + 'http://archive.org/' + >>> canonicalize(handyurl.parse("https://www.archive.org:80/")).getURLString() + 'https://archive.org:80/' + >>> canonicalize(handyurl.parse("http://www.archive.org:443/")).getURLString() + 'http://archive.org:443/' + >>> canonicalize(handyurl.parse("https://www.archive.org:443/")).getURLString() + 
'https://archive.org/' + >>> canonicalize(handyurl.parse("http://www.archive.org/big/")).getURLString() + 'http://archive.org/big' + >>> canonicalize(handyurl.parse("dns:www.archive.org")).getURLString() + 'dns:www.archive.org' + >>> canonicalize(handyurl.parse("http://www.nsf.gov/statistics/sed/2009/SED_2009.zip?CFID=14387305&CFTOKEN=72942008&jsessionid=f030eacc7e49c4ca0b077922347418418766")).getURLString() + 'http://nsf.gov/statistics/sed/2009/sed_2009.zip?jsessionid=f030eacc7e49c4ca0b077922347418418766' + >>> canonicalize(handyurl.parse("http://www.nsf.gov/statistics/sed/2009/SED_2009.zip?CFID=14387305&CFTOKEN=72942008")).getURLString() + 'http://nsf.gov/statistics/sed/2009/sed_2009.zip' """ if host_lowercase and url.host: url.host = url.host.lower() @@ -77,17 +98,15 @@ def canonicalize(url, host_lowercase=True, host_massage=True, query = url.query if query: - if '' == query and query_strip_empty: - query = None - elif len(query) > 0: + if len(query) > 0: if query_strip_session_id: - #This function expects the query to start with a '?' - query = stripQuerySessionID('?'+query) - query = query[1:] #now strip off '?' 
that we just added + query = stripQuerySessionID(query) if query_lowercase: query = query.lower() if query_alpha_reorder: query = alphaReorderQuery(query) + if '' == query and query_strip_empty: + query = None url.query = query else: if query_strip_empty: diff --git a/surt/URLRegexTransformer.py b/surt/URLRegexTransformer.py index 04f829b..a36fbf4 100644 --- a/surt/URLRegexTransformer.py +++ b/surt/URLRegexTransformer.py @@ -47,23 +47,23 @@ def stripPathSessionID(path): # stripQuerySessionID #_______________________________________________________________________________ _RES_QUERY_SESSIONID = [ - re.compile("^(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I), - re.compile("^(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I), - re.compile("^(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I), - re.compile("^(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", re.I), - re.compile("^(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", re.I), + re.compile("^(.*)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I), + re.compile("^(.*)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I), + re.compile("^(.*)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I), + re.compile("^(.*)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", re.I), + re.compile("^(.*)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", re.I), ] -def stripQuerySessionID(path): +def stripQuerySessionID(query): for pattern in _RES_QUERY_SESSIONID: - m = pattern.match(path) + m = pattern.match(query) if m: if m.group(2): - path = m.group(1) + m.group(2) + query = m.group(1) + m.group(2) else: - path = m.group(1) + query = m.group(1) - return path + return query # hostToSURT From 639735168dfccfc3343ccbbf608ebf177a12dcdf Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 24 May 2016 13:39:17 -0700 Subject: [PATCH 31/32] update for tldextract 2.0 --- setup.py | 2 +- surt/handyurl.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 72400f8..a0fb691 100644 --- a/setup.py +++ 
b/setup.py @@ -27,7 +27,7 @@ def run_tests(self): zip_safe=True, install_requires=[ 'six', - 'tldextract', + 'tldextract>=2.0', ], provides=[ 'surt' ], packages=[ 'surt' ], diff --git a/surt/handyurl.py b/surt/handyurl.py index 866268c..cfc01b7 100644 --- a/surt/handyurl.py +++ b/surt/handyurl.py @@ -215,9 +215,7 @@ def getPublicSuffix(self): """Uses the tldextract module to get the public suffix via the Public Suffix List. """ - r = tldextract.extract(self.host) - return "%s.%s" % (r.domain, r.tld) - + return tldextract.extract(self.host).registered_domain # getPublicPrefix #___________________________________________________________________________ From 571ab7592b355a6a11c0ee3f6a8f9178d35dbdcb Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 24 May 2016 13:39:41 -0700 Subject: [PATCH 32/32] version 0.3.0 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index a0fb691..e737058 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ def run_tests(self): setup(name='surt', - version='0.3b4', + version='0.3.0', author='rajbot', author_email='raj@archive.org', classifiers=[