Merge remote-tracking branch 'nlevitt/hurl-bytes' into hurl-bytes-merge

internetarchive · Jun 1, 2017 · 6b8e656 · 6b8e656
2 parents 7aaf758 + 08806b8
commit 6b8e656
Show file tree

Hide file tree

Showing 6 changed files with 268 additions and 205 deletions.
diff --git a/setup.py b/setup.py
@@ -15,7 +15,7 @@ def run_tests(self):
 
 
 setup(name='surt',
-      version='0.3.0',
+      version='0.3.1b1',
       author='rajbot',
       author_email='raj@archive.org',
       classifiers=[

diff --git a/surt/GoogleURLCanonicalizer.py b/surt/GoogleURLCanonicalizer.py
@@ -33,10 +33,13 @@
 
 from surt.handyurl import handyurl
 
-from six.moves.urllib.parse import quote, unquote
+try:
+    from urllib.parse import quote_from_bytes, unquote_to_bytes
+except:
+    from urllib import quote as quote_from_bytes, unquote as unquote_to_bytes
 from six import text_type, binary_type
 
-# unescapeRepeatedly()
+# canonicalize()
 #_______________________________________________________________________________
 def canonicalize(url, **_ignored):
     url.hash = None
@@ -48,31 +51,16 @@ def canonicalize(url, **_ignored):
         url.query = minimalEscape(url.query)
 
     if url.host:
-        hostE = unescapeRepeatedly(url.host)
-
-        # if the host was an ascii string of percent-encoded bytes that represent
-        # non-ascii unicode chars, then promote hostE from str to unicode.
-        # e.g. "http://www.t%EF%BF%BD%04.82.net/", which contains the unicode replacement char
-        if isinstance(hostE, binary_type):
+        host = unescapeRepeatedly(url.host)
+        try:
+            host.decode('ascii')
+        except UnicodeDecodeError:
             try:
-                hostE.decode('ascii')
-            except UnicodeDecodeError:
-                hostE = hostE.decode('utf-8', 'ignore')
+                host = host.decode('utf-8', 'ignore').encode('idna')
+            except ValueError:
+                pass
 
-
-        host = None
-        try:
-            # Note: I copied the use of the ToASCII(hostE) from
-            # the java code. This function implements RFC3490, which
-            # requires that each component of the hostname (i.e. each label)
-            # be encodeced separately, and doesn't work correctly with
-            # full hostnames. So use 'idna' encoding instead.
-            #host = encodings.idna.ToASCII(hostE)
-            host = hostE.encode('idna').decode('utf-8')
-        except ValueError:
-            host = hostE
-
-        host = host.replace('..', '.').strip('.')
+        host = host.replace(b'..', b'.').strip(b'.')
 
         ip = attemptIPFormats(host)
         if ip:
@@ -95,21 +83,21 @@ def canonicalize(url, **_ignored):
 
 def normalizePath(path):
     if not path:
-        return '/'
+        return b'/'
 
     #gives an empty trailing element if path ends with '/':
-    paths       = path.split('/')
+    paths       = path.split(b'/')
     keptPaths   = []
     first       = True
 
     for p in paths:
         if first:
             first = False
             continue
-        elif '.' == p:
+        elif b'.' == p:
             # skip
             continue
-        elif '..' == p:
+        elif b'..' == p:
             #pop the last path, if present:
             if len(keptPaths) > 0:
                 keptPaths = keptPaths[:-1]
@@ -119,7 +107,7 @@ def normalizePath(path):
         else:
             keptPaths.append(p)
 
-    path = '/'
+    path = b'/'
 
     # If the path ends in '/', then the last element of keptPaths will be ''
     # Since we add a trailing '/' after the second-to-last element of keptPaths
@@ -130,13 +118,13 @@ def normalizePath(path):
             p = keptPaths[i]
             if len(p) > 0:
                 #this will omit multiple slashes:
-                path += p + '/'
+                path += p + b'/'
         path += keptPaths[numKept-1]
 
     return path
 
-OCTAL_IP = re.compile(r"^(0[0-7]*)(\.[0-7]+)?(\.[0-7]+)?(\.[0-7]+)?$")
-DECIMAL_IP = re.compile(r"^([1-9][0-9]*)(\.[0-9]+)?(\.[0-9]+)?(\.[0-9]+)?$")
+OCTAL_IP = re.compile(br"^(0[0-7]*)(\.[0-7]+)?(\.[0-7]+)?(\.[0-7]+)?$")
+DECIMAL_IP = re.compile(br"^([1-9][0-9]*)(\.[0-9]+)?(\.[0-9]+)?(\.[0-9]+)?$")
 
 # attemptIPFormats()
 #_______________________________________________________________________________
@@ -146,19 +134,20 @@ def attemptIPFormats(host):
 
     if host.isdigit():
         #mask hostname to lower four bytes to workaround issue with liveweb arc files
-        return socket.inet_ntoa(struct.pack('>L', int(host) & 0xffffffff))
+        return socket.inet_ntoa(
+                struct.pack('>L', int(host) & 0xffffffff)).encode('ascii')
     else:
         m = DECIMAL_IP.match(host)
         if m:
             try:
-                return socket.gethostbyname_ex(host)[2][0]
+                return socket.gethostbyname_ex(host)[2][0].encode('ascii')
             except (socket.gaierror, socket.herror):
                 return None
         else:
             m = OCTAL_IP.match(host)
             if m:
                 try:
-                    return socket.gethostbyname_ex(host)[2][0]
+                    return socket.gethostbyname_ex(host)[2][0].encode('ascii')
                 except socket.gaierror:
                     return None
 
@@ -175,25 +164,22 @@ def minimalEscape(input):
 def escapeOnce(input):
     """escape everything outside of 32-128, except #"""
     if input:
-        # If input is a unicode type, we need to chose an encoding before
-        # percent encoding, since different encodings of the same unicode
-        # characters will result in different surts.
-        # We will use utf-8 for consistency.
-        if isinstance(input, text_type):
-            input = input.encode('utf-8')
-        return quote(input, """!"$&'()*+,-./:;<=>?@[\]^_`{|}~""")
+        return quote_from_bytes(
+                input, safe=b'''!"$&'()*+,-./:;<=>?@[\]^_`{|}~''').encode(
+                        'ascii')
     else:
         return input
 
 
 # unescapeRepeatedly()
 #_______________________________________________________________________________
 def unescapeRepeatedly(input):
+    '''Argument may be str or bytes. Returns bytes.'''
     if None == input:
         return None
 
     while True:
-        un = unquote(input)
+        un = unquote_to_bytes(input)
         if un == input:
             return input
         input = un

diff --git a/surt/IAURLCanonicalizer.py b/surt/IAURLCanonicalizer.py
@@ -40,33 +40,11 @@ def canonicalize(url, host_lowercase=True, host_massage=True,
                  query_lowercase=True, query_strip_session_id=True,
                  query_strip_empty=True, query_alpha_reorder=True,
                  hash_strip=True, **_ignored):
-    """The input url is a handyurl instance
-
-    These doctests are from IAURLCanonicalizerTest.java:
-
-    >>> canonicalize(handyurl.parse("http://ARCHIVE.ORG/")).getURLString()
-    'http://archive.org/'
-    >>> canonicalize(handyurl.parse("http://www.archive.org:80/")).getURLString()
-    'http://archive.org/'
-    >>> canonicalize(handyurl.parse("https://www.archive.org:80/")).getURLString()
-    'https://archive.org:80/'
-    >>> canonicalize(handyurl.parse("http://www.archive.org:443/")).getURLString()
-    'http://archive.org:443/'
-    >>> canonicalize(handyurl.parse("https://www.archive.org:443/")).getURLString()
-    'https://archive.org/'
-    >>> canonicalize(handyurl.parse("http://www.archive.org/big/")).getURLString()
-    'http://archive.org/big'
-    >>> canonicalize(handyurl.parse("dns:www.archive.org")).getURLString()
-    'dns:www.archive.org'
-    >>> canonicalize(handyurl.parse("http://www.nsf.gov/statistics/sed/2009/SED_2009.zip?CFID=14387305&CFTOKEN=72942008&jsessionid=f030eacc7e49c4ca0b077922347418418766")).getURLString()
-    'http://nsf.gov/statistics/sed/2009/sed_2009.zip?jsessionid=f030eacc7e49c4ca0b077922347418418766'
-    >>> canonicalize(handyurl.parse("http://www.nsf.gov/statistics/sed/2009/SED_2009.zip?CFID=14387305&CFTOKEN=72942008")).getURLString()
-    'http://nsf.gov/statistics/sed/2009/sed_2009.zip'
-    """
+    """The input url is a handyurl instance"""
     if host_lowercase and url.host:
         url.host = url.host.lower()
 
-    if host_massage and url.host and (url.scheme != 'dns'): ###java version calls massageHost regardless of scheme
+    if host_massage and url.host and (url.scheme != b'dns'): ###java version calls massageHost regardless of scheme
         url.host = massageHost(url.host)
 
     if auth_strip_user:
@@ -81,17 +59,17 @@ def canonicalize(url, host_lowercase=True, host_massage=True,
             url.port = handyurl.DEFAULT_PORT
 
     path = url.path
-    if path_strip_empty and '/' == path:
+    if path_strip_empty and b'/' == path:
         url.path = None
     else:
         if path_lowercase and path:
             path = path.lower()
         if path_strip_session_id and path:
             path = stripPathSessionID(path)
-        if path_strip_empty and '/' == path:
+        if path_strip_empty and b'/' == path:
             path = None
         if path_strip_trailing_slash_unless_empty and path:
-            if path.endswith("/") and len(path)>1:
+            if path.endswith(b'/') and len(path)>1:
                 path = path[:-1]
 
         url.path = path
@@ -105,7 +83,7 @@ def canonicalize(url, host_lowercase=True, host_massage=True,
                 query = query.lower()
             if query_alpha_reorder:
                 query = alphaReorderQuery(query)
-        if '' == query and query_strip_empty:
+        if b'' == query and query_strip_empty:
             query = None
         url.query = query
     else:
@@ -130,23 +108,23 @@ def alphaReorderQuery(orig):
     if len(orig) <= 1:
         return orig
 
-    args = orig.split("&")
-    qas = [tuple(arg.split('=', 1)) for arg in args]
+    args = orig.split(b'&')
+    qas = [tuple(arg.split(b'=', 1)) for arg in args]
     qas.sort()
 
-    s = ''
+    s = b''
     for t in qas:
         if 1 == len(t):
-            s += t[0] + '&'
+            s += t[0] + b'&'
         else:
-            s += t[0] + '=' + t[1] + '&'
+            s += t[0] + b'=' + t[1] + b'&'
 
     return s[:-1] #remove last &
 
 
 # massageHost()
 #_______________________________________________________________________________
-_RE_WWWDIGITS = re.compile('www\d*\.')
+_RE_WWWDIGITS = re.compile(b'www\d*\.')
 
 def massageHost(host):
     m = _RE_WWWDIGITS.match(host)
@@ -159,9 +137,9 @@ def massageHost(host):
 #_______________________________________________________________________________
 def getDefaultPort(scheme):
     scheme_lower = scheme.lower()
-    if 'http' == scheme_lower:
+    if b'http' == scheme_lower:
         return 80
-    elif 'https' == scheme_lower:
+    elif b'https' == scheme_lower:
         return 443
     else:
         return 0

diff --git a/surt/URLRegexTransformer.py b/surt/URLRegexTransformer.py
@@ -28,8 +28,8 @@
 # stripPathSessionID
 #_______________________________________________________________________________
 _RES_PATH_SESSIONID = [
-    re.compile("^(.*/)(\((?:[a-z]\([0-9a-z]{24}\))+\)/)([^\?]+\.aspx.*)$", re.I),
-    re.compile("^(.*/)(\\([0-9a-z]{24}\\)/)([^\\?]+\\.aspx.*)$", re.I),
+    re.compile(b"^(.*/)(\((?:[a-z]\([0-9a-z]{24}\))+\)/)([^\?]+\.aspx.*)$", re.I),
+    re.compile(b"^(.*/)(\\([0-9a-z]{24}\\)/)([^\\?]+\\.aspx.*)$", re.I),
     ]
 
 def stripPathSessionID(path):
@@ -47,11 +47,11 @@ def stripPathSessionID(path):
 # stripQuerySessionID
 #_______________________________________________________________________________
 _RES_QUERY_SESSIONID = [
-    re.compile("^(.*)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I),
-    re.compile("^(.*)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I),
-    re.compile("^(.*)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I),
-    re.compile("^(.*)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", re.I),
-    re.compile("^(.*)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", re.I),
+    re.compile(b"^(.*)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I),
+    re.compile(b"^(.*)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I),
+    re.compile(b"^(.*)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I),
+    re.compile(b"^(.*)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", re.I),
+    re.compile(b"^(.*)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", re.I),
     ]
 
 def stripQuerySessionID(query):
@@ -68,12 +68,12 @@ def stripQuerySessionID(query):
 
 # hostToSURT
 #_______________________________________________________________________________
-_RE_IP_ADDRESS = re.compile(r"(?:(?:\d{1,3}\.){3}\d{1,3})$")
+_RE_IP_ADDRESS = re.compile(br"(?:(?:\d{1,3}\.){3}\d{1,3})$")
 
 def hostToSURT(host, reverse_ipaddr=True):
     if not reverse_ipaddr and _RE_IP_ADDRESS.match(host):
         return host
 
-    parts = host.split('.')
+    parts = host.split(b'.')
     parts.reverse()
-    return ','.join(parts)
+    return b','.join(parts)