Skip to content

Commit

Permalink
Merge remote-tracking branch 'nlevitt/hurl-bytes' into hurl-bytes-merge
Browse files Browse the repository at this point in the history
  • Loading branch information
kngenie committed Jun 1, 2017
2 parents 7aaf758 + 08806b8 commit 6b8e656
Show file tree
Hide file tree
Showing 6 changed files with 268 additions and 205 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def run_tests(self):


setup(name='surt',
version='0.3.0',
version='0.3.1b1',
author='rajbot',
author_email='[email protected]',
classifiers=[
Expand Down
74 changes: 30 additions & 44 deletions surt/GoogleURLCanonicalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,13 @@

from surt.handyurl import handyurl

from six.moves.urllib.parse import quote, unquote
try:
from urllib.parse import quote_from_bytes, unquote_to_bytes
except:
from urllib import quote as quote_from_bytes, unquote as unquote_to_bytes
from six import text_type, binary_type

# unescapeRepeatedly()
# canonicalize()
#_______________________________________________________________________________
def canonicalize(url, **_ignored):
url.hash = None
Expand All @@ -48,31 +51,16 @@ def canonicalize(url, **_ignored):
url.query = minimalEscape(url.query)

if url.host:
hostE = unescapeRepeatedly(url.host)

# if the host was an ascii string of percent-encoded bytes that represent
# non-ascii unicode chars, then promote hostE from str to unicode.
# e.g. "http://www.t%EF%BF%BD%04.82.net/", which contains the unicode replacement char
if isinstance(hostE, binary_type):
host = unescapeRepeatedly(url.host)
try:
host.decode('ascii')
except UnicodeDecodeError:
try:
hostE.decode('ascii')
except UnicodeDecodeError:
hostE = hostE.decode('utf-8', 'ignore')
host = host.decode('utf-8', 'ignore').encode('idna')
except ValueError:
pass


host = None
try:
# Note: I copied the use of the ToASCII(hostE) from
# the java code. This function implements RFC3490, which
# requires that each component of the hostname (i.e. each label)
# be encoded separately, and doesn't work correctly with
# full hostnames. So use 'idna' encoding instead.
#host = encodings.idna.ToASCII(hostE)
host = hostE.encode('idna').decode('utf-8')
except ValueError:
host = hostE

host = host.replace('..', '.').strip('.')
host = host.replace(b'..', b'.').strip(b'.')

ip = attemptIPFormats(host)
if ip:
Expand All @@ -95,21 +83,21 @@ def canonicalize(url, **_ignored):

def normalizePath(path):
if not path:
return '/'
return b'/'

#gives an empty trailing element if path ends with '/':
paths = path.split('/')
paths = path.split(b'/')
keptPaths = []
first = True

for p in paths:
if first:
first = False
continue
elif '.' == p:
elif b'.' == p:
# skip
continue
elif '..' == p:
elif b'..' == p:
#pop the last path, if present:
if len(keptPaths) > 0:
keptPaths = keptPaths[:-1]
Expand All @@ -119,7 +107,7 @@ def normalizePath(path):
else:
keptPaths.append(p)

path = '/'
path = b'/'

# If the path ends in '/', then the last element of keptPaths will be ''
# Since we add a trailing '/' after the second-to-last element of keptPaths
Expand All @@ -130,13 +118,13 @@ def normalizePath(path):
p = keptPaths[i]
if len(p) > 0:
#this will omit multiple slashes:
path += p + '/'
path += p + b'/'
path += keptPaths[numKept-1]

return path

OCTAL_IP = re.compile(r"^(0[0-7]*)(\.[0-7]+)?(\.[0-7]+)?(\.[0-7]+)?$")
DECIMAL_IP = re.compile(r"^([1-9][0-9]*)(\.[0-9]+)?(\.[0-9]+)?(\.[0-9]+)?$")
OCTAL_IP = re.compile(br"^(0[0-7]*)(\.[0-7]+)?(\.[0-7]+)?(\.[0-7]+)?$")
DECIMAL_IP = re.compile(br"^([1-9][0-9]*)(\.[0-9]+)?(\.[0-9]+)?(\.[0-9]+)?$")

# attemptIPFormats()
#_______________________________________________________________________________
Expand All @@ -146,19 +134,20 @@ def attemptIPFormats(host):

if host.isdigit():
#mask hostname to lower four bytes to work around an issue with liveweb arc files
return socket.inet_ntoa(struct.pack('>L', int(host) & 0xffffffff))
return socket.inet_ntoa(
struct.pack('>L', int(host) & 0xffffffff)).encode('ascii')
else:
m = DECIMAL_IP.match(host)
if m:
try:
return socket.gethostbyname_ex(host)[2][0]
return socket.gethostbyname_ex(host)[2][0].encode('ascii')
except (socket.gaierror, socket.herror):
return None
else:
m = OCTAL_IP.match(host)
if m:
try:
return socket.gethostbyname_ex(host)[2][0]
return socket.gethostbyname_ex(host)[2][0].encode('ascii')
except socket.gaierror:
return None

Expand All @@ -175,25 +164,22 @@ def minimalEscape(input):
def escapeOnce(input):
"""escape everything outside of 32-128, except #"""
if input:
# If input is a unicode type, we need to choose an encoding before
# percent encoding, since different encodings of the same unicode
# characters will result in different surts.
# We will use utf-8 for consistency.
if isinstance(input, text_type):
input = input.encode('utf-8')
return quote(input, """!"$&'()*+,-./:;<=>?@[\]^_`{|}~""")
return quote_from_bytes(
input, safe=b'''!"$&'()*+,-./:;<=>?@[\]^_`{|}~''').encode(
'ascii')
else:
return input


# unescapeRepeatedly()
#_______________________________________________________________________________
def unescapeRepeatedly(input):
'''Argument may be str or bytes. Returns bytes.'''
if None == input:
return None

while True:
un = unquote(input)
un = unquote_to_bytes(input)
if un == input:
return input
input = un
Expand Down
50 changes: 14 additions & 36 deletions surt/IAURLCanonicalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,33 +40,11 @@ def canonicalize(url, host_lowercase=True, host_massage=True,
query_lowercase=True, query_strip_session_id=True,
query_strip_empty=True, query_alpha_reorder=True,
hash_strip=True, **_ignored):
"""The input url is a handyurl instance
These doctests are from IAURLCanonicalizerTest.java:
>>> canonicalize(handyurl.parse("http://ARCHIVE.ORG/")).getURLString()
'http://archive.org/'
>>> canonicalize(handyurl.parse("http://www.archive.org:80/")).getURLString()
'http://archive.org/'
>>> canonicalize(handyurl.parse("https://www.archive.org:80/")).getURLString()
'https://archive.org:80/'
>>> canonicalize(handyurl.parse("http://www.archive.org:443/")).getURLString()
'http://archive.org:443/'
>>> canonicalize(handyurl.parse("https://www.archive.org:443/")).getURLString()
'https://archive.org/'
>>> canonicalize(handyurl.parse("http://www.archive.org/big/")).getURLString()
'http://archive.org/big'
>>> canonicalize(handyurl.parse("dns:www.archive.org")).getURLString()
'dns:www.archive.org'
>>> canonicalize(handyurl.parse("http://www.nsf.gov/statistics/sed/2009/SED_2009.zip?CFID=14387305&CFTOKEN=72942008&jsessionid=f030eacc7e49c4ca0b077922347418418766")).getURLString()
'http://nsf.gov/statistics/sed/2009/sed_2009.zip?jsessionid=f030eacc7e49c4ca0b077922347418418766'
>>> canonicalize(handyurl.parse("http://www.nsf.gov/statistics/sed/2009/SED_2009.zip?CFID=14387305&CFTOKEN=72942008")).getURLString()
'http://nsf.gov/statistics/sed/2009/sed_2009.zip'
"""
"""The input url is a handyurl instance"""
if host_lowercase and url.host:
url.host = url.host.lower()

if host_massage and url.host and (url.scheme != 'dns'): ###java version calls massageHost regardless of scheme
if host_massage and url.host and (url.scheme != b'dns'): ###java version calls massageHost regardless of scheme
url.host = massageHost(url.host)

if auth_strip_user:
Expand All @@ -81,17 +59,17 @@ def canonicalize(url, host_lowercase=True, host_massage=True,
url.port = handyurl.DEFAULT_PORT

path = url.path
if path_strip_empty and '/' == path:
if path_strip_empty and b'/' == path:
url.path = None
else:
if path_lowercase and path:
path = path.lower()
if path_strip_session_id and path:
path = stripPathSessionID(path)
if path_strip_empty and '/' == path:
if path_strip_empty and b'/' == path:
path = None
if path_strip_trailing_slash_unless_empty and path:
if path.endswith("/") and len(path)>1:
if path.endswith(b'/') and len(path)>1:
path = path[:-1]

url.path = path
Expand All @@ -105,7 +83,7 @@ def canonicalize(url, host_lowercase=True, host_massage=True,
query = query.lower()
if query_alpha_reorder:
query = alphaReorderQuery(query)
if '' == query and query_strip_empty:
if b'' == query and query_strip_empty:
query = None
url.query = query
else:
Expand All @@ -130,23 +108,23 @@ def alphaReorderQuery(orig):
if len(orig) <= 1:
return orig

args = orig.split("&")
qas = [tuple(arg.split('=', 1)) for arg in args]
args = orig.split(b'&')
qas = [tuple(arg.split(b'=', 1)) for arg in args]
qas.sort()

s = ''
s = b''
for t in qas:
if 1 == len(t):
s += t[0] + '&'
s += t[0] + b'&'
else:
s += t[0] + '=' + t[1] + '&'
s += t[0] + b'=' + t[1] + b'&'

return s[:-1] #remove last &


# massageHost()
#_______________________________________________________________________________
_RE_WWWDIGITS = re.compile('www\d*\.')
_RE_WWWDIGITS = re.compile(b'www\d*\.')

def massageHost(host):
m = _RE_WWWDIGITS.match(host)
Expand All @@ -159,9 +137,9 @@ def massageHost(host):
#_______________________________________________________________________________
def getDefaultPort(scheme):
scheme_lower = scheme.lower()
if 'http' == scheme_lower:
if b'http' == scheme_lower:
return 80
elif 'https' == scheme_lower:
elif b'https' == scheme_lower:
return 443
else:
return 0
Expand Down
20 changes: 10 additions & 10 deletions surt/URLRegexTransformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@
# stripPathSessionID
#_______________________________________________________________________________
_RES_PATH_SESSIONID = [
re.compile("^(.*/)(\((?:[a-z]\([0-9a-z]{24}\))+\)/)([^\?]+\.aspx.*)$", re.I),
re.compile("^(.*/)(\\([0-9a-z]{24}\\)/)([^\\?]+\\.aspx.*)$", re.I),
re.compile(b"^(.*/)(\((?:[a-z]\([0-9a-z]{24}\))+\)/)([^\?]+\.aspx.*)$", re.I),
re.compile(b"^(.*/)(\\([0-9a-z]{24}\\)/)([^\\?]+\\.aspx.*)$", re.I),
]

def stripPathSessionID(path):
Expand All @@ -47,11 +47,11 @@ def stripPathSessionID(path):
# stripQuerySessionID
#_______________________________________________________________________________
_RES_QUERY_SESSIONID = [
re.compile("^(.*)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I),
re.compile("^(.*)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I),
re.compile("^(.*)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I),
re.compile("^(.*)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", re.I),
re.compile("^(.*)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", re.I),
re.compile(b"^(.*)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I),
re.compile(b"^(.*)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I),
re.compile(b"^(.*)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I),
re.compile(b"^(.*)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", re.I),
re.compile(b"^(.*)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", re.I),
]

def stripQuerySessionID(query):
Expand All @@ -68,12 +68,12 @@ def stripQuerySessionID(query):

# hostToSURT
#_______________________________________________________________________________
_RE_IP_ADDRESS = re.compile(r"(?:(?:\d{1,3}\.){3}\d{1,3})$")
_RE_IP_ADDRESS = re.compile(br"(?:(?:\d{1,3}\.){3}\d{1,3})$")

def hostToSURT(host, reverse_ipaddr=True):
if not reverse_ipaddr and _RE_IP_ADDRESS.match(host):
return host

parts = host.split('.')
parts = host.split(b'.')
parts.reverse()
return ','.join(parts)
return b','.join(parts)
Loading

0 comments on commit 6b8e656

Please sign in to comment.