From 8da352359a53f81e36f08d3b268fe14bd8231027 Mon Sep 17 00:00:00 2001
From: Dave Buchfuhrer
Date: Mon, 21 Dec 2015 19:10:18 -0800
Subject: [PATCH] Unescapes html in PageParser.href_match_to_url

PageParser breaks if the links contain any escaped characters. This fixes
that bug.
---
 pex/crawler.py        | 11 ++++++++++-
 tests/test_crawler.py |  6 ++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/pex/crawler.py b/pex/crawler.py
index 430025d64..00b70fb73 100644
--- a/pex/crawler.py
+++ b/pex/crawler.py
@@ -22,6 +22,15 @@
   from urlparse import urlparse
 
 
+def unescape(s):
+  """Unescapes html. Taken from https://wiki.python.org/moin/EscapingHtml"""
+  s = s.replace("&lt;", "<")
+  s = s.replace("&gt;", ">")
+  # this has to be last:
+  s = s.replace("&amp;", "&")
+  return s
+
+
 class PageParser(object):
   """A helper class to extract and differentiate ordinary and download links from webpages."""
 
@@ -34,7 +43,7 @@ class PageParser(object):
   def href_match_to_url(cls, match):
     def pick(group):
       return '' if group is None else group
-    return pick(match.group(1)) or pick(match.group(2)) or pick(match.group(3))
+    return unescape(pick(match.group(1)) or pick(match.group(2)) or pick(match.group(3)))
 
   @classmethod
   def rel_links(cls, page):
diff --git a/tests/test_crawler.py b/tests/test_crawler.py
index 92935539e..3fc44ebec 100644
--- a/tests/test_crawler.py
+++ b/tests/test_crawler.py
@@ -46,6 +46,12 @@ def test_page_parser_basic():
       assert lpp("<a href='stuff'> <a href=%s>" % target) == (['stuff', href], [])
 
 
+def test_page_parser_escaped_html():
+  url = 'url?param1=val&param2=val2'
+  link = 'a href="%s"' % url.replace('&', '&amp;')
+  assert lpp(link) == ([url], [])
+
+
 def test_page_parser_rels():
   VALID_RELS = tuple(PageParser.REL_TYPES)
   for rel in VALID_RELS + ('', ' ', 'blah'):