From 8da352359a53f81e36f08d3b268fe14bd8231027 Mon Sep 17 00:00:00 2001
From: Dave Buchfuhrer <buck@houzz.com>
Date: Mon, 21 Dec 2015 19:10:18 -0800
Subject: [PATCH] Unescapes html in PageParser.href_match_to_url

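PageParser returned href values verbatim, so links containing HTML-escaped
characters (e.g. `&amp;` in a query string) produced incorrect URLs. This
unescapes `&lt;`, `&gt;` and `&amp;` in href_match_to_url and adds a test.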
---
 pex/crawler.py        | 11 ++++++++++-
 tests/test_crawler.py |  6 ++++++
 2 files changed, 16 insertions(+), 1 deletion(-)
diff --git a/pex/crawler.py b/pex/crawler.py
index 430025d64..00b70fb73 100644
--- a/pex/crawler.py
+++ b/pex/crawler.py
@@ -22,6 +22,15 @@
   from urlparse import urlparse
 
 
+def unescape(s):
+  """Unescapes &lt;, &gt; and &amp; in s. Taken from https://wiki.python.org/moin/EscapingHtml"""
+  s = s.replace("&lt;", "<")
+  s = s.replace("&gt;", ">")
+  # &amp; must go last, otherwise "&amp;lt;" would be unescaped twice and become "<":
+  s = s.replace("&amp;", "&")
+  return s
+
+
 class PageParser(object):
   """A helper class to extract and differentiate ordinary and download links from webpages."""
 
@@ -34,7 +43,7 @@ class PageParser(object):
   def href_match_to_url(cls, match):
     def pick(group):
       return '' if group is None else group
-    return pick(match.group(1)) or pick(match.group(2)) or pick(match.group(3))
+    return unescape(pick(match.group(1)) or pick(match.group(2)) or pick(match.group(3)))
 
   @classmethod
   def rel_links(cls, page):
diff --git a/tests/test_crawler.py b/tests/test_crawler.py
index 92935539e..3fc44ebec 100644
--- a/tests/test_crawler.py
+++ b/tests/test_crawler.py
@@ -46,6 +46,12 @@ def test_page_parser_basic():
       assert lpp("<a href='stuff'> <a href=%s>" % target) == (['stuff', href], [])
 
 
+def test_page_parser_escaped_html():
+  url = 'url?param1=val&param2=val2'
+  link = 'a href="%s"' % url.replace('&', '&amp;')
+  assert lpp(link) == ([url], [])
+
+
 def test_page_parser_rels():
   VALID_RELS = tuple(PageParser.REL_TYPES)
   for rel in VALID_RELS + ('', ' ', 'blah'):