Memoize calls to Crawler.crawl() for performance win in find-links based resolution.
kwlzn committed Dec 17, 2015
1 parent 1e70fdd commit 4f0e54e
Showing 2 changed files with 86 additions and 6 deletions.
30 changes: 27 additions & 3 deletions pex/crawler.py
@@ -12,6 +12,7 @@
from .http import Context
from .link import Link
from .tracer import TRACER
from .util import Memoizer

if PY3:
from queue import Empty, Queue
@@ -64,6 +65,14 @@ def partition(L, pred):
class Crawler(object):
"""A multi-threaded crawler that supports local (disk) and remote (web) crawling."""

# Memoizer for calls to Crawler.crawl().
_CRAWL_CACHE = Memoizer()

@classmethod
def reset_cache(cls):
"""Reset the internal crawl cache. This is intended primarily for tests."""
cls._CRAWL_CACHE = Memoizer()

@classmethod
def crawl_local(cls, link):
try:
@@ -99,7 +108,22 @@ def __init__(self, context=None, threads=1):
self._threads = threads
self.context = context or Context.get()

def _make_cache_key(self, links, follow_links):
return (follow_links,) + tuple(links)

def crawl(self, link_or_links, follow_links=False):
links = list(Link.wrap_iterable(link_or_links))
cache_key = self._make_cache_key(links, follow_links)

# Memoize crawling to a global Memoizer (Crawler._CRAWL_CACHE).
result = self._CRAWL_CACHE.get(cache_key)
if result is None:
result = self._crawl(links, follow_links)
self._CRAWL_CACHE.store(cache_key, result)

return result

def _crawl(self, link_or_links, follow_links):
links, seen = set(), set()
queue = Queue()
converged = threading.Event()
@@ -127,7 +151,8 @@ def execute():
queue.put(rel)
queue.task_done()

for link in Link.wrap_iterable(link_or_links):
for i, link in enumerate(link_or_links):
TRACER.log('crawling link i=%s link=%s follow_links=%s' % (i, link, follow_links), V=3)
queue.put(link)

workers = []
@@ -140,6 +165,5 @@ def execute():
queue.join()
converged.set()

# We deliberately not join back the worker threads, since they are no longer of
# any use to us.
# We deliberately do not join the worker threads, since they are no longer of any use to us.
return links
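
Note: the Memoizer used for _CRAWL_CACHE lives in pex/util.py and is not part of this diff; the new crawl() code only depends on it exposing get() and store(). Below is a minimal dict-backed sketch of that assumed interface (the lock is illustrative, not necessarily how pex.util implements it):

import threading

class Memoizer(object):
  """Sketch of the get()/store() cache interface that Crawler._CRAWL_CACHE relies on."""

  def __init__(self):
    self._lock = threading.Lock()
    self._data = {}

  def get(self, key, default=None):
    # Return the previously stored value for key, or default on a cache miss.
    with self._lock:
      return self._data.get(key, default)

  def store(self, key, value):
    # Record value so subsequent get(key) calls return it.
    with self._lock:
      self._data[key] = value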
62 changes: 59 additions & 3 deletions tests/test_crawler.py
@@ -6,8 +6,14 @@
from twitter.common.contextutil import temporary_dir

from pex.crawler import Crawler, PageParser
from pex.http import Context
from pex.link import Link

try:
from unittest import mock
except ImportError:
import mock


def lpp(page):
links = PageParser.links(page)
@@ -100,6 +106,56 @@ def test_crawler_unknown_scheme():
  assert Crawler().crawl('ftp://ftp.cdrom.com') == (set(), set())


# TODO(wickman)
# test remote http crawling via mock
# test page decoding via mock
MOCK_INDEX_TMPL = '''
<h1>Index of /home/third_party/python</h1>
<table>
<tr>
<td valign="top"><img src="/icons/back.gif" alt="[DIR]"></td>
<td>&nbsp;</td>
<td align="right"> - </td>
<td>&nbsp;</td>
</tr>
%s
</table>
'''

MOCK_INDEX_A = MOCK_INDEX_TMPL % '''
<tr>
<td valign="top"><img src="/icons/compressed.gif" alt="[ ]"></td>
<td><a href="3to2-1.0.tar.gz">3to2-1.0.tar.gz</a></td>
<td align="right">16-Apr-2015 23:18 </td>
<td align="right"> 45K</td>
<td>GZIP compressed docume></td>
</tr>
'''

MOCK_INDEX_B = MOCK_INDEX_TMPL % '''
<tr>
<td valign="top"><img src="/icons/compressed.gif" alt="[ ]"></td>
<td>
<a href="APScheduler-2.1.0.tar.gz">APScheduler-2.1.0.tar.gz</a>
</td>
<td align="right">16-Apr-2015 23:18 </td>
<td align="right"> 41K</td>
<td>GZIP compressed docume></td>
</tr>
'''


def test_crawler_remote():
Crawler.reset_cache()

mock_context = mock.create_autospec(Context, spec_set=True)
mock_context.content.side_effect = [MOCK_INDEX_A, MOCK_INDEX_B, Exception('shouldnt get here')]
expected_output = set([Link('http://url1.test.com/3to2-1.0.tar.gz'),
Link('http://url2.test.com/APScheduler-2.1.0.tar.gz')])

c = Crawler(mock_context)
test_links = [Link('http://url1.test.com'), Link('http://url2.test.com')]
assert c.crawl(test_links) == expected_output

# Test memoization of Crawler.crawl().
assert c.crawl(test_links) == expected_output


# TODO(wickman): test page decoding via mock
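
Because _CRAWL_CACHE is a class attribute, memoized results are shared by every Crawler instance in the process, which is why test_crawler_remote() calls Crawler.reset_cache() up front. A small illustrative sketch of that behavior (the URL and context argument here are hypothetical):

def demo_shared_cache(context):
  # The first crawl fetches the page and populates the class-level cache.
  links = Crawler(context).crawl('http://example.test/index')
  # A brand new Crawler instance with the same arguments hits the cache
  # instead of re-crawling.
  assert Crawler(context).crawl('http://example.test/index') == links
  # Tests that need a fresh crawl must clear the shared cache explicitly.
  Crawler.reset_cache()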
