diff --git a/pex/crawler.py b/pex/crawler.py
index e61e54036..430025d64 100644
--- a/pex/crawler.py
+++ b/pex/crawler.py
@@ -12,6 +12,7 @@
 from .http import Context
 from .link import Link
 from .tracer import TRACER
+from .util import Memoizer
 
 if PY3:
   from queue import Empty, Queue
@@ -64,6 +65,14 @@ def partition(L, pred):
 class Crawler(object):
   """A multi-threaded crawler that supports local (disk) and remote (web) crawling."""
 
+  # Memoizer for calls to Crawler.crawl().
+  _CRAWL_CACHE = Memoizer()
+
+  @classmethod
+  def reset_cache(cls):
+    """Reset the internal crawl cache. This is intended primarily for tests."""
+    cls._CRAWL_CACHE = Memoizer()
+
   @classmethod
   def crawl_local(cls, link):
     try:
@@ -99,7 +108,22 @@ def __init__(self, context=None, threads=1):
     self._threads = threads
     self.context = context or Context.get()
 
+  def _make_cache_key(self, links, follow_links):
+    return (follow_links,) + tuple(links)
+
   def crawl(self, link_or_links, follow_links=False):
+    links = list(Link.wrap_iterable(link_or_links))
+    cache_key = self._make_cache_key(links, follow_links)
+
+    # Memoize crawling to a global Memoizer (Crawler._CRAWL_CACHE).
+    result = self._CRAWL_CACHE.get(cache_key)
+    if result is None:
+      result = self._crawl(links, follow_links)
+      self._CRAWL_CACHE.store(cache_key, result)
+
+    return result
+
+  def _crawl(self, link_or_links, follow_links):
     links, seen = set(), set()
     queue = Queue()
     converged = threading.Event()
@@ -127,7 +151,8 @@ def execute():
                 queue.put(rel)
       queue.task_done()
 
-    for link in Link.wrap_iterable(link_or_links):
+    for i, link in enumerate(link_or_links):
+      TRACER.log('crawling link i=%s link=%s follow_links=%s' % (i, link, follow_links), V=3)
       queue.put(link)
 
     workers = []
@@ -140,6 +165,5 @@ def execute():
     queue.join()
     converged.set()
 
-    # We deliberately not join back the worker threads, since they are no longer of
-    # any use to us.
+    # We deliberately do not join the worker threads, since they are no longer of any use to us.
     return links
diff --git a/tests/test_crawler.py b/tests/test_crawler.py
index 470c4293a..92935539e 100644
--- a/tests/test_crawler.py
+++ b/tests/test_crawler.py
@@ -6,8 +6,14 @@
 from twitter.common.contextutil import temporary_dir
 
 from pex.crawler import Crawler, PageParser
+from pex.http import Context
 from pex.link import Link
 
+try:
+  from unittest import mock
+except ImportError:
+  import mock
+
 
 def lpp(page):
   links = PageParser.links(page)
@@ -100,6 +106,56 @@ def test_crawler_unknown_scheme():
   Crawler().crawl('ftp://ftp.cdrom.com') == (set(), set())
 
 
-# TODO(wickman)
-# test remote http crawling via mock
-# test page decoding via mock
+MOCK_INDEX_TMPL = '''
+<h1>Index of /home/third_party/python</h1>
+<table>
+  <tr>
+    <td valign="top"><img src="/icons/back.gif" alt="[DIR]"></td>
+    <td>&nbsp;</td>
+    <td align="right">  - </td>
+    <td>&nbsp;</td>
+  </tr>
+%s
+</table>
+'''
+
+MOCK_INDEX_A = MOCK_INDEX_TMPL % '''
+  <tr>
+    <td valign="top"><img src="/icons/compressed.gif" alt="[   ]"></td>
+    <td><a href="3to2-1.0.tar.gz">3to2-1.0.tar.gz</a></td>
+    <td align="right">16-Apr-2015 23:18  </td>
+    <td align="right"> 45K</td>
+    <td>GZIP compressed docume></td>
+  </tr>
+'''
+
+MOCK_INDEX_B = MOCK_INDEX_TMPL % '''
+  <tr>
+    <td valign="top"><img src="/icons/compressed.gif" alt="[   ]"></td>
+    <td>
+      <a href="APScheduler-2.1.0.tar.gz">APScheduler-2.1.0.tar.gz</a>
+    </td>
+    <td align="right">16-Apr-2015 23:18  </td>
+    <td align="right"> 41K</td>
+    <td>GZIP compressed docume></td>
+  </tr>
+'''
+
+
+def test_crawler_remote():
+  Crawler.reset_cache()
+
+  mock_context = mock.create_autospec(Context, spec_set=True)
+  mock_context.content.side_effect = [MOCK_INDEX_A, MOCK_INDEX_B, Exception('shouldnt get here')]
+  expected_output = set([Link('http://url1.test.com/3to2-1.0.tar.gz'),
+                         Link('http://url2.test.com/APScheduler-2.1.0.tar.gz')])
+
+  c = Crawler(mock_context)
+  test_links = [Link('http://url1.test.com'), Link('http://url2.test.com')]
+  assert c.crawl(test_links) == expected_output
+
+  # Test memoization of Crawler.crawl().
+  assert c.crawl(test_links) == expected_output
+
+
+# TODO(wickman): test page decoding via mock
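
Note: the patch imports Memoizer from pex.util but its implementation is not part of this diff. The sketch below is only an assumption of what the crawler needs from it: a thread-safe store exposing get() (returning None on a miss) and store(), matching how _CRAWL_CACHE is used in crawl() above. The real class in pex/util.py may differ.

    # Sketch only -- not the actual pex.util.Memoizer.
    import threading


    class Memoizer(object):
      """A thread-safe key/value store for memoizing computed results."""

      def __init__(self):
        self._data = {}
        self._lock = threading.RLock()

      def get(self, key, default=None):
        # Return the cached value, or `default` (None) on a miss.
        with self._lock:
          return self._data.get(key, default)

      def store(self, key, value):
        with self._lock:
          self._data[key] = value

Because the cache key is (follow_links,) + tuple(links), this scheme assumes Link objects are hashable and compare equal for equal URLs. reset_cache() simply swaps in a fresh Memoizer, which is why test_crawler_remote() calls it before crawling.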