Memoize calls to Crawler.crawl() for performance win in find-links based resolution.
kwlzn committed Dec 17, 2015
1 parent 1e70fdd commit 4f0e54e
Showing 2 changed files with 86 additions and 6 deletions.
30 changes: 27 additions & 3 deletions pex/crawler.py
@@ -12,6 +12,7 @@
from .http import Context
from .link import Link
from .tracer import TRACER
from .util import Memoizer

if PY3:
from queue import Empty, Queue
@@ -64,6 +65,14 @@ def partition(L, pred):
class Crawler(object):
"""A multi-threaded crawler that supports local (disk) and remote (web) crawling."""

# Memoizer for calls to Crawler.crawl().
_CRAWL_CACHE = Memoizer()

@classmethod
def reset_cache(cls):
"""Reset the internal crawl cache. This is intended primarily for tests."""
cls._CRAWL_CACHE = Memoizer()

@classmethod
def crawl_local(cls, link):
try:
@@ -99,7 +108,22 @@ def __init__(self, context=None, threads=1):
self._threads = threads
self.context = context or Context.get()

def _make_cache_key(self, links, follow_links):
return (follow_links,) + tuple(links)

def crawl(self, link_or_links, follow_links=False):
links = list(Link.wrap_iterable(link_or_links))
cache_key = self._make_cache_key(links, follow_links)

# Memoize crawling to a global Memoizer (Crawler._CRAWL_CACHE).
result = self._CRAWL_CACHE.get(cache_key)
if result is None:
result = self._crawl(links, follow_links)
self._CRAWL_CACHE.store(cache_key, result)

return result

def _crawl(self, link_or_links, follow_links):
links, seen = set(), set()
queue = Queue()
converged = threading.Event()
@@ -127,7 +151,8 @@ def execute():
queue.put(rel)
queue.task_done()

for link in Link.wrap_iterable(link_or_links):
for i, link in enumerate(link_or_links):
TRACER.log('crawling link i=%s link=%s follow_links=%s' % (i, link, follow_links), V=3)
queue.put(link)

workers = []
@@ -140,6 +165,5 @@ def execute():
queue.join()
converged.set()

# We deliberately not join back the worker threads, since they are no longer of
# any use to us.
# We deliberately do not join the worker threads, since they are no longer of any use to us.
return links
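
Note: the Memoizer used for _CRAWL_CACHE lives in pex/util.py and is not part of this diff; the new crawl() code only depends on it exposing get() and store(). Below is a minimal dict-backed sketch of that assumed interface (the lock is illustrative, not necessarily how pex.util implements it):

import threading

class Memoizer(object):
  """Sketch of the get()/store() cache interface that Crawler._CRAWL_CACHE relies on."""

  def __init__(self):
    self._lock = threading.Lock()
    self._data = {}

  def get(self, key, default=None):
    # Return the previously stored value for key, or default on a cache miss.
    with self._lock:
      return self._data.get(key, default)

  def store(self, key, value):
    # Record value so subsequent get(key) calls return it.
    with self._lock:
      self._data[key] = value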
62 changes: 59 additions & 3 deletions tests/test_crawler.py
@@ -6,8 +6,14 @@
from twitter.common.contextutil import temporary_dir

from pex.crawler import Crawler, PageParser
from pex.http import Context
from pex.link import Link

try:
from unittest import mock
except ImportError:
import mock


def lpp(page):
links = PageParser.links(page)
@@ -100,6 +106,56 @@ def test_crawler_unknown_scheme():
  assert Crawler().crawl('ftp://ftp.cdrom.com') == (set(), set())


# TODO(wickman)
# test remote http crawling via mock
# test page decoding via mock
MOCK_INDEX_TMPL = '''
<h1>Index of /home/third_party/python</h1>
<table>
<tr>
<td valign="top"><img src="/icons/back.gif" alt="[DIR]"></td>
<td>&nbsp;</td>
<td align="right"> - </td>
<td>&nbsp;</td>
</tr>
%s
</table>
'''

MOCK_INDEX_A = MOCK_INDEX_TMPL % '''
<tr>
<td valign="top"><img src="/icons/compressed.gif" alt="[ ]"></td>
<td><a href="3to2-1.0.tar.gz">3to2-1.0.tar.gz</a></td>
<td align="right">16-Apr-2015 23:18 </td>
<td align="right"> 45K</td>
<td>GZIP compressed docume></td>
</tr>
'''

MOCK_INDEX_B = MOCK_INDEX_TMPL % '''
<tr>
<td valign="top"><img src="/icons/compressed.gif" alt="[ ]"></td>
<td>
<a href="APScheduler-2.1.0.tar.gz">APScheduler-2.1.0.tar.gz</a>
</td>
<td align="right">16-Apr-2015 23:18 </td>
<td align="right"> 41K</td>
<td>GZIP compressed docume></td>
</tr>
'''


def test_crawler_remote():
Crawler.reset_cache()

mock_context = mock.create_autospec(Context, spec_set=True)
mock_context.content.side_effect = [MOCK_INDEX_A, MOCK_INDEX_B, Exception('shouldnt get here')]
expected_output = set([Link('http://url1.test.com/3to2-1.0.tar.gz'),
Link('http://url2.test.com/APScheduler-2.1.0.tar.gz')])

c = Crawler(mock_context)
test_links = [Link('http://url1.test.com'), Link('http://url2.test.com')]
assert c.crawl(test_links) == expected_output

# Test memoization of Crawler.crawl().
assert c.crawl(test_links) == expected_output


# TODO(wickman): test page decoding via mock
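
Because _CRAWL_CACHE is a class attribute, memoized results are shared by every Crawler instance in the process, which is why test_crawler_remote() calls Crawler.reset_cache() up front. A small illustrative sketch of that behavior (the URL and context argument here are hypothetical):

def demo_shared_cache(context):
  # The first crawl fetches the page and populates the class-level cache.
  links = Crawler(context).crawl('http://example.test/index')
  # A brand new Crawler instance with the same arguments hits the cache
  # instead of re-crawling.
  assert Crawler(context).crawl('http://example.test/index') == links
  # Tests that need a fresh crawl must clear the shared cache explicitly.
  Crawler.reset_cache()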
