From 90ef9a1bf7c0655227acb8c19389ae0dd666465b Mon Sep 17 00:00:00 2001
From: Jeff van Santen
Date: Fri, 1 Nov 2019 17:11:52 -0700
Subject: [PATCH] Dead link handling

Added an extra set for tracking dead links and reporting them.
One consequence of this is that the script will "work" offline,
but will report that some or all of the links could not be fetched.
---
 demos/webspider/webspider.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/demos/webspider/webspider.py b/demos/webspider/webspider.py
index 3f151553ef..6747f7e139 100755
--- a/demos/webspider/webspider.py
+++ b/demos/webspider/webspider.py
@@ -50,7 +50,7 @@ def handle_starttag(self, tag, attrs):
 async def main():
     q = queues.Queue()
     start = time.time()
-    fetching, fetched = set(), set()
+    fetching, fetched, dead = set(), set(), set()
 
     async def fetch_url(current_url):
         if current_url in fetching:
@@ -74,6 +74,7 @@ async def worker():
                 await fetch_url(url)
             except Exception as e:
                 print("Exception: %s %s" % (e, url))
+                dead.add(url)
             finally:
                 q.task_done()
 
@@ -82,9 +83,10 @@ async def worker():
     # Start workers, then wait for the work queue to be empty.
     workers = gen.multi([worker() for _ in range(concurrency)])
     await q.join(timeout=timedelta(seconds=300))
-    assert fetching == fetched
+    assert fetching == (fetched | dead)
     print("Done in %d seconds, fetched %s URLs." % (time.time() - start, len(fetched)))
-
+    print("Unable to fetch %s URLs." % len(dead))
+
     # Signal all the workers to exit.
     for _ in range(concurrency):
         await q.put(None)
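
Note: below is a minimal, self-contained sketch of the same bookkeeping pattern,
using only the standard library's asyncio rather than tornado.queues and
tornado.gen, so the failure path can be exercised without a network. flaky_fetch
is a hypothetical stand-in for the demo's get_links_from_url; the
fetching/fetched/dead accounting and the sentinel-based shutdown mirror the
patched demo.

    import asyncio


    # Hypothetical stand-in for the demo's get_links_from_url: URLs containing
    # "dead" are treated as unreachable so the failure path runs offline.
    async def flaky_fetch(url):
        if "dead" in url:
            raise IOError("connection refused")


    async def main():
        q = asyncio.Queue()
        fetching, fetched, dead = set(), set(), set()

        async def worker():
            while True:
                url = await q.get()
                if url is None:
                    q.task_done()
                    return
                fetching.add(url)
                try:
                    await flaky_fetch(url)
                    fetched.add(url)
                except Exception as e:
                    print("Exception: %s %s" % (e, url))
                    dead.add(url)  # record the failure instead of losing it
                finally:
                    q.task_done()

        for url in ("http://example.com/ok", "http://example.com/dead"):
            await q.put(url)

        workers = [asyncio.create_task(worker()) for _ in range(2)]
        await q.join()

        # Every URL that entered fetching is accounted for: fetched or dead.
        assert fetching == (fetched | dead)
        print("Fetched %d URLs, %d dead." % (len(fetched), len(dead)))

        # Signal the workers to exit, mirroring the demo's shutdown.
        for _ in workers:
            await q.put(None)
        await asyncio.gather(*workers)


    if __name__ == "__main__":
        asyncio.run(main())

Recording failures in a separate set is what keeps the final assert total:
every URL that entered fetching ends up in either fetched or dead, even when
every request fails.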