Dead link handling
Added an extra set for tracking dead links, and for reporting them.

One consequence of this is that the script will still "work" offline, but it will report that all of the links could not be fetched.
jvansan authored Nov 2, 2019
1 parent d81d685 commit 90ef9a1
Showing 1 changed file with 5 additions and 3 deletions.
8 changes: 5 additions & 3 deletions demos/webspider/webspider.py
@@ -50,7 +50,7 @@ def handle_starttag(self, tag, attrs):
 async def main():
     q = queues.Queue()
     start = time.time()
-    fetching, fetched = set(), set()
+    fetching, fetched, dead = set(), set(), set()

     async def fetch_url(current_url):
         if current_url in fetching:
@@ -74,6 +74,7 @@ async def worker():
                 await fetch_url(url)
             except Exception as e:
                 print("Exception: %s %s" % (e, url))
+                dead.add(url)
             finally:
                 q.task_done()

@@ -82,9 +83,10 @@ async def worker():
     # Start workers, then wait for the work queue to be empty.
     workers = gen.multi([worker() for _ in range(concurrency)])
     await q.join(timeout=timedelta(seconds=300))
-    assert fetching == fetched
+    assert fetching == (fetched | dead)
     print("Done in %d seconds, fetched %s URLs." % (time.time() - start, len(fetched)))
+    print("Unable to fetch %s URLS." % len(dead))

     # Signal all the workers to exit.
     for _ in range(concurrency):
         await q.put(None)
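
Below is a minimal, self-contained sketch of the pattern this diff introduces: record every URL whose fetch raised in a third set, assert that fetching == (fetched | dead) after the queue drains, and report the count of dead links at the end. The fake_fetch coroutine, the sample URLs, and the fixed worker count are placeholders invented for this sketch; the real demo uses get_links_from_url() and a base URL instead.

    # Minimal sketch of the dead-set bookkeeping added in this commit.
    # fake_fetch, the sample URLs, and concurrency=2 are illustrative only.
    import asyncio

    from tornado import gen, queues


    async def fake_fetch(url):
        # Stand-in for the demo's get_links_from_url(); URLs containing
        # "bad" raise, mimicking a dead link.
        if "bad" in url:
            raise IOError("connection failed")
        return []


    async def main():
        q = queues.Queue()
        fetching, fetched, dead = set(), set(), set()
        concurrency = 2

        async def worker():
            async for url in q:
                if url is None:
                    return
                try:
                    fetching.add(url)
                    await fake_fetch(url)
                    fetched.add(url)
                except Exception as e:
                    print("Exception: %s %s" % (e, url))
                    dead.add(url)
                finally:
                    q.task_done()

        for url in ["http://ok.example/", "http://bad.example/"]:
            await q.put(url)

        # Start workers, then wait for the work queue to be empty.
        workers = gen.multi([worker() for _ in range(concurrency)])
        await q.join()
        # Every URL we started fetching ended up either fetched or dead.
        assert fetching == (fetched | dead)
        print("Unable to fetch %s URLS." % len(dead))

        # Signal all the workers to exit.
        for _ in range(concurrency):
            await q.put(None)
        await workers


    asyncio.run(main())

When run offline, every fetch raises, so all URLs land in the dead set and the final report counts them all, which is the behaviour described in the commit message.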
