From 90ef9a1bf7c0655227acb8c19389ae0dd666465b Mon Sep 17 00:00:00 2001
From: Jeff van Santen
Date: Fri, 1 Nov 2019 17:11:52 -0700
Subject: [PATCH] Dead link handling

Added an extra set for tracking dead links, and reporting of how many
links could not be fetched. One consequence of this is that the script
will still "work" offline, but will report that all of the links were
not fetched.
---
 demos/webspider/webspider.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/demos/webspider/webspider.py b/demos/webspider/webspider.py
index 3f151553e..6747f7e13 100755
--- a/demos/webspider/webspider.py
+++ b/demos/webspider/webspider.py
@@ -50,7 +50,7 @@ def get_links(html):
 async def main():
     q = queues.Queue()
     start = time.time()
-    fetching, fetched = set(), set()
+    fetching, fetched, dead = set(), set(), set()
 
     async def fetch_url(current_url):
         if current_url in fetching:
@@ -74,6 +74,7 @@ async def main():
                 await fetch_url(url)
             except Exception as e:
                 print("Exception: %s %s" % (e, url))
+                dead.add(url)
             finally:
                 q.task_done()
 
@@ -82,9 +83,10 @@ async def main():
     # Start workers, then wait for the work queue to be empty.
     workers = gen.multi([worker() for _ in range(concurrency)])
     await q.join(timeout=timedelta(seconds=300))
-    assert fetching == fetched
+    assert fetching == (fetched | dead)
     print("Done in %d seconds, fetched %s URLs." % (time.time() - start, len(fetched)))
-
+    print("Unable to fetch %s URLS." % len(dead))
+
     # Signal all the workers to exit.
     for _ in range(concurrency):
         await q.put(None)
-- 
2.47.2