From: Jeff van Santen
Date: Sat, 2 Nov 2019 00:11:52 +0000 (-0700)
Subject: Dead link handling
X-Git-Tag: v6.1.0b1~49^2
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=refs%2Fpull%2F2765%2Fhead;p=thirdparty%2Ftornado.git

Dead link handling

Added an extra set for tracking dead links and reporting them.
One consequence of this is that using this script will "work"
offline, but it will report that all the links were not fetched.
---

diff --git a/demos/webspider/webspider.py b/demos/webspider/webspider.py
index 3f151553e..6747f7e13 100755
--- a/demos/webspider/webspider.py
+++ b/demos/webspider/webspider.py
@@ -50,7 +50,7 @@ def get_links(html):
 async def main():
     q = queues.Queue()
     start = time.time()
-    fetching, fetched = set(), set()
+    fetching, fetched, dead = set(), set(), set()
 
     async def fetch_url(current_url):
         if current_url in fetching:
@@ -74,6 +74,7 @@ async def main():
                 await fetch_url(url)
             except Exception as e:
                 print("Exception: %s %s" % (e, url))
+                dead.add(url)
             finally:
                 q.task_done()
 
@@ -82,9 +83,10 @@ async def main():
     # Start workers, then wait for the work queue to be empty.
     workers = gen.multi([worker() for _ in range(concurrency)])
     await q.join(timeout=timedelta(seconds=300))
-    assert fetching == fetched
+    assert fetching == (fetched | dead)
     print("Done in %d seconds, fetched %s URLs." % (time.time() - start, len(fetched)))
-
+    print("Unable to fetch %s URLs." % len(dead))
+
     # Signal all the workers to exit.
     for _ in range(concurrency):
         await q.put(None)
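
For reference, below is a minimal, self-contained sketch of the bookkeeping
pattern this patch introduces. It substitutes asyncio.Queue for
tornado.queues and a hypothetical LINKS dict for real HTTP fetches (both are
illustrative assumptions, not part of the demo); the point is the invariant
that every URL entering the fetching set ends up in either fetched or dead.

    import asyncio

    # Hypothetical link graph standing in for real HTTP fetches;
    # any URL missing from the dict plays the role of a dead link.
    LINKS = {
        "root": ["a", "bad"],
        "a": ["root"],
    }

    async def crawl(concurrency=2):
        q = asyncio.Queue()
        fetching, fetched, dead = set(), set(), set()

        async def fetch_url(url):
            if url in fetching:
                return
            fetching.add(url)
            if url not in LINKS:
                raise ValueError("dead link")  # simulated fetch failure
            fetched.add(url)
            for link in LINKS[url]:
                await q.put(link)

        async def worker():
            while True:
                url = await q.get()
                if url is None:
                    return
                try:
                    await fetch_url(url)
                except Exception as e:
                    print("Exception: %s %s" % (e, url))
                    dead.add(url)  # record the failure instead of losing the URL
                finally:
                    q.task_done()

        await q.put("root")
        workers = [asyncio.create_task(worker()) for _ in range(concurrency)]
        await q.join()
        # Every URL that entered fetching ended up either fetched or dead.
        assert fetching == (fetched | dead)
        print("Fetched %s URLs, %s dead." % (len(fetched), len(dead)))
        for _ in range(concurrency):
            await q.put(None)  # signal workers to exit
        await asyncio.gather(*workers)

    if __name__ == "__main__":
        asyncio.run(crawl())

Running the sketch prints the one simulated failure and exits cleanly;
without the dead set, a failed URL would remain only in fetching and the
final assert would fail even after the queue drained, which is the situation
the patch above addresses.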