From 12b8e4f9496c2a7f4c7d04c314e6a950287049b5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?=
Date: Fri, 5 Apr 2019 16:12:54 +0200
Subject: [PATCH] chg: Improve async processing

---
 bin/async_scrape.py  | 7 +++++--
 lookyloo/lookyloo.py | 9 ++++++---
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/bin/async_scrape.py b/bin/async_scrape.py
index a7cd5bab..52edfaf4 100755
--- a/bin/async_scrape.py
+++ b/bin/async_scrape.py
@@ -5,7 +5,7 @@ from pathlib import Path
 import logging
 
 from lookyloo.abstractmanager import AbstractManager
-from lookyloo.helpers import get_homedir, set_running, unset_running
+from lookyloo.helpers import get_homedir, set_running, unset_running, shutdown_requested
 from lookyloo.lookyloo import Lookyloo
 
 logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
@@ -22,7 +22,10 @@ class AsyncScraper(AbstractManager):
 
     def _to_run_forever(self):
         set_running('async_scrape')
-        self.lookyloo.process_scrape_queue()
+        while True:
+            url = self.lookyloo.process_scrape_queue()
+            if url is None or shutdown_requested():
+                break
         unset_running('async_scrape')
 
 
diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py
index 713955f2..f075414d 100644
--- a/lookyloo/lookyloo.py
+++ b/lookyloo/lookyloo.py
@@ -113,11 +113,14 @@ class Lookyloo():
     def process_scrape_queue(self):
         uuid = self.redis.spop('to_scrape')
         if not uuid:
-            return
+            return None
         to_scrape = self.redis.hgetall(uuid)
         self.redis.delete(uuid)
         to_scrape['perma_uuid'] = uuid
-        self.scrape(**to_scrape)
+        if self.scrape(**to_scrape):
+            self.logger.info(f'Processed {to_scrape["url"]}')
+            return True
+        return False
 
     def load_tree(self, report_dir: Path):
         har_files = sorted(report_dir.glob('*.har'))
@@ -152,7 +155,7 @@
         items = crawl(self.splash_url, url, depth, user_agent=user_agent, log_enabled=True, log_level='INFO')
         if not items:
             # broken
-            pass
+            return False
         if not perma_uuid:
             perma_uuid = str(uuid4())
         width = len(str(len(items)))