From 0389b9c23b9280c74fc6b66911672415d4d944d0 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Mon, 13 May 2019 14:24:16 +0200 Subject: [PATCH] chg: [crawler] manual/auto crawler: always save screenshots --- bin/torcrawler/TorSplashCrawler.py | 10 +++++----- bin/torcrawler/tor_crawler.py | 3 ++- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py index 17710bf6..4e36c1c9 100644 --- a/bin/torcrawler/TorSplashCrawler.py +++ b/bin/torcrawler/TorSplashCrawler.py @@ -48,16 +48,16 @@ class TorSplashCrawler(): 'DEPTH_LIMIT': crawler_options['depth_limit'] }) - def crawl(self, type, crawler_options, date, url, domain, port, original_item): - self.process.crawl(self.crawler, type=type, crawler_options=crawler_options, date=date, url=url, domain=domain, port=port, original_item=original_item) + def crawl(self, type, crawler_options, date, requested_mode, url, domain, port, original_item): + self.process.crawl(self.crawler, type=type, crawler_options=crawler_options, date=date, requested_mode=requested_mode, url=url, domain=domain, port=port, original_item=original_item) self.process.start() class TorSplashSpider(Spider): name = 'TorSplashSpider' - def __init__(self, type, crawler_options, date, url, domain, port, original_item, *args, **kwargs): + def __init__(self, type, crawler_options, date, requested_mode, url, domain, port, original_item, *args, **kwargs): self.type = type - self.requested = crawler_options['requested'] + self.requested_mode = requested_mode self.original_item = original_item self.root_key = None self.start_urls = url @@ -184,7 +184,7 @@ class TorSplashCrawler(): if 'png' in response.data: size_screenshot = (len(response.data['png'])*3) /4 - if size_screenshot < 5000000 or self.requested: #bytes or manual/auto + if size_screenshot < 5000000 or self.requested_mode: #bytes or manual/auto image_content = base64.standard_b64decode(response.data['png'].encode()) hash = sha256(image_content).hexdigest() img_dir_path = os.path.join(hash[0:2], hash[2:4], hash[4:6], hash[6:8], hash[8:10], hash[10:12]) diff --git a/bin/torcrawler/tor_crawler.py b/bin/torcrawler/tor_crawler.py index 13a67545..2d8365c4 100755 --- a/bin/torcrawler/tor_crawler.py +++ b/bin/torcrawler/tor_crawler.py @@ -43,8 +43,9 @@ if __name__ == '__main__': original_item = crawler_json['item'] crawler_options = crawler_json['crawler_options'] date = crawler_json['date'] + requested_mode = crawler_json['requested'] redis_cache.delete('crawler_request:{}'.format(uuid)) crawler = TorSplashCrawler(splash_url, crawler_options) - crawler.crawl(service_type, crawler_options, date, url, domain, port, original_item) + crawler.crawl(service_type, crawler_options, date, requested_mode, url, domain, port, original_item)