From 8754350d39e5e051681343295f7e3d0db3123386 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Fri, 26 Mar 2021 11:30:06 +0100 Subject: [PATCH] fix: [crawler] user agent + splash restart --- bin/Crawler.py | 1 + bin/torcrawler/TorSplashCrawler.py | 9 +++++++-- configs/core.cfg.sample | 2 +- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/bin/Crawler.py b/bin/Crawler.py index b12d0f11..e9f75f9a 100755 --- a/bin/Crawler.py +++ b/bin/Crawler.py @@ -208,6 +208,7 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config): if nb_retry == 2: crawlers.restart_splash_docker(splash_url, splash_name) + time.sleep(20) if nb_retry == 6: on_error_send_message_back_in_queue(type_service, domain, message) diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py index 17438d60..6742f613 100644 --- a/bin/torcrawler/TorSplashCrawler.py +++ b/bin/torcrawler/TorSplashCrawler.py @@ -48,6 +48,9 @@ function main(splash, args) splash.html5_media_enabled = true splash.http2_enabled = true + -- User Agent + splash:set_user_agent(args.user_agent) + -- User defined splash.resource_timeout = args.resource_timeout splash.timeout = args.timeout @@ -71,7 +74,7 @@ function main(splash, args) splash:wait{args.wait} -- Page instrumentation -- splash.scroll_position = {y=1000} - splash:wait{args.wait} + -- splash:wait{args.wait} -- Response return { har = splash:har(), @@ -88,7 +91,7 @@ class TorSplashCrawler(): def __init__(self, splash_url, crawler_options): self.process = CrawlerProcess({'LOG_ENABLED': True}) self.crawler = Crawler(self.TorSplashSpider, { - 'USER_AGENT': crawler_options['user_agent'], + 'USER_AGENT': crawler_options['user_agent'], # /!\ overwritten by lua script 'SPLASH_URL': splash_url, 'ROBOTSTXT_OBEY': False, 'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723, @@ -126,6 +129,7 @@ class TorSplashCrawler(): self.date_month = date['date_month'] self.date_epoch = int(date['epoch']) + self.user_agent = 
crawler_options['user_agent'] self.png = crawler_options['png'] self.har = crawler_options['har'] self.cookies = cookies @@ -150,6 +154,7 @@ class TorSplashCrawler(): return {'wait': 10, 'resource_timeout': 30, # /!\ Weird behaviour if timeout < resource_timeout /!\ 'timeout': 30, + 'user_agent': self.user_agent, 'cookies': cookies, 'lua_source': script_cookie } diff --git a/configs/core.cfg.sample b/configs/core.cfg.sample index 669550b9..df6fed66 100644 --- a/configs/core.cfg.sample +++ b/configs/core.cfg.sample @@ -271,7 +271,7 @@ crawler_depth_limit = 1 default_crawler_har = True default_crawler_png = True default_crawler_closespider_pagecount = 50 -default_crawler_user_agent = Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0 +default_crawler_user_agent = Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/78.0 splash_url = http://127.0.0.1 splash_port = 8050-8052 domain_proxy = onion.foundation