mirror of https://github.com/CIRCL/AIL-framework
fix: [crawler] user agent + splash restart
parent
5a93b86524
commit
8754350d39
|
@ -208,6 +208,7 @@ def crawl_onion(url, domain, port, type_service, message, crawler_config):
|
||||||
|
|
||||||
if nb_retry == 2:
|
if nb_retry == 2:
|
||||||
crawlers.restart_splash_docker(splash_url, splash_name)
|
crawlers.restart_splash_docker(splash_url, splash_name)
|
||||||
|
time.sleep(20)
|
||||||
|
|
||||||
if nb_retry == 6:
|
if nb_retry == 6:
|
||||||
on_error_send_message_back_in_queue(type_service, domain, message)
|
on_error_send_message_back_in_queue(type_service, domain, message)
|
||||||
|
|
|
@ -48,6 +48,9 @@ function main(splash, args)
|
||||||
splash.html5_media_enabled = true
|
splash.html5_media_enabled = true
|
||||||
splash.http2_enabled = true
|
splash.http2_enabled = true
|
||||||
|
|
||||||
|
-- User Agent
|
||||||
|
splash:set_user_agent(args.user_agent)
|
||||||
|
|
||||||
-- User defined
|
-- User defined
|
||||||
splash.resource_timeout = args.resource_timeout
|
splash.resource_timeout = args.resource_timeout
|
||||||
splash.timeout = args.timeout
|
splash.timeout = args.timeout
|
||||||
|
@ -71,7 +74,7 @@ function main(splash, args)
|
||||||
splash:wait{args.wait}
|
splash:wait{args.wait}
|
||||||
-- Page instrumentation
|
-- Page instrumentation
|
||||||
-- splash.scroll_position = {y=1000}
|
-- splash.scroll_position = {y=1000}
|
||||||
splash:wait{args.wait}
|
-- splash:wait{args.wait}
|
||||||
-- Response
|
-- Response
|
||||||
return {
|
return {
|
||||||
har = splash:har(),
|
har = splash:har(),
|
||||||
|
@ -88,7 +91,7 @@ class TorSplashCrawler():
|
||||||
def __init__(self, splash_url, crawler_options):
|
def __init__(self, splash_url, crawler_options):
|
||||||
self.process = CrawlerProcess({'LOG_ENABLED': True})
|
self.process = CrawlerProcess({'LOG_ENABLED': True})
|
||||||
self.crawler = Crawler(self.TorSplashSpider, {
|
self.crawler = Crawler(self.TorSplashSpider, {
|
||||||
'USER_AGENT': crawler_options['user_agent'],
|
'USER_AGENT': crawler_options['user_agent'], # /!\ overwritten by lua script
|
||||||
'SPLASH_URL': splash_url,
|
'SPLASH_URL': splash_url,
|
||||||
'ROBOTSTXT_OBEY': False,
|
'ROBOTSTXT_OBEY': False,
|
||||||
'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
|
'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
|
||||||
|
@ -126,6 +129,7 @@ class TorSplashCrawler():
|
||||||
self.date_month = date['date_month']
|
self.date_month = date['date_month']
|
||||||
self.date_epoch = int(date['epoch'])
|
self.date_epoch = int(date['epoch'])
|
||||||
|
|
||||||
|
self.user_agent = crawler_options['user_agent']
|
||||||
self.png = crawler_options['png']
|
self.png = crawler_options['png']
|
||||||
self.har = crawler_options['har']
|
self.har = crawler_options['har']
|
||||||
self.cookies = cookies
|
self.cookies = cookies
|
||||||
|
@ -150,6 +154,7 @@ class TorSplashCrawler():
|
||||||
return {'wait': 10,
|
return {'wait': 10,
|
||||||
'resource_timeout': 30, # /!\ Weird behaviour if timeout < resource_timeout /!\
|
'resource_timeout': 30, # /!\ Weird behaviour if timeout < resource_timeout /!\
|
||||||
'timeout': 30,
|
'timeout': 30,
|
||||||
|
'user_agent': self.user_agent,
|
||||||
'cookies': cookies,
|
'cookies': cookies,
|
||||||
'lua_source': script_cookie
|
'lua_source': script_cookie
|
||||||
}
|
}
|
||||||
|
|
|
@ -271,7 +271,7 @@ crawler_depth_limit = 1
|
||||||
default_crawler_har = True
|
default_crawler_har = True
|
||||||
default_crawler_png = True
|
default_crawler_png = True
|
||||||
default_crawler_closespider_pagecount = 50
|
default_crawler_closespider_pagecount = 50
|
||||||
default_crawler_user_agent = Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0
|
default_crawler_user_agent = Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/78.0
|
||||||
splash_url = http://127.0.0.1
|
splash_url = http://127.0.0.1
|
||||||
splash_port = 8050-8052
|
splash_port = 8050-8052
|
||||||
domain_proxy = onion.foundation
|
domain_proxy = onion.foundation
|
||||||
|
|
Loading…
Reference in New Issue