mirror of https://github.com/CIRCL/AIL-framework
chg: [Crawler] add default crawler config + update default user_agent
parent
d9279823d5
commit
09ecc4d93f
|
@ -341,13 +341,27 @@ if __name__ == '__main__':
|
||||||
|
|
||||||
faup = Faup()
|
faup = Faup()
|
||||||
|
|
||||||
|
# get HAR files
|
||||||
|
default_crawler_har = p.config.getboolean("Crawler", "default_crawler_har")
|
||||||
|
if default_crawler_har:
|
||||||
|
default_crawler_har = 1
|
||||||
|
else:
|
||||||
|
default_crawler_har = 0
|
||||||
|
|
||||||
|
# get PNG files
|
||||||
|
default_crawler_png = p.config.getboolean("Crawler", "default_crawler_png")
|
||||||
|
if default_crawler_png:
|
||||||
|
default_crawler_png = 1
|
||||||
|
else:
|
||||||
|
default_crawler_png = 0
|
||||||
|
|
||||||
# Default crawler options
|
# Default crawler options
|
||||||
default_crawler_config = {'html': 1,
|
default_crawler_config = {'html': 1,
|
||||||
'har': 1,
|
'har': default_crawler_har,
|
||||||
'png': 1,
|
'png': default_crawler_png,
|
||||||
'depth_limit': p.config.getint("Crawler", "crawler_depth_limit"),
|
'depth_limit': p.config.getint("Crawler", "crawler_depth_limit"),
|
||||||
'closespider_pagecount': 50,
|
'closespider_pagecount': p.config.getint("Crawler", "default_crawler_closespider_pagecount"),
|
||||||
'user_agent': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0'}
|
'user_agent': p.config.get("Crawler", "default_crawler_user_agent")}
|
||||||
|
|
||||||
# Track launched crawler
|
# Track launched crawler
|
||||||
r_cache.sadd('all_crawler', splash_port)
|
r_cache.sadd('all_crawler', splash_port)
|
||||||
|
|
|
@ -252,5 +252,9 @@ db = 0
|
||||||
[Crawler]
|
[Crawler]
|
||||||
activate_crawler = False
|
activate_crawler = False
|
||||||
crawler_depth_limit = 1
|
crawler_depth_limit = 1
|
||||||
|
default_crawler_har = True
|
||||||
|
default_crawler_png = True
|
||||||
|
default_crawler_closespider_pagecount = 50
|
||||||
|
default_crawler_user_agent = Mozilla/5.0 (Windows NT 6.1; rv:60.0) Gecko/20100101 Firefox/60.0
|
||||||
splash_url = http://127.0.0.1
|
splash_url = http://127.0.0.1
|
||||||
splash_port = 8050-8052
|
splash_port = 8050-8052
|
||||||
|
|
Loading…
Reference in New Issue