mirror of https://github.com/CIRCL/AIL-framework
fix: [Crawler] change max pages crawled
parent
6c7086f4eb
commit
92d192238b
|
@ -209,16 +209,16 @@ if __name__ == '__main__':
|
||||||
date_month = datetime.datetime.now().strftime("%Y%m")
|
date_month = datetime.datetime.now().strftime("%Y%m")
|
||||||
|
|
||||||
if not r_onion.sismember('month_{}_up:{}'.format(type_hidden_service, date_month), domain) and not r_onion.sismember('{}_down:{}'.format(type_hidden_service, date), domain):
|
if not r_onion.sismember('month_{}_up:{}'.format(type_hidden_service, date_month), domain) and not r_onion.sismember('{}_down:{}'.format(type_hidden_service, date), domain):
|
||||||
|
# first seen
|
||||||
|
if not r_onion.hexists('{}_metadata:{}'.format(type_hidden_service, domain), 'first_seen'):
|
||||||
|
r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'first_seen', date)
|
||||||
|
|
||||||
# last_father
|
# last_father
|
||||||
r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'paste_parent', paste)
|
r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'paste_parent', paste)
|
||||||
|
|
||||||
# last check
|
# last check
|
||||||
r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
|
r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
|
||||||
|
|
||||||
# first seen
|
|
||||||
if not r_onion.exists('{}_metadata:{}'.format(type_hidden_service, domain)):
|
|
||||||
r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'first_seen', date)
|
|
||||||
|
|
||||||
crawl_onion(url, domain, date, date_month, message)
|
crawl_onion(url, domain, date, date_month, message)
|
||||||
if url != domain_url:
|
if url != domain_url:
|
||||||
print(url)
|
print(url)
|
||||||
|
|
|
@ -42,7 +42,7 @@ class TorSplashCrawler():
|
||||||
'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
|
'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
|
||||||
'HTTPERROR_ALLOW_ALL': True,
|
'HTTPERROR_ALLOW_ALL': True,
|
||||||
'RETRY_TIMES': 2,
|
'RETRY_TIMES': 2,
|
||||||
'CLOSESPIDER_PAGECOUNT': 1000,
|
'CLOSESPIDER_PAGECOUNT': 50,
|
||||||
'DEPTH_LIMIT': crawler_depth_limit
|
'DEPTH_LIMIT': crawler_depth_limit
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue