mirror of https://github.com/CIRCL/AIL-framework
fix: [Crawler] limit max crawled pages
parent
64ffdd52e8
commit
c1b34bd99c
|
@ -205,6 +205,11 @@ if __name__ == '__main__':
|
|||
date_month = datetime.datetime.now().strftime("%Y%m")
|
||||
|
||||
if not r_onion.sismember('month_{}_up:{}'.format(type_hidden_service, date_month), domain) and not r_onion.sismember('{}_down:{}'.format(type_hidden_service, date), domain):
|
||||
# last_father
|
||||
r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'paste_parent', paste)
|
||||
|
||||
# last check
|
||||
r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
|
||||
|
||||
crawl_onion(url, domain, date, date_month, message)
|
||||
if url != domain_url:
|
||||
|
@ -226,12 +231,6 @@ if __name__ == '__main__':
|
|||
msg = 'infoleak:automatic-detection="{}";{}'.format(type_hidden_service, paste)
|
||||
p.populate_set_out(msg, 'Tags')
|
||||
|
||||
# last check
|
||||
r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
|
||||
|
||||
# last_father
|
||||
r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'paste_parent', paste)
|
||||
|
||||
# add onion screenshot history
|
||||
# add crawled days
|
||||
if r_onion.lindex('{}_history:{}'.format(type_hidden_service, domain), 0) != date:
|
||||
|
|
|
@ -42,6 +42,7 @@ class TorSplashCrawler():
|
|||
'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
|
||||
'HTTPERROR_ALLOW_ALL': True,
|
||||
'RETRY_TIMES': 2,
|
||||
'CLOSESPIDER_PAGECOUNT': 1000,
|
||||
'DEPTH_LIMIT': crawler_depth_limit
|
||||
})
|
||||
|
||||
|
|
|
@ -426,7 +426,7 @@ function refresh_list_crawled(){
|
|||
newCell.innerHTML = "<td><i class=\"fa fa-"+icon+"-circle fa-2x\" style=\"color:"+text_color+";\"></i>"+crawler['crawler_info']+"</td>";
|
||||
|
||||
newCell = newRow.insertCell(1);
|
||||
newCell.innerHTML = "<td>"+crawler['crawling_domain']+"</td>";
|
||||
newCell.innerHTML = "<td><a target=\"_blank\" href=\"{{ url_for('hiddenServices.onion_domain') }}?onion_domain="+crawler['crawling_domain']+"\">"+crawler['crawling_domain']+"</a></td>";
|
||||
|
||||
newCell = newRow.insertCell(2);
|
||||
newCell.innerHTML = "<td><div style=\"color:"+text_color+";\">"+crawler['status_info']+"</div></td>";
|
||||
|
|
Loading…
Reference in New Issue