fix: [Crawler] limit max crawled pages

pull/333/head
Terrtia 2019-01-29 15:38:00 +01:00
parent 64ffdd52e8
commit c1b34bd99c
No known key found for this signature in database
GPG Key ID: 1E1B1F50D84613D0
3 changed files with 7 additions and 7 deletions

View File

@ -205,6 +205,11 @@ if __name__ == '__main__':
date_month = datetime.datetime.now().strftime("%Y%m")
if not r_onion.sismember('month_{}_up:{}'.format(type_hidden_service, date_month), domain) and not r_onion.sismember('{}_down:{}'.format(type_hidden_service, date), domain):
# last_father
r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'paste_parent', paste)
# last check
r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
crawl_onion(url, domain, date, date_month, message)
if url != domain_url:
@ -226,12 +231,6 @@ if __name__ == '__main__':
msg = 'infoleak:automatic-detection="{}";{}'.format(type_hidden_service, paste)
p.populate_set_out(msg, 'Tags')
# last check
r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
# last_father
r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'paste_parent', paste)
# add onion screenshot history
# add crawled days
if r_onion.lindex('{}_history:{}'.format(type_hidden_service, domain), 0) != date:

View File

@ -42,6 +42,7 @@ class TorSplashCrawler():
'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
'HTTPERROR_ALLOW_ALL': True,
'RETRY_TIMES': 2,
'CLOSESPIDER_PAGECOUNT': 1000,
'DEPTH_LIMIT': crawler_depth_limit
})

View File

@ -426,7 +426,7 @@ function refresh_list_crawled(){
newCell.innerHTML = "<td><i class=\"fa fa-"+icon+"-circle fa-2x\" style=\"color:"+text_color+";\"></i>"+crawler['crawler_info']+"</td>";
newCell = newRow.insertCell(1);
newCell.innerHTML = "<td>"+crawler['crawling_domain']+"</td>";
newCell.innerHTML = "<td><a target=\"_blank\" href=\"{{ url_for('hiddenServices.onion_domain') }}?onion_domain="+crawler['crawling_domain']+"\">"+crawler['crawling_domain']+"</a></td>";
newCell = newRow.insertCell(2);
newCell.innerHTML = "<td><div style=\"color:"+text_color+";\">"+crawler['status_info']+"</div></td>";