mirror of https://github.com/CIRCL/AIL-framework
				
				
				
			fix: [Crawler] limit max crawled pages
							parent
							
								
									64ffdd52e8
								
							
						
					
					
						commit
						c1b34bd99c
					
				|  | @ -205,6 +205,11 @@ if __name__ == '__main__': | |||
|                     date_month = datetime.datetime.now().strftime("%Y%m") | ||||
| 
 | ||||
|                     if not r_onion.sismember('month_{}_up:{}'.format(type_hidden_service, date_month), domain) and not r_onion.sismember('{}_down:{}'.format(type_hidden_service, date), domain): | ||||
|                         # last_father | ||||
|                         r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'paste_parent', paste) | ||||
| 
 | ||||
|                         # last check | ||||
|                         r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date) | ||||
| 
 | ||||
|                         crawl_onion(url, domain, date, date_month, message) | ||||
|                         if url != domain_url: | ||||
|  | @ -226,12 +231,6 @@ if __name__ == '__main__': | |||
|                                 msg = 'infoleak:automatic-detection="{}";{}'.format(type_hidden_service, paste) | ||||
|                                 p.populate_set_out(msg, 'Tags') | ||||
| 
 | ||||
|                         # last check | ||||
|                         r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date) | ||||
| 
 | ||||
|                         # last_father | ||||
|                         r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'paste_parent', paste) | ||||
| 
 | ||||
|                         # add onion screenshot history | ||||
|                             # add crawled days | ||||
|                         if r_onion.lindex('{}_history:{}'.format(type_hidden_service, domain), 0) != date: | ||||
|  |  | |||
|  | @ -42,6 +42,7 @@ class TorSplashCrawler(): | |||
|             'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter', | ||||
|             'HTTPERROR_ALLOW_ALL': True, | ||||
|             'RETRY_TIMES': 2, | ||||
|             'CLOSESPIDER_PAGECOUNT': 1000, | ||||
|             'DEPTH_LIMIT': crawler_depth_limit | ||||
|             }) | ||||
| 
 | ||||
|  |  | |||
|  | @ -426,7 +426,7 @@ function refresh_list_crawled(){ | |||
| 						newCell.innerHTML = "<td><i class=\"fa fa-"+icon+"-circle fa-2x\" style=\"color:"+text_color+";\"></i>"+crawler['crawler_info']+"</td>"; | ||||
| 
 | ||||
| 						newCell  = newRow.insertCell(1); | ||||
| 						newCell.innerHTML = "<td>"+crawler['crawling_domain']+"</td>"; | ||||
| 						newCell.innerHTML = "<td><a target=\"_blank\" href=\"{{ url_for('hiddenServices.onion_domain') }}?onion_domain="+crawler['crawling_domain']+"\">"+crawler['crawling_domain']+"</a></td>"; | ||||
| 
 | ||||
| 						newCell  = newRow.insertCell(2); | ||||
| 						newCell.innerHTML = "<td><div style=\"color:"+text_color+";\">"+crawler['status_info']+"</div></td>"; | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	 Terrtia
						Terrtia