mirror of https://github.com/CIRCL/AIL-framework
				
				
				
			fix: [Crawler] fix onion blacklist + add crawler info
							parent
							
								
									1a1fda4c47
								
							
						
					
					
						commit
						bb301a870c
					
				|  | @ -10,6 +10,8 @@ import time | |||
| import subprocess | ||||
| import requests | ||||
| 
 | ||||
| from pyfaup.faup import Faup | ||||
| 
 | ||||
| sys.path.append(os.environ['AIL_BIN']) | ||||
| from Helper import Process | ||||
| from pubsublogger import publisher | ||||
|  | @ -22,6 +24,9 @@ def on_error_send_message_back_in_queue(type_hidden_service, domain, message): | |||
| 
 | ||||
| def crawl_onion(url, domain, date, date_month, message): | ||||
| 
 | ||||
|     r_cache.hset('metadata_crawler:{}'.format(splash_port), 'crawling_domain', domain) | ||||
|     r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d  -  %H:%M.%S")) | ||||
| 
 | ||||
|     #if not r_onion.sismember('full_onion_up', domain) and not r_onion.sismember('onion_down:'+date , domain): | ||||
|     super_father = r_serv_metadata.hget('paste_metadata:'+paste, 'super_father') | ||||
|     if super_father is None: | ||||
|  | @ -43,13 +48,15 @@ def crawl_onion(url, domain, date, date_month, message): | |||
|                 print('--------------------------------------') | ||||
|                 print('         \033[91m DOCKER SPLASH DOWN\033[0m') | ||||
|                 print('          {} DOWN'.format(splash_url)) | ||||
|                 exit(1) | ||||
|                 r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'SPLASH DOWN') | ||||
|                 nb_retry == 0 | ||||
| 
 | ||||
|             print('         \033[91m DOCKER SPLASH NOT AVAILABLE\033[0m') | ||||
|             print('          Retry({}) in 10 seconds'.format(nb_retry)) | ||||
|             time.sleep(10) | ||||
| 
 | ||||
|     if r.status_code == 200: | ||||
|         r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Crawling') | ||||
|         process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', splash_url, type_hidden_service, url, domain, paste, super_father], | ||||
|                                    stdout=subprocess.PIPE) | ||||
|         while process.poll() is None: | ||||
|  | @ -67,6 +74,7 @@ def crawl_onion(url, domain, date, date_month, message): | |||
|                 print('') | ||||
|                 print('            PROXY DOWN OR BAD CONFIGURATION\033[0m'.format(splash_url)) | ||||
|                 print('------------------------------------------------------------------------') | ||||
|                 r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Error') | ||||
|                 exit(-2) | ||||
|         else: | ||||
|             print(process.stdout.read()) | ||||
|  | @ -76,6 +84,7 @@ def crawl_onion(url, domain, date, date_month, message): | |||
|         print('--------------------------------------') | ||||
|         print('         \033[91m DOCKER SPLASH DOWN\033[0m') | ||||
|         print('          {} DOWN'.format(splash_url)) | ||||
|         r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Crawling') | ||||
|         exit(1) | ||||
| 
 | ||||
| 
 | ||||
|  | @ -119,6 +128,7 @@ if __name__ == '__main__': | |||
|     print('splash url: {}'.format(splash_url)) | ||||
| 
 | ||||
|     crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit") | ||||
|     faup = Faup() | ||||
| 
 | ||||
|     PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes")) | ||||
| 
 | ||||
|  | @ -140,6 +150,10 @@ if __name__ == '__main__': | |||
|         db=p.config.getint("ARDB_Onion", "db"), | ||||
|         decode_responses=True) | ||||
| 
 | ||||
|     r_cache.sadd('all_crawler:{}'.format(type_hidden_service), splash_port) | ||||
|     r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting') | ||||
|     r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d  -  %H:%M.%S")) | ||||
| 
 | ||||
|     # load domains blacklist | ||||
|     try: | ||||
|         with open(os.environ['AIL_BIN']+'/torcrawler/blacklist_onion.txt', 'r') as f: | ||||
|  | @ -180,7 +194,10 @@ if __name__ == '__main__': | |||
|                 print('domain:      {}'.format(domain)) | ||||
|                 print('domain_url:  {}'.format(domain_url)) | ||||
| 
 | ||||
|                 if not r_onion.sismember('blacklist_{}'.format(type_hidden_service), domain): | ||||
|                 faup.decode(domain) | ||||
|                 onion_domain=faup.get()['domain'].decode() | ||||
| 
 | ||||
|                 if not r_onion.sismember('blacklist_{}'.format(type_hidden_service), domain) and not r_onion.sismember('blacklist_{}'.format(type_hidden_service), onion_domain): | ||||
| 
 | ||||
|                     date = datetime.datetime.now().strftime("%Y%m%d") | ||||
|                     date_month = datetime.datetime.now().strftime("%Y%m") | ||||
|  | @ -243,6 +260,10 @@ if __name__ == '__main__': | |||
|                         r_onion.lpush('last_{}'.format(type_hidden_service), domain) | ||||
|                         r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15) | ||||
| 
 | ||||
|                         #update crawler status | ||||
|                         r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting') | ||||
|                         r_cache.hrem('metadata_crawler:{}'.format(splash_port), 'crawling_domain') | ||||
| 
 | ||||
|             else: | ||||
|                 continue | ||||
|         else: | ||||
|  |  | |||
|  | @ -30,6 +30,12 @@ r_serv = redis.StrictRedis( | |||
|     db=cfg.getint("Redis_Queues", "db"), | ||||
|     decode_responses=True) | ||||
| 
 | ||||
| r_cache = redis.StrictRedis( | ||||
|     host=cfg.get("Redis_Cache", "host"), | ||||
|     port=cfg.getint("Redis_Cache", "port"), | ||||
|     db=cfg.getint("Redis_Cache", "db"), | ||||
|     decode_responses=True) | ||||
| 
 | ||||
| r_serv_log = redis.StrictRedis( | ||||
|     host=cfg.get("Redis_Log", "host"), | ||||
|     port=cfg.getint("Redis_Log", "port"), | ||||
|  |  | |||
|  | @ -19,6 +19,7 @@ import Flask_config | |||
| app = Flask_config.app | ||||
| cfg = Flask_config.cfg | ||||
| baseUrl = Flask_config.baseUrl | ||||
| r_cache = Flask_config.r_cache | ||||
| r_serv_onion = Flask_config.r_serv_onion | ||||
| r_serv_metadata = Flask_config.r_serv_metadata | ||||
| bootstrap_label = Flask_config.bootstrap_label | ||||
|  | @ -102,8 +103,22 @@ def hiddenServices_page(): | |||
|             metadata_onion['status_icon'] = 'fa-times-circle' | ||||
|         list_onion.append(metadata_onion) | ||||
| 
 | ||||
|     crawler_metadata=[] | ||||
|     all_onion_crawler = r_cache.smembers('all_crawler:onion') | ||||
|     for crawler in all_onion_crawler: | ||||
|         crawling_domain = r_cache.hget('metadata_crawler:{}'.format(splash_port), 'crawling_domain') | ||||
|         started_time = r_cache.hget('metadata_crawler:{}'.format(splash_port), 'started_time') | ||||
|         status_info = r_cache.hget('metadata_crawler:{}'.format(splash_port), 'status') | ||||
|         crawler_info = '{}  - {}'.format(crawler, started_time) | ||||
|         if status_info=='Waiting' or status_info=='Crawling': | ||||
|             status=True | ||||
|         else: | ||||
|             status=False | ||||
|         crawler_metadata.append({'crawler_info': crawler, 'crawling_domain': crawling_domain, 'status_info': status_info, 'status': status}) | ||||
| 
 | ||||
|     date_string = '{}-{}-{}'.format(date[0:4], date[4:6], date[6:8]) | ||||
|     return render_template("hiddenServices.html", last_onions=list_onion, statDomains=statDomains, date_from=date_string, date_to=date_string) | ||||
|     return render_template("hiddenServices.html", last_onions=list_onion, statDomains=statDomains, | ||||
|                             crawler_metadata=crawler_metadata, date_from=date_string, date_to=date_string) | ||||
| 
 | ||||
| @hiddenServices.route("/hiddenServices/last_crawled_domains_with_stats_json", methods=['GET']) | ||||
| def last_crawled_domains_with_stats_json(): | ||||
|  |  | |||
|  | @ -142,7 +142,6 @@ | |||
| 							</div> | ||||
| 						</div> | ||||
| 
 | ||||
| 
 | ||||
| 						<div class="panel panel-info"> | ||||
|       				<div class="panel-heading"> | ||||
|                 <i class="fa fa-eye-slash"></i> Domains Crawled Today | ||||
|  | @ -203,6 +202,33 @@ | |||
|                 </tbody> | ||||
|               </table> | ||||
|             </div> | ||||
| 
 | ||||
| 						{%if crawler_metadata%} | ||||
| 							<div class="panel panel-info"> | ||||
| 	      				<div class="panel-heading"> | ||||
| 	                Crawlers Status | ||||
| 	      				</div> | ||||
| 
 | ||||
| 	              <table class="table table-hover table-striped"> | ||||
| 	                <tbody> | ||||
| 										{% for crawler in crawler_metadata %} | ||||
| 		                  <tr> | ||||
| 			                   <td> | ||||
| 													<i class="fa fa-{%if crawler['status']%}check{%else%}times{%endif%}-circle fa-2x" style="color:{%if crawler['status']%}Green{%else%}Red{%endif%};"></i> {{crawler['crawler_info']}} | ||||
| 												</td> | ||||
| 												<td> | ||||
| 													{{crawler['crawling_domain']}} | ||||
| 												</td> | ||||
| 												<td style="color:{%if crawler['status']%}Green{%else%}Red{%endif%};"> | ||||
| 													{{crawler['status_info']}} | ||||
| 												</td> | ||||
| 		                  </tr> | ||||
| 										{% endfor %} | ||||
| 	                </tbody> | ||||
| 	              </table> | ||||
| 	            </div> | ||||
| 						{%endif%} | ||||
| 
 | ||||
| 					</div> | ||||
| 
 | ||||
| 				</div> | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	 Terrtia
						Terrtia