mirror of https://github.com/CIRCL/AIL-framework

153 lines · 6.1 KiB · Python · Executable File

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
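
# Background update script for the AIL v1.5 update (the 'onions' step).
# It rewrites the onion crawler history stored in ARDB_Onion: old 'onion_history:*'
# lists are converted into 'crawler_history_onion:<domain>:80' sorted sets keyed by
# epoch, and the 'onion_metadata:*' hashes are refreshed. Progress is exposed
# through the 'ail:current_background_script*' keys.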

import os
import sys
import time
import redis
import datetime
import configparser

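# Build the inclusive list of YYYYMMDD strings between two dates.
# Example: substract_date('20180929', '20181001') -> ['20180929', '20180930', '20181001']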
def substract_date(date_from, date_to):
    date_from = datetime.date(int(date_from[0:4]), int(date_from[4:6]), int(date_from[6:8]))
    date_to = datetime.date(int(date_to[0:4]), int(date_to[4:6]), int(date_to[6:8]))
    delta = date_to - date_from # timedelta
    l_date = []
    for i in range(delta.days + 1):
        date = date_from + datetime.timedelta(i)
        l_date.append(date.strftime('%Y%m%d'))
    return l_date

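# Convert a YYYYMMDD string to a Unix timestamp (midnight, local time).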
def get_date_epoch(date):
    return int(datetime.datetime(int(date[0:4]), int(date[4:6]), int(date[6:8])).timestamp())

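# Return the child item of 'item_father' whose path contains 'domain' (the crawled
# domain root). Paths stored with the absolute PASTES_FOLDER prefix are rewritten
# in place to relative paths.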
def get_domain_root_from_paste_childrens(item_father, domain):
    item_children = r_serv_metadata.smembers('paste_children:{}'.format(item_father))
    domain_root = ''
    for item_path in item_children:
        # remove absolute_path
        if PASTES_FOLDER in item_path:
            r_serv_metadata.srem('paste_children:{}'.format(item_father), item_path)
            item_path = item_path.replace(PASTES_FOLDER, '', 1)
            r_serv_metadata.sadd('paste_children:{}'.format(item_father), item_path)
        if domain in item_path:
            domain_root = item_path
    return domain_root


if __name__ == '__main__':

    start_deb = time.time()

    configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
    if not os.path.exists(configfile):
        raise Exception('Unable to find the configuration file. '
                        'Did you set environment variables? '
                        'Or activate the virtualenv.')
    cfg = configparser.ConfigParser()
    cfg.read(configfile)

    PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes")) + '/'

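    # ARDB back-ends used below (accessed with the redis client): core DB for the
    # progress keys, paste metadata, tags and onion/crawler data.
    # Note: r_serv_tag is opened here but never used in this script.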
    r_serv = redis.StrictRedis(
        host=cfg.get("ARDB_DB", "host"),
        port=cfg.getint("ARDB_DB", "port"),
        db=cfg.getint("ARDB_DB", "db"),
        decode_responses=True)

    r_serv_metadata = redis.StrictRedis(
        host=cfg.get("ARDB_Metadata", "host"),
        port=cfg.getint("ARDB_Metadata", "port"),
        db=cfg.getint("ARDB_Metadata", "db"),
        decode_responses=True)

    r_serv_tag = redis.StrictRedis(
        host=cfg.get("ARDB_Tags", "host"),
        port=cfg.getint("ARDB_Tags", "port"),
        db=cfg.getint("ARDB_Tags", "db"),
        decode_responses=True)

    r_serv_onion = redis.StrictRedis(
        host=cfg.get("ARDB_Onion", "host"),
        port=cfg.getint("ARDB_Onion", "port"),
        db=cfg.getint("ARDB_Onion", "db"),
        decode_responses=True)

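    # Register this script as the currently running background update and reset
    # its progress counter (presumably read by the AIL update/monitoring process).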
    r_serv.set('ail:current_background_script', 'onions')
    r_serv.set('ail:current_background_script_stat', 0)

    ## Update Onion ##
    print('Updating ARDB_Onion ...')
    index = 0
    start = time.time()

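    # First pass: for every day since 2018-09-29, drop the per-date history of
    # onion domains that were flagged down and never seen up.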
    # clean down domain from db
    date_from = '20180929'
    date_today = datetime.date.today().strftime("%Y%m%d")
    for date in substract_date(date_from, date_today):

        onion_down = r_serv_onion.smembers('onion_down:{}'.format(date))
        #print(onion_down)
        for onion_domain in onion_down:
            if not r_serv_onion.sismember('full_onion_up', onion_domain):
                # delete history
                all_onion_history = r_serv_onion.lrange('onion_history:{}'.format(onion_domain), 0, -1)
                if all_onion_history:
                    for date_history in all_onion_history:
                        #print('onion_history:{}:{}'.format(onion_domain, date_history))
                        r_serv_onion.delete('onion_history:{}:{}'.format(onion_domain, date_history))
                    r_serv_onion.delete('onion_history:{}'.format(onion_domain))

    # stats
    total_domain = r_serv_onion.scard('full_onion_up')
    nb_updated = 0
    last_progress = 0

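    # Second pass: for every domain currently up, convert the old list-based
    # 'onion_history:*' keys into the new 'crawler_history_onion:<domain>:80'
    # sorted set (scored by epoch), point 'paste_parent' at the domain root item,
    # and refresh the 'onion_metadata:*' hash (fixed port 80, drop 'last_seen').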
    # clean up domain
    all_domain_up = r_serv_onion.smembers('full_onion_up')
    for onion_domain in all_domain_up:
        # delete history
        all_onion_history = r_serv_onion.lrange('onion_history:{}'.format(onion_domain), 0, -1)
        if all_onion_history:
            for date_history in all_onion_history:
                print('--------')
                print('onion_history:{}:{}'.format(onion_domain, date_history))
                item_father = r_serv_onion.lrange('onion_history:{}:{}'.format(onion_domain, date_history), 0, 0)
                print('item_father: {}'.format(item_father))
                try:
                    item_father = item_father[0]
                except IndexError:
                    r_serv_onion.delete('onion_history:{}:{}'.format(onion_domain, date_history))
                    continue
                #print(item_father)
                # delete old history
                r_serv_onion.delete('onion_history:{}:{}'.format(onion_domain, date_history))
                # create new history
                root_key = get_domain_root_from_paste_childrens(item_father, onion_domain)
                if root_key:
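                    # redis-py 2.x positional form: zadd(key, score, member);
                    # redis-py >= 3.0 would expect a mapping instead.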
                    r_serv_onion.zadd('crawler_history_onion:{}:80'.format(onion_domain), get_date_epoch(date_history), root_key)
                    print('crawler_history_onion:{}:80   {}   {}'.format(onion_domain, get_date_epoch(date_history), root_key))
                    #update service metadata: paste_parent
                    r_serv_onion.hset('onion_metadata:{}'.format(onion_domain), 'paste_parent', root_key)

            r_serv_onion.delete('onion_history:{}'.format(onion_domain))

        r_serv_onion.hset('onion_metadata:{}'.format(onion_domain), 'ports', '80')
        r_serv_onion.hdel('onion_metadata:{}'.format(onion_domain), 'last_seen')

        nb_updated += 1
        progress = int((nb_updated * 100) / total_domain)
        print('{}/{}    updated    {}%'.format(nb_updated, total_domain, progress))
        # update progress stats
        if progress != last_progress:
            r_serv.set('ail:current_background_script_stat', progress)
            last_progress = progress


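    # Note: 'index' is never incremented above, so the path count printed below stays at 0.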
    end = time.time()
    print('Updating ARDB_Onion Done => {} paths: {} s'.format(index, end - start))
    print()
    print('Done in {} s'.format(end - start_deb))

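    # Flag the 'onions' step of the v1.5 update as completed.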
    r_serv.sadd('ail:update_v1.5', 'onions')