mirror of https://github.com/CIRCL/AIL-framework
				
				
				
			
		
			
				
	
	
		
			96 lines
		
	
	
		
			3.1 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
			
		
		
	
	
			96 lines
		
	
	
		
			3.1 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
| #!/usr/bin/env python3
 | |
| # -*-coding:UTF-8 -*
 | |
| 
 | |
| import os
 | |
| import sys
 | |
| import time
 | |
| import redis
 | |
| import datetime
 | |
| 
 | |
| from hashlib import sha256
 | |
| 
 | |
| from pyfaup.faup import Faup
 | |
| 
 | |
| sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/'))
 | |
| import Item
 | |
| 
 | |
| sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
 | |
| import ConfigLoader
 | |
| 
 | |
def get_all_item(screenshot_sha256):
    """Return the members of the ``screenshot:<sha256>`` set.

    The set is read from the module-level ``r_serv_onion`` connection,
    which is created in the ``__main__`` section of this script.
    """
    redis_key = 'screenshot:{}'.format(screenshot_sha256)
    linked_items = r_serv_onion.smembers(redis_key)
    return linked_items
 | |
| 
 | |
def sanitize_domain(domain):
    """Return the lowercased ``domain`` field that faup extracts from *domain*.

    The value is parsed with the module-level ``faup`` instance (created in
    the ``__main__`` section of this script) and its ``'domain'`` entry is
    returned in lower case.
    """
    faup.decode(domain)
    domain_sanitized = faup.get()['domain']
    try:
        # Presumably some pyfaup versions return bytes and others str;
        # decoding is best-effort so both are accepted (TODO confirm).
        domain_sanitized = domain_sanitized.decode()
    except (AttributeError, UnicodeDecodeError):
        # AttributeError: the value has no .decode() (e.g. it is already str).
        # UnicodeDecodeError: undecodable bytes are kept as-is, as before.
        # Anything else (KeyboardInterrupt, SystemExit, real bugs) now
        # propagates instead of being silently swallowed by a bare except.
        pass
    return domain_sanitized.lower()
 | |
| 
 | |
def update_db(screenshot_sha256):
    """Link one screenshot hash to the domain of every item that references it.

    For each item returned by ``get_all_item`` the item's domain is looked up
    and sanitized, then both the ``domain_screenshot:<domain>`` and
    ``screenshot_domain:<sha256>`` sets are updated in ``r_serv_onion``.
    A domain that differs from its sanitized form is also recorded in the
    ``incorrect_domain`` set. A screenshot with no items is recorded in the
    ``broken_screenshot`` set.
    """
    linked_items = get_all_item(screenshot_sha256)

    if not linked_items:
        # broken screenshot
        r_serv_onion.sadd('broken_screenshot', screenshot_sha256)
        return

    root_prefix = PASTES_FOLDER + '/'
    for raw_item_id in linked_items:
        relative_item_id = raw_item_id.replace(root_prefix, '', 1)  # remove root path
        original_domain = Item.get_domain(relative_item_id)
        clean_domain = sanitize_domain(original_domain)

        if clean_domain != original_domain:
            # Remember the raw value; the sanitized one is used from here on.
            r_serv_onion.sadd('incorrect_domain', original_domain)

        r_serv_onion.sadd('domain_screenshot:{}'.format(clean_domain), screenshot_sha256)
        r_serv_onion.sadd('screenshot_domain:{}'.format(screenshot_sha256), clean_domain)
 | |
| 
 | |
| 
 | |
if __name__ == '__main__':
    # Background update script: walks the screenshot folder and calls
    # update_db() for every file found, reporting progress through the
    # 'ail:*' status keys in r_serv_db.

    start_deb = time.time()

    # faup, PASTES_FOLDER and r_serv_onion are read as globals by the
    # helper functions defined above.
    faup = Faup()
    config_loader = ConfigLoader.ConfigLoader()

    ail_home = os.environ['AIL_HOME']
    PASTES_FOLDER = os.path.join(ail_home, config_loader.get_config_str("Directories", "pastes"))
    SCREENSHOT_FOLDER = os.path.join(
        ail_home,
        config_loader.get_config_str("Directories", "crawled_screenshot"),
        'screenshot',
    )

    r_serv_db = config_loader.get_redis_conn("ARDB_DB")
    r_serv_onion = config_loader.get_redis_conn("ARDB_Onion")
    config_loader = None

    # Announce the running update; the write order matches the original script.
    for status_key, status_value in (
        ('ail:update_in_progress', 'v2.6'),
        ('ail:current_background_update', 'v2.6'),
        ('ail:current_background_script_stat', 20),
        ('ail:current_background_script', 'screenshot update'),
    ):
        r_serv_db.set(status_key, status_value)

    nb = 0

    if os.path.isdir(SCREENSHOT_FOLDER):
        for current_dir, _subdirs, filenames in os.walk(SCREENSHOT_FOLDER, topdown=False):
            for filename in filenames:
                nb += 1
                # Rebuild the hash from the file path: drop the last four
                # characters (remove .png), strip the folder prefix, then
                # remove every '/' separator.
                # NOTE(review): assumes every file ends in '.png' and that the
                # hash is split across sub-directories -- confirm.
                screenshot_sha256 = (
                    os.path.join(current_dir, filename)[:-4]
                    .replace(SCREENSHOT_FOLDER, '', 1)
                    .replace('/', '')
                )
                update_db(screenshot_sha256)

                # Publish a progress message every 1000 screenshots.
                if nb % 1000 == 0:
                    r_serv_db.set('ail:current_background_script', 'screenshot updated: {}'.format(nb))

    r_serv_db.set('ail:current_background_script_stat', 100)

    end = time.time()
    print('ALL screenshot updated: {} in {} s'.format(nb, end - start_deb))
 |