mirror of https://github.com/CIRCL/AIL-framework
96 lines
3.1 KiB
Python
Executable File
96 lines
3.1 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
# -*-coding:UTF-8 -*
|
|
|
|
import os
|
|
import sys
|
|
import time
|
|
import redis
|
|
import datetime
|
|
|
|
from hashlib import sha256
|
|
|
|
from pyfaup.faup import Faup
|
|
|
|
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/'))
|
|
import Item
|
|
|
|
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
|
|
import ConfigLoader
|
|
|
|
def get_all_item(screenshot_sha256):
    """Return every member of the Redis set 'screenshot:<screenshot_sha256>'.

    The set is read from the ``r_serv_onion`` connection; ``update_db``
    treats each member as an item id.
    """
    redis_key = 'screenshot:{}'.format(screenshot_sha256)
    return r_serv_onion.smembers(redis_key)
|
|
|
|
def sanitize_domain(domain):
    """Return the lower-cased 'domain' field that Faup extracts from *domain*.

    The value is parsed with the module-level ``faup`` instance. Depending on
    what Faup returns, the field may be ``bytes`` or ``str``; bytes are
    decoded on a best-effort basis before lower-casing.

    :param domain: domain (or URL) string to normalise
    :return: the lower-cased domain reported by Faup
    """
    faup.decode(domain)
    domain_sanitized = faup.get()['domain']
    try:
        domain_sanitized = domain_sanitized.decode()
    except (AttributeError, UnicodeDecodeError):
        # AttributeError: the value is already a str (no .decode()).
        # UnicodeDecodeError: the bytes are not valid UTF-8; keep them as-is.
        # Only these two are ignored so that KeyboardInterrupt / SystemExit
        # are no longer swallowed, as the former bare ``except:`` did.
        pass
    return domain_sanitized.lower()
|
|
|
|
def update_db(screenshot_sha256):
    """Link a screenshot to the domain of every item that references it.

    For each item returned by ``get_all_item`` the item's domain is looked up
    and sanitized, then two Redis sets are updated:
    'domain_screenshot:<domain>' and 'screenshot_domain:<sha256>'.
    A domain that differs from its sanitized form is recorded in
    'incorrect_domain'. A screenshot with no items at all is recorded in
    'broken_screenshot'.

    :param screenshot_sha256: sha256 identifier of the screenshot
    """
    linked_items = get_all_item(screenshot_sha256)
    if not linked_items:
        # broken screenshot
        r_serv_onion.sadd('broken_screenshot', screenshot_sha256)
        return

    root_prefix = PASTES_FOLDER + '/'
    screenshot_key = 'screenshot_domain:{}'.format(screenshot_sha256)

    for raw_item_id in linked_items:
        # remove root path
        item_id = raw_item_id.replace(root_prefix, '', 1)

        raw_domain = Item.get_domain(item_id)
        domain = sanitize_domain(raw_domain)
        if raw_domain != domain:
            # keep a trace of the unsanitized value, then use the clean one
            r_serv_onion.sadd('incorrect_domain', raw_domain)

        r_serv_onion.sadd('domain_screenshot:{}'.format(domain), screenshot_sha256)
        r_serv_onion.sadd(screenshot_key, domain)
|
|
|
|
|
|
if __name__ == '__main__':
    # Background update script: walk the screenshot folder and (re)build the
    # screenshot <-> domain sets for every file found.

    start_deb = time.time()

    # Shared Faup parser, read as a global by sanitize_domain().
    faup = Faup()

    config_loader = ConfigLoader.ConfigLoader()

    ail_home = os.environ['AIL_HOME']
    PASTES_FOLDER = os.path.join(
        ail_home, config_loader.get_config_str("Directories", "pastes"))
    SCREENSHOT_FOLDER = os.path.join(
        ail_home,
        config_loader.get_config_str("Directories", "crawled_screenshot"),
        'screenshot')

    r_serv_db = config_loader.get_redis_conn("ARDB_DB")
    r_serv_onion = config_loader.get_redis_conn("ARDB_Onion")
    # The loader is not needed once the connections exist.
    config_loader = None

    # Publish the update state.
    r_serv_db.set('ail:update_in_progress', 'v2.6')
    r_serv_db.set('ail:current_background_update', 'v2.6')

    r_serv_db.set('ail:current_background_script_stat', 20)
    r_serv_db.set('ail:current_background_script', 'screenshot update')

    nb = 0

    if os.path.isdir(SCREENSHOT_FOLDER):
        for root, _dirs, files in os.walk(SCREENSHOT_FOLDER, topdown=False):
            for name in files:
                nb += 1

                # Turn the file path into the screenshot identifier:
                # drop the last 4 characters (remove .png), drop the folder
                # prefix, then drop every '/' left in the relative path.
                full_path = os.path.join(root, name)
                relative_stem = full_path[:-4].replace(SCREENSHOT_FOLDER, '', 1)
                screenshot_sha256 = relative_stem.replace('/', '')

                update_db(screenshot_sha256)

                # Refresh the progress message every 1000 screenshots.
                if nb % 1000 == 0:
                    r_serv_db.set('ail:current_background_script', 'screenshot updated: {}'.format(nb))

    r_serv_db.set('ail:current_background_script_stat', 100)

    end = time.time()
    print('ALL screenshot updated: {} in {} s'.format(nb, end - start_deb))
|