AIL-framework/update/v2.7/Update_domain_tags.py

128 lines
4.1 KiB
Python
Executable File

#!/usr/bin/env python3
# -*-coding:UTF-8 -*
import os
import sys
import time
from pyfaup.faup import Faup
sys.path.append(os.environ['AIL_BIN'])
from packages import Date
from lib import ConfigLoader
def sanitize_domain(domain):
faup.decode(domain)
domain_sanitized = faup.get()
domain_sanitized = domain_sanitized['domain']
try:
domain_sanitized = domain_sanitized.decode()
except:
pass
return domain_sanitized.lower()
def get_all_obj_tags(obj_type):
return list(r_serv_tags.smembers(f'list_tags:{obj_type}'))
def add_global_tag(tag, object_type=None):
r_serv_tags.sadd('list_tags', tag)
if object_type:
r_serv_tags.sadd('list_tags:{}'.format(object_type), tag)
def get_obj_tag(object_id):
res = r_serv_metadata.smembers('tag:{}'.format(object_id))
if res:
return list(res)
else:
return []
def delete_domain_tag_daterange():
all_domains_tags = get_all_obj_tags('domain')
nb_updated = 0
nb_to_update = len(all_domains_tags)
if nb_to_update == 0:
nb_to_update = 1
refresh_time = time.time()
l_dates = Date.substract_date('20191008', Date.get_today_date_str())
for tag in all_domains_tags:
for date_day in l_dates:
r_serv_tags.delete('domain:{}:{}'.format(tag, date_day))
nb_updated += 1
refresh_time = update_progress(refresh_time, nb_updated, nb_to_update)
r_serv_db.delete('ail:update_v2.7:deletetagrange')
def update_domain_tags(domain):
domain_sanitized = sanitize_domain(domain)
if domain != domain_sanitized:
r_serv_onion.sadd('incorrect_domain', domain)
domain = domain_sanitized
domain_tags = get_obj_tag(domain)
for tag in domain_tags:
# delete incorrect tags
if tag == 'infoleak:submission="crawler"' or tag == 'infoleak:submission="manual"':
r_serv_metadata.srem('tag:{}'.format(domain), tag)
else:
add_global_tag(tag, object_type='domain')
r_serv_tags.sadd('{}:{}'.format('domain', tag), domain)
def update_progress(refresh_time, nb_updated, nb_elem_to_update):
if time.time() - refresh_time > 10:
progress = int((nb_updated * 100) / nb_elem_to_update)
print('{}/{} updated {}%'.format(nb_updated, nb_elem_to_update, progress))
r_serv_db.set('ail:current_background_script_stat', progress)
refresh_time = time.time()
return refresh_time
def update_db():
nb_updated = 0
nb_to_update = r_serv_onion.scard('domain_update_v2.7')
refresh_time = time.time()
r_serv_db.set('ail:current_background_script_stat', 0)
r_serv_db.set('ail:current_background_script', 'domain tags update')
domain = r_serv_onion.spop('domain_update_v2.7')
while domain is not None:
update_domain_tags(domain)
nb_updated += 1
refresh_time = update_progress(refresh_time, nb_updated, nb_to_update)
domain = r_serv_onion.spop('domain_update_v2.7')
if r_serv_db.exists('ail:update_v2.7:deletetagrange'):
r_serv_db.set('ail:current_background_script_stat', 0)
r_serv_db.set('ail:current_background_script', 'tags: remove deprecated keys')
delete_domain_tag_daterange()
# sort all crawled domain
r_serv_onion.sort('full_onion_up', alpha=True)
r_serv_onion.sort('full_regular_up', alpha=True)
if __name__ == '__main__':
start_deb = time.time()
faup = Faup()
config_loader = ConfigLoader.ConfigLoader()
r_serv_db = config_loader.get_redis_conn("ARDB_DB")
r_serv_tags = config_loader.get_redis_conn("ARDB_Tags")
r_serv_onion = config_loader.get_redis_conn("ARDB_Onion")
r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
config_loader = None
update_version = 'v2.7'
r_serv_db.set('ail:update_in_progress', update_version)
r_serv_db.set('ail:current_background_update', update_version)
r_serv_db.set('ail:current_background_script_stat', 0)
r_serv_db.set('ail:current_background_script', 'tags update')
update_db()
r_serv_db.set('ail:current_background_script_stat', 100)
end = time.time()
print('ALL domains tags updated in {} s'.format(end - start_deb))