AIL-framework/update/v1.5/Update-ARDB_Onions.py

131 lines
5.3 KiB
Python
Raw Normal View History

2019-04-08 17:04:09 +02:00
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
import os
import sys
import time
import redis
import datetime
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
import ConfigLoader
2019-04-08 17:04:09 +02:00
def substract_date(date_from, date_to):
date_from = datetime.date(int(date_from[0:4]), int(date_from[4:6]), int(date_from[6:8]))
date_to = datetime.date(int(date_to[0:4]), int(date_to[4:6]), int(date_to[6:8]))
delta = date_to - date_from # timedelta
l_date = []
for i in range(delta.days + 1):
date = date_from + datetime.timedelta(i)
l_date.append( date.strftime('%Y%m%d') )
return l_date
def get_date_epoch(date):
return int(datetime.datetime(int(date[0:4]), int(date[4:6]), int(date[6:8])).timestamp())
def get_domain_root_from_paste_childrens(item_father, domain):
item_children = r_serv_metadata.smembers('paste_children:{}'.format(item_father))
domain_root = ''
for item_path in item_children:
# remove absolute_path
if PASTES_FOLDER in item_path:
r_serv_metadata.srem('paste_children:{}'.format(item_father), item_path)
2019-04-08 17:04:09 +02:00
item_path = item_path.replace(PASTES_FOLDER, '', 1)
r_serv_metadata.sadd('paste_children:{}'.format(item_father), item_path)
2019-04-08 17:04:09 +02:00
if domain in item_path:
domain_root = item_path
return domain_root
if __name__ == '__main__':
start_deb = time.time()
config_loader = ConfigLoader.ConfigLoader()
2019-04-08 17:04:09 +02:00
PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], config_loader.get_config_str("Directories", "pastes")) + '/'
2019-04-08 17:04:09 +02:00
r_serv = config_loader.get_redis_conn("ARDB_DB")
r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
r_serv_tag = config_loader.get_redis_conn("ARDB_Tags")
r_serv_onion = config_loader.get_redis_conn("ARDB_Onion")
config_loader = None
2019-04-08 17:04:09 +02:00
r_serv.set('ail:current_background_script', 'onions')
r_serv.set('ail:current_background_script_stat', 0)
2019-04-08 17:04:09 +02:00
## Update Onion ##
print('Updating ARDB_Onion ...')
index = 0
start = time.time()
# clean down domain from db
date_from = '20180929'
date_today = datetime.date.today().strftime("%Y%m%d")
for date in substract_date(date_from, date_today):
onion_down = r_serv_onion.smembers('onion_down:{}'.format(date))
#print(onion_down)
for onion_domain in onion_down:
if not r_serv_onion.sismember('full_onion_up', onion_domain):
# delete history
all_onion_history = r_serv_onion.lrange('onion_history:{}'.format(onion_domain), 0 ,-1)
if all_onion_history:
for date_history in all_onion_history:
#print('onion_history:{}:{}'.format(onion_domain, date_history))
2019-04-10 17:47:40 +02:00
r_serv_onion.delete('onion_history:{}:{}'.format(onion_domain, date_history))
r_serv_onion.delete('onion_history:{}'.format(onion_domain))
2019-04-08 17:04:09 +02:00
#stats
total_domain = r_serv_onion.scard('full_onion_up')
nb_updated = 0
last_progress = 0
2019-04-08 17:04:09 +02:00
# clean up domain
all_domain_up = r_serv_onion.smembers('full_onion_up')
for onion_domain in all_domain_up:
# delete history
all_onion_history = r_serv_onion.lrange('onion_history:{}'.format(onion_domain), 0 ,-1)
if all_onion_history:
for date_history in all_onion_history:
print('--------')
print('onion_history:{}:{}'.format(onion_domain, date_history))
item_father = r_serv_onion.lrange('onion_history:{}:{}'.format(onion_domain, date_history), 0, 0)
print('item_father: {}'.format(item_father))
2019-04-12 17:32:17 +02:00
try:
item_father = item_father[0]
except IndexError:
r_serv_onion.delete('onion_history:{}:{}'.format(onion_domain, date_history))
continue
2019-04-08 17:04:09 +02:00
#print(item_father)
# delete old history
2019-04-10 17:47:40 +02:00
r_serv_onion.delete('onion_history:{}:{}'.format(onion_domain, date_history))
2019-04-08 17:04:09 +02:00
# create new history
root_key = get_domain_root_from_paste_childrens(item_father, onion_domain)
if root_key:
2019-04-10 17:47:40 +02:00
r_serv_onion.zadd('crawler_history_onion:{}:80'.format(onion_domain), get_date_epoch(date_history), root_key)
2019-04-08 17:04:09 +02:00
print('crawler_history_onion:{}:80 {} {}'.format(onion_domain, get_date_epoch(date_history), root_key))
#update service metadata: paste_parent
2019-04-10 17:47:40 +02:00
r_serv_onion.hset('onion_metadata:{}'.format(onion_domain), 'paste_parent', root_key)
2019-04-08 17:04:09 +02:00
2019-04-10 17:47:40 +02:00
r_serv_onion.delete('onion_history:{}'.format(onion_domain))
2019-04-08 17:04:09 +02:00
2019-04-10 17:47:40 +02:00
r_serv_onion.hset('onion_metadata:{}'.format(onion_domain), 'ports', '80')
r_serv_onion.hdel('onion_metadata:{}'.format(onion_domain), 'last_seen')
2019-04-08 17:04:09 +02:00
nb_updated += 1
progress = int((nb_updated * 100) /total_domain)
print('{}/{} updated {}%'.format(nb_updated, total_domain, progress))
# update progress stats
if progress != last_progress:
r_serv.set('ail:current_background_script_stat', progress)
last_progress = progress
2019-04-08 17:04:09 +02:00
end = time.time()
print('Updating ARDB_Onion Done => {} paths: {} s'.format(index, end - start))
print()
print('Done in {} s'.format(end - start_deb))
r_serv.sadd('ail:update_v1.5', 'onions')