2019-04-08 17:04:09 +02:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
# -*-coding:UTF-8 -*
|
|
|
|
|
|
|
|
import os
|
|
|
|
import sys
|
|
|
|
import time
|
|
|
|
import redis
|
|
|
|
import datetime
|
2019-11-05 15:18:03 +01:00
|
|
|
|
|
|
|
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
|
|
|
|
import ConfigLoader
|
2019-04-08 17:04:09 +02:00
|
|
|
|
|
|
|
def substract_date(date_from, date_to):
    """Return every day between two dates, both ends included.

    date_from and date_to are 'YYYYMMDD' strings (only the first eight
    characters are read). The result is a list of 'YYYYMMDD' strings in
    chronological order; it is empty when date_to is before date_from.
    """
    first_day = datetime.date(int(date_from[:4]), int(date_from[4:6]), int(date_from[6:8]))
    last_day = datetime.date(int(date_to[:4]), int(date_to[4:6]), int(date_to[6:8]))
    # +1 so that the closing day itself is part of the range
    nb_days = (last_day - first_day).days + 1
    return [
        (first_day + datetime.timedelta(offset)).strftime('%Y%m%d')
        for offset in range(nb_days)
    ]
|
|
|
|
|
|
|
|
def get_date_epoch(date):
    """Convert a 'YYYYMMDD' string into an integer Unix timestamp.

    The timestamp is taken at 00:00:00 of that day on a naive datetime,
    so it is interpreted in the local timezone of the running process.
    """
    year, month, day = int(date[:4]), int(date[4:6]), int(date[6:8])
    midnight = datetime.datetime(year, month, day)
    return int(midnight.timestamp())
|
|
|
|
|
|
|
|
def get_domain_root_from_paste_childrens(item_father, domain):
    """Find the child item of item_father whose path contains domain.

    Reads the 'paste_children:<item_father>' set from r_serv_metadata.
    Every member that still contains PASTES_FOLDER is rewritten in the set
    with the first occurrence of that folder removed. Returns the (possibly
    rewritten) path of the last child containing `domain`, or '' when no
    child matches.

    Relies on the module-level globals r_serv_metadata and PASTES_FOLDER.
    """
    children_key = 'paste_children:{}'.format(item_father)
    root_item = ''
    for child_path in r_serv_metadata.smembers(children_key):
        # remove absolute_path: swap the stored member for its relative form
        if PASTES_FOLDER in child_path:
            relative_path = child_path.replace(PASTES_FOLDER, '', 1)
            r_serv_metadata.srem(children_key, child_path)
            r_serv_metadata.sadd(children_key, relative_path)
            child_path = relative_path
        if domain in child_path:
            root_item = child_path
    return root_item
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # v1.5 migration of the onion data stored in ARDB_Onion:
    #   1. for domains listed as down (and never seen up), delete their
    #      'onion_history' keys;
    #   2. for domains in 'full_onion_up', convert each 'onion_history' entry
    #      into a 'crawler_history_onion:<domain>:80' sorted-set entry and
    #      refresh the domain metadata.
    # Progress is published in ARDB_DB under 'ail:current_background_script*'.

    start_deb = time.time()

    config_loader = ConfigLoader.ConfigLoader()

    # PASTES_FOLDER and r_serv_metadata must stay module-level names:
    # get_domain_root_from_paste_childrens() reads them as globals.
    PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], config_loader.get_config_str("Directories", "pastes")) + '/'

    r_serv = config_loader.get_redis_conn("ARDB_DB")
    r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
    r_serv_tag = config_loader.get_redis_conn("ARDB_Tags")
    r_serv_onion = config_loader.get_redis_conn("ARDB_Onion")
    config_loader = None

    r_serv.set('ail:current_background_script', 'onions')
    r_serv.set('ail:current_background_script_stat', 0)

    ## Update Onion ##
    print('Updating ARDB_Onion ...')
    index = 0  # number of history paths migrated to crawler_history_onion
    start = time.time()

    # clean down domain from db
    date_from = '20180929'
    date_today = datetime.date.today().strftime("%Y%m%d")
    for date in substract_date(date_from, date_today):

        onion_down = r_serv_onion.smembers('onion_down:{}'.format(date))
        for onion_domain in onion_down:
            # domains that were also seen up are handled by the second pass
            if r_serv_onion.sismember('full_onion_up', onion_domain):
                continue
            # delete history
            all_onion_history = r_serv_onion.lrange('onion_history:{}'.format(onion_domain), 0, -1)
            if all_onion_history:
                for date_history in all_onion_history:
                    r_serv_onion.delete('onion_history:{}:{}'.format(onion_domain, date_history))
                r_serv_onion.delete('onion_history:{}'.format(onion_domain))

    # clean up domain
    all_domain_up = r_serv_onion.smembers('full_onion_up')

    # stats: the total is taken from the very snapshot that is iterated, so
    # the percentage below can neither divide by zero nor exceed 100.
    total_domain = len(all_domain_up)
    nb_updated = 0
    last_progress = 0

    for onion_domain in all_domain_up:
        metadata_key = 'onion_metadata:{}'.format(onion_domain)

        # delete history
        all_onion_history = r_serv_onion.lrange('onion_history:{}'.format(onion_domain), 0, -1)
        if all_onion_history:
            for date_history in all_onion_history:
                history_key = 'onion_history:{}:{}'.format(onion_domain, date_history)
                print('--------')
                print(history_key)
                item_father = r_serv_onion.lrange(history_key, 0, 0)
                print('item_father: {}'.format(item_father))

                # delete old history (needed whether or not a father item exists)
                r_serv_onion.delete(history_key)
                if not item_father:
                    continue
                item_father = item_father[0]

                # create new history
                root_key = get_domain_root_from_paste_childrens(item_father, onion_domain)
                if root_key:
                    crawler_key = 'crawler_history_onion:{}:80'.format(onion_domain)
                    epoch = get_date_epoch(date_history)
                    # NOTE(review): positional (score, member) arguments match the
                    # redis-py 2.x StrictRedis.zadd signature; redis-py >= 3.0
                    # expects a {member: score} mapping - confirm the pinned version.
                    r_serv_onion.zadd(crawler_key, epoch, root_key)
                    print('{} {} {}'.format(crawler_key, epoch, root_key))
                    index += 1
                    # update service metadata: paste_parent
                    r_serv_onion.hset(metadata_key, 'paste_parent', root_key)

            r_serv_onion.delete('onion_history:{}'.format(onion_domain))

        r_serv_onion.hset(metadata_key, 'ports', '80')
        r_serv_onion.hdel(metadata_key, 'last_seen')

        nb_updated += 1
        progress = int((nb_updated * 100) / total_domain)
        print('{}/{} updated {}%'.format(nb_updated, total_domain, progress))
        # update progress stats (only written when the integer percentage changes)
        if progress != last_progress:
            r_serv.set('ail:current_background_script_stat', progress)
            last_progress = progress

    end = time.time()
    print('Updating ARDB_Onion Done => {} paths: {} s'.format(index, end - start))
    print()
    print('Done in {} s'.format(end - start_deb))

    # add 'onions' to the set of v1.5 update steps
    r_serv.sadd('ail:update_v1.5', 'onions')
|