chg: [update DB] add migration script

pull/342/head
Terrtia 2019-04-08 17:04:09 +02:00
parent f6d7d2ae16
commit 0519b4a437
No known key found for this signature in database
GPG Key ID: 1E1B1F50D84613D0
4 changed files with 506 additions and 1 deletions

View File

@ -67,12 +67,32 @@ Redis and ARDB overview
| | father | **item father** |
| | domain | **crawled domain**:**domain port** |
##### Set:
| Key | Field |
| ------ | ------ |
| tag:**item path** | **tag** |
| | |
| paste_children:**item path** | **item path** |
| | |
| hash_paste:**item path** | **hash** |
| base64_paste:**item path** | **hash** |
| hexadecimal_paste:**item path** | **hash** |
| binary_paste:**item path** | **hash** |
##### Zset:
| Key | Field | Value |
| ------ | ------ | ------ |
| nb_seen_hash:**hash** | **item** | **nb_seen** |
| base64_hash:**hash** | **item** | **nb_seen** |
| binary_hash:**hash** | **item** | **nb_seen** |
| hexadecimal_hash:**hash** | **item** | **nb_seen** |
## DB9 - Crawler:
##### Hset:
| Key | Field | Value |
| ------ | ------ | ------ |
| **service type**:**domain** | first_seen | **date** |
| **service type**_metadata:**domain** | first_seen | **date** |
| | last_check | **date** |
| | ports | **port**;**port**;**port** ... |
| | paste_parent | **parent last crawling (can be auto or manual)** |

View File

@ -0,0 +1,228 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
import os
import sys
import time
import redis
import configparser
def update_hash_item(has_type):
    """Strip the absolute pastes folder from the per-item hash keys.

    Reads the members of the tag set
    ``infoleak:automatic-detection="<has_type>":20190307`` and, for every
    item path containing ``PASTES_FOLDER``, renames the keys
    ``<has_type>_paste:<item path>`` and ``hash_paste:<item path>`` so that
    the first occurrence of ``PASTES_FOLDER`` is removed from the key name.

    Relies on the module-level ``r_serv_tag``, ``r_serv_metadata`` and
    ``PASTES_FOLDER`` created by the script entry point.

    :param has_type: decoder name used in the tag and as key prefix
        (the script calls it with 'base64', 'binary' and 'hexadecimal').
    :return: number of keys that were actually renamed.
    """
    nb_renamed = 0

    #get all hash items:
    #all_base64 = r_serv_tag.smembers('infoleak:automatic-detection=\"{}\"'.format(has_type))
    all_hash_items = r_serv_tag.smembers('infoleak:automatic-detection=\"{}\":20190307'.format(has_type))

    for item_path in all_hash_items:
        # items already stored with a relative path need no migration
        if PASTES_FOLDER not in item_path:
            continue

        old_keys = (
            '{}_paste:{}'.format(has_type, item_path),
            'hash_paste:{}'.format(item_path),
        )
        for old_key in old_keys:
            ## TODO: catch error
            # the source key must exist before RENAMENX is attempted
            if not r_serv_metadata.exists(old_key):
                continue

            # RENAMENX refuses to overwrite an already existing relative key
            if r_serv_metadata.renamenx(old_key, old_key.replace(PASTES_FOLDER, '', 1)):
                nb_renamed += 1
            else:
                ## TODO: key merge
                print('same key, double name: {}'.format(item_path))

    return nb_renamed
if __name__ == '__main__':
    # Entry point of the ARDB_Metadata migration: loads the AIL configuration
    # (AIL_BIN and AIL_HOME must be set in the environment), opens the ARDB
    # connections used as module-level globals by update_hash_item(), then
    # renames the keys that still embed the absolute pastes folder.

    start_deb = time.time()

    configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
    if not os.path.exists(configfile):
        raise Exception('Unable to find the configuration file. '
                        'Did you set environment variables? '
                        'Or activate the virtualenv.')

    cfg = configparser.ConfigParser()
    cfg.read(configfile)

    # absolute prefix (with trailing slash) to strip from stored item paths
    PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes")) + '/'

    r_serv_metadata = redis.StrictRedis(
        host=cfg.get("ARDB_Metadata", "host"),
        port=cfg.getint("ARDB_Metadata", "port"),
        db=cfg.getint("ARDB_Metadata", "db"),
        decode_responses=True)

    r_serv_tag = redis.StrictRedis(
        host=cfg.get("ARDB_Tags", "host"),
        port=cfg.getint("ARDB_Tags", "port"),
        db=cfg.getint("ARDB_Tags", "db"),
        decode_responses=True)

    r_serv_onion = redis.StrictRedis(
        host=cfg.get("ARDB_Onion", "host"),
        port=cfg.getint("ARDB_Onion", "port"),
        db=cfg.getint("ARDB_Onion", "db"),
        decode_responses=True)

    ## Update metadata ##
    print('Updating ARDB_Metadata ...')
    # number of keys renamed, reported in the final summary
    index = 0
    start = time.time()

    # `or 0` keeps the counter valid if update_hash_item() returns nothing

    # Update base64
    index += update_hash_item('base64') or 0
    # Update binary
    index += update_hash_item('binary') or 0
    # Update hexadecimal
    index += update_hash_item('hexadecimal') or 0

    # Update onion metadata
    #all_crawled_items = r_serv_tag.smembers('infoleak:submission=\"crawler\"')
    all_crawled_items = r_serv_tag.smembers('infoleak:submission=\"crawler\":20190227')
    for item_path in all_crawled_items:
        if PASTES_FOLDER in item_path:
            item_metadata = 'paste_metadata:{}'.format(item_path)
            # RENAME fails when the source key is missing (e.g. already
            # migrated), which would abort the whole run: check it first
            if r_serv_metadata.exists(item_metadata):
                r_serv_metadata.rename(item_metadata, item_metadata.replace(PASTES_FOLDER, '', 1))
                index = index + 1

    ######################################################################################################################
    # Disabled migration steps: the string literal below is never executed.
    # Its content is kept verbatim.
    ######################################################################################################################
    '''
string_keys_to_rename = ['misp_events:{}*'.format(PASTES_FOLDER), 'hive_cases:{}*'.format(PASTES_FOLDER)]
for key_to_rename in string_keys_to_rename:
keys_to_rename = []
for key in r_serv_metadata.scan_iter(key_to_rename):
new_key = key.replace(PASTES_FOLDER, '', 1)
keys_to_rename.append( (key, new_key) )
index = index + 1
for key, new_key in keys_to_rename:
r_serv_metadata.rename(key, new_key)
keys_to_rename = None
set_keys_to_rename = ['tag:{}*'.format(PASTES_FOLDER), 'paste_regular_external_links:{}*'.format(PASTES_FOLDER), 'paste_onion_external_links:{}*'.format(PASTES_FOLDER), 'paste_children:{}*'.format(PASTES_FOLDER)]
for key_to_rename in set_keys_to_rename:
keys_to_remove = []
keys_to_rename = []
for key in r_serv_metadata.scan_iter(key_to_rename):
new_key = key.replace(PASTES_FOLDER, '', 1)
# a set with this key already exist
if r_serv_metadata.exists(new_key):
# save data
for new_key_value in r_serv_metadata.smembers(key):
r_serv_metadata.sadd(new_key, new_key_value)
keys_to_remove.append(key)
else:
keys_to_rename.append( (key, new_key) )
index = index + 1
for key in keys_to_remove:
r_serv_metadata.delete(key)
for key, new_key in keys_to_rename:
r_serv_metadata.rename(key, new_key)
keys_to_remove = None
keys_to_rename = None
zset_keys_to_rename = ['nb_seen_hash:*', 'base64_hash:*', 'binary_hash:*']
for key_to_rename in zset_keys_to_rename:
keys_to_remove = []
zkeys_to_remove = []
keys_to_add = []
for key in r_serv_metadata.scan_iter(key_to_rename):
temp = []
for zset_key, value in r_serv_metadata.zscan_iter(key, '*{}*'.format(PASTES_FOLDER)):
new_key = zset_key.replace(PASTES_FOLDER, '', 1)
index = index +1
temp.append((key, zset_key))
keys_to_add.append((key, new_key, value))
if 0 < len(temp) < r_serv_metadata.zcard(key):
zkeys_to_remove.extend(temp)
else:
keys_to_remove.append(key)
for key in keys_to_remove:
r_serv_metadata.delete(key)
for key, zset_key in zkeys_to_remove:
r_serv_metadata.zrem(key, zset_key)
for key, new_key, value in keys_to_add:
r_serv_metadata.zincrby(key, new_key, int(value))
keys_to_remove = None
zkeys_to_remove = None
keys_to_add = None
set_keys_to_rename = ['paste_children:*']
for key_to_rename in set_keys_to_rename:
keys_to_remove = []
skeys_to_remove = []
keys_to_add = []
for key in r_serv_metadata.scan_iter(key_to_rename):
temp = []
for set_key in r_serv_metadata.sscan_iter(key, '*{}*'.format(PASTES_FOLDER)):
new_key = set_key.replace(PASTES_FOLDER, '', 1)
index = index +1
temp.append((key, set_key))
keys_to_add.append((key, new_key))
if 0 < len(temp) < r_serv_metadata.scard(key):
skeys_to_remove.extend(temp)
else:
keys_to_remove.append(key)
for key in keys_to_remove:
r_serv_metadata.delete(key)
for key, set_key in skeys_to_remove:
r_serv_metadata.srem(key, set_key)
for key, new_key in keys_to_add:
r_serv_metadata.sadd(key, new_key)
keys_to_remove = None
skeys_to_remove = None
keys_to_add = None
hset_keys_to_rename = ['paste_metadata:{}*'.format(PASTES_FOLDER)]
for key_to_rename in hset_keys_to_rename:
keys_to_rename = []
for key in r_serv_metadata.scan_iter(key_to_rename):
new_key = key.replace(PASTES_FOLDER, '', 1)
# a hset with this key already exist
if r_serv_metadata.exists(new_key):
pass
else:
keys_to_rename.append((key, new_key))
index = index + 1
for key, new_key in keys_to_rename:
r_serv_metadata.rename(key, new_key)
keys_to_rename = None
# to verify 120/100 try with scan
hset_keys_to_rename = ['paste_metadata:*']
for key_to_rename in hset_keys_to_rename:
for key in r_serv_metadata.scan_iter(key_to_rename):
father = r_serv_metadata.hget(key, 'father')
super_father = r_serv_metadata.hget(key, 'super_father')
if father:
if PASTES_FOLDER in father:
index = index + 1
r_serv_metadata.hdel(key, 'father')
r_serv_metadata.hset(key, 'father', father.replace(PASTES_FOLDER, '', 1))
if super_father:
if PASTES_FOLDER in super_father:
index = index + 1
r_serv_metadata.hdel(key, 'super_father')
r_serv_metadata.hset(key, 'super_father', super_father.replace(PASTES_FOLDER, '', 1))
keys_to_rename = None
'''

    end = time.time()

    print('Updating ARDB_Metadata Done => {} paths: {} s'.format(index, end - start))
    print()

136
update/v1.4/Update-ARDB_Onions.py Executable file
View File

@ -0,0 +1,136 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
import os
import sys
import time
import redis
import datetime
import configparser
def substract_date(date_from, date_to):
    """List every day between two dates, both ends included.

    :param date_from: first day, as a 'YYYYMMDD' string.
    :param date_to: last day, as a 'YYYYMMDD' string.
    :return: list of 'YYYYMMDD' strings in chronological order; empty when
        date_to is earlier than date_from.
    """
    first_day = datetime.date(int(date_from[0:4]), int(date_from[4:6]), int(date_from[6:8]))
    last_day = datetime.date(int(date_to[0:4]), int(date_to[4:6]), int(date_to[6:8]))
    # +1 so that the last day itself is part of the result
    nb_days = (last_day - first_day).days + 1
    return [
        (first_day + datetime.timedelta(offset)).strftime('%Y%m%d')
        for offset in range(nb_days)
    ]
def get_date_epoch(date):
    """Return the integer Unix timestamp of midnight for a 'YYYYMMDD' string.

    The datetime is built without tzinfo, so ``timestamp()`` interprets it
    as local time.
    """
    year = int(date[0:4])
    month = int(date[4:6])
    day = int(date[6:8])
    midnight = datetime.datetime(year, month, day)
    return int(midnight.timestamp())
def get_domain_root_from_paste_childrens(item_father, domain):
    """Find a child of item_father whose relative path contains domain.

    Reads the set ``paste_children:<item_father>`` from the module-level
    ``r_serv_metadata`` and strips the first occurrence of ``PASTES_FOLDER``
    from each member before testing it.

    :param item_father: item path used to build the ``paste_children:`` key.
    :param domain: substring looked for in each child path.
    :return: the last matching child path met while iterating the set, or ''
        when no child matches.
    """
    children = r_serv_metadata.smembers('paste_children:{}'.format(item_father))
    root_item = ''
    for child in children:
        # remove absolute_path (str.replace leaves relative paths untouched)
        #r_serv_metadata.srem('paste_children:{}'.format(item_father), item_path)
        relative_child = child.replace(PASTES_FOLDER, '', 1)
        #r_serv_metadata.sadd('paste_children:{}'.format(item_father), item_path)
        if domain in relative_child:
            root_item = relative_child
    return root_item
if __name__ == '__main__':
    # Entry point of the ARDB_Onion migration: loads the AIL configuration
    # (AIL_BIN and AIL_HOME must be set in the environment), opens the ARDB
    # connections used as module-level globals by the helpers above, then
    # walks the onion history. Every write is currently commented out, so
    # the script only reads from the DB and prints what it found.

    start_deb = time.time()

    configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
    if not os.path.exists(configfile):
        raise Exception('Unable to find the configuration file. '
                        'Did you set environment variables? '
                        'Or activate the virtualenv.')

    cfg = configparser.ConfigParser()
    cfg.read(configfile)

    # absolute prefix (with trailing slash) to strip from stored item paths
    PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes")) + '/'

    r_serv_metadata = redis.StrictRedis(
        host=cfg.get("ARDB_Metadata", "host"),
        port=cfg.getint("ARDB_Metadata", "port"),
        db=cfg.getint("ARDB_Metadata", "db"),
        decode_responses=True)

    r_serv_tag = redis.StrictRedis(
        host=cfg.get("ARDB_Tags", "host"),
        port=cfg.getint("ARDB_Tags", "port"),
        db=cfg.getint("ARDB_Tags", "db"),
        decode_responses=True)

    r_serv_onion = redis.StrictRedis(
        host=cfg.get("ARDB_Onion", "host"),
        port=cfg.getint("ARDB_Onion", "port"),
        db=cfg.getint("ARDB_Onion", "db"),
        decode_responses=True)

    ## Update Onion ##
    print('Updating ARDB_Onion ...')
    index = 0
    start = time.time()

    # clean down domain from db
    date_from = '20180929'
    date_today = datetime.date.today().strftime("%Y%m%d")
    for date in substract_date(date_from, date_today):

        onion_down = r_serv_onion.smembers('onion_down:{}'.format(date))
        #print(onion_down)
        for onion_domain in onion_down:
            # only domains that were never seen up are candidates
            if not r_serv_onion.sismember('full_onion_up', onion_domain):
                # delete history
                all_onion_history = r_serv_onion.lrange('onion_history:{}'.format(onion_domain), 0 ,-1)
                if all_onion_history:
                    for date_history in all_onion_history:
                        pass
                        #print('onion_history:{}:{}'.format(onion_domain, date_history))
                        #r_serv_onion.delete('onion_history:{}:{}'.format(onion_domain, date_history))
                    #r_serv_onion.delete('onion_history:{}'.format(onion_domain))

    # clean up domain
    all_domain_up = r_serv_onion.smembers('full_onion_up')
    for onion_domain in all_domain_up:
        # delete history
        all_onion_history = r_serv_onion.lrange('onion_history:{}'.format(onion_domain), 0 ,-1)
        if all_onion_history:
            for date_history in all_onion_history:
                print('--------')
                print('onion_history:{}:{}'.format(onion_domain, date_history))
                #item_father = r_serv_onion.lpop('onion_history:{}:{}'.format(onion_domain, date_history))
                item_father = r_serv_onion.lrange('onion_history:{}:{}'.format(onion_domain, date_history), 0, 0)
                print('item_father: {}'.format(item_father))
                # LRANGE returns an empty list when the per-date history is
                # missing: there is no item to derive a root from, skip it
                if not item_father:
                    continue
                item_father = item_father[0]
                #print(item_father)
                # delete old history
                #r_serv_onion.delete('onion_history:{}:{}'.format(onion_domain, date_history))
                # create new history
                root_key = get_domain_root_from_paste_childrens(item_father, onion_domain)
                if root_key:
                    #r_serv_onion.zadd('crawler_history_onion:{}:80'.format(onion_domain), get_date_epoch(date_history), root_key)
                    print('crawler_history_onion:{}:80 {} {}'.format(onion_domain, get_date_epoch(date_history), root_key))
                    #update service metadata: paste_parent
                    #r_serv_onion.hset('onion_metadata:{}'.format(onion_domain), 'paste_parent', root_key)

            #r_serv_onion.delete('onion_history:{}'.format(onion_domain))

        #r_serv_onion.hset('onion_metadata:{}'.format(onion_domain), 'ports', '80')
        #r_serv_onion.hdel('onion_metadata:{}'.format(onion_domain), 'last_seen')

    # Disabled migration step: the string literal below is never executed.
    # Its content is kept verbatim.
    '''
for elem in r_serv_onion.smembers('onion_crawler_queue'):
if PASTES_FOLDER in elem:
r_serv_onion.srem('onion_crawler_queue', elem)
r_serv_onion.sadd('onion_crawler_queue', elem.replace(PASTES_FOLDER, '', 1))
index = index +1
'''

    end = time.time()
    print('Updating ARDB_Onion Done => {} paths: {} s'.format(index, end - start))
    print()
    print('Done in {} s'.format(end - start_deb))

121
update/v1.4/Update-ARDB_Tags.py Executable file
View File

@ -0,0 +1,121 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
import os
import sys
import time
import redis
import configparser
if __name__ == '__main__':
    # Entry point of the ARDB_Tags migration: loads the AIL configuration
    # (AIL_BIN and AIL_HOME must be set in the environment), then moves every
    # tagged item from the global set `<tag>` to the daily set `<tag>:<date>`
    # using its path stripped of the absolute pastes folder, while keeping
    # the tag's first_seen / last_seen metadata and daily counters up to date.

    start_deb = time.time()

    configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
    if not os.path.exists(configfile):
        raise Exception('Unable to find the configuration file. '
                        'Did you set environment variables? '
                        'Or activate the virtualenv.')

    cfg = configparser.ConfigParser()
    cfg.read(configfile)

    # absolute prefix (with trailing slash) to strip from stored item paths
    PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes")) + '/'

    r_serv_metadata = redis.StrictRedis(
        host=cfg.get("ARDB_Metadata", "host"),
        port=cfg.getint("ARDB_Metadata", "port"),
        db=cfg.getint("ARDB_Metadata", "db"),
        decode_responses=True)

    r_serv_tag = redis.StrictRedis(
        host=cfg.get("ARDB_Tags", "host"),
        port=cfg.getint("ARDB_Tags", "port"),
        db=cfg.getint("ARDB_Tags", "db"),
        decode_responses=True)

    r_serv_onion = redis.StrictRedis(
        host=cfg.get("ARDB_Onion", "host"),
        port=cfg.getint("ARDB_Onion", "port"),
        db=cfg.getint("ARDB_Onion", "db"),
        decode_responses=True)

    r_important_paste_2018 = redis.StrictRedis(
        host=cfg.get("ARDB_Metadata", "host"),
        port=cfg.getint("ARDB_Metadata", "port"),
        db=2018,
        decode_responses=True)

    # one DB per year: this one must target 2019, not 2018 again
    r_important_paste_2019 = redis.StrictRedis(
        host=cfg.get("ARDB_Metadata", "host"),
        port=cfg.getint("ARDB_Metadata", "port"),
        db=2019,
        decode_responses=True)

    print('Updating ARDB_Tags ...')
    # number of items moved, reported in the final summary
    index = 0
    start = time.time()

    tags_list = r_serv_tag.smembers('list_tags')
    # create temp tags metadata
    tag_metadata = {}
    for tag in tags_list:
        first_seen = r_serv_tag.hget('tag_metadata:{}'.format(tag), 'first_seen')
        last_seen = r_serv_tag.hget('tag_metadata:{}'.format(tag), 'last_seen')
        # sentinels guarantee that the first item date replaces a missing value
        tag_metadata[tag] = {
            'first_seen': 99999999 if first_seen is None else int(first_seen),
            'last_seen': 0 if last_seen is None else int(last_seen),
        }

    for tag in tags_list:

        all_item = r_serv_tag.smembers(tag)
        for item_path in all_item:
            splitted_item_path = item_path.split('/')
            #print(tag)
            #print(item_path)
            # the date is read from the three path components preceding the
            # file name (.../YYYY/MM/DD/<file>)
            try:
                item_date = int( ''.join([splitted_item_path[-4], splitted_item_path[-3], splitted_item_path[-2]]) )
            except (IndexError, ValueError):
                # leave the item where it is instead of aborting the migration
                print('Unable to extract the date, item skipped: {}'.format(item_path))
                continue

            # remove absolute path
            new_path = item_path.replace(PASTES_FOLDER, '', 1)
            if new_path != item_path:
                # save in queue absolute path to remove
                r_serv_tag.sadd('maj:v1.5:absolute_path_to_rename', item_path)

            # update metadata first_seen
            if item_date < tag_metadata[tag]['first_seen']:
                tag_metadata[tag]['first_seen'] = item_date
                r_serv_tag.hset('tag_metadata:{}'.format(tag), 'first_seen', item_date)

            # update metadata last_seen
            if item_date > tag_metadata[tag]['last_seen']:
                tag_metadata[tag]['last_seen'] = item_date
                r_serv_tag.hset('tag_metadata:{}'.format(tag), 'last_seen', item_date)

            r_serv_tag.sadd('{}:{}'.format(tag, item_date), new_path)
            r_serv_tag.hincrby('daily_tags:{}'.format(item_date), tag, 1)

            # clean db
            r_serv_tag.srem(tag, item_path)
            index = index + 1

    # flush the "browse important pastes" DBs
    r_important_paste_2018.flushdb()
    r_important_paste_2019.flushdb()

    end = time.time()

    print('Updating ARDB_Tags Done => {} paths: {} s'.format(index, end - start))