From 0519b4a4370e075baff9e713f900eeef2c50075b Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Mon, 8 Apr 2019 17:04:09 +0200
Subject: [PATCH] chg: [update DB] add migration script

---
 OVERVIEW.md                         |  22 ++-
 update/v1.4/Update-ARDB_Metadata.py | 228 ++++++++++++++++++++++++++++
 update/v1.4/Update-ARDB_Onions.py   | 136 +++++++++++++++++
 update/v1.4/Update-ARDB_Tags.py     | 121 +++++++++++++++
 4 files changed, 506 insertions(+), 1 deletion(-)
 create mode 100755 update/v1.4/Update-ARDB_Metadata.py
 create mode 100755 update/v1.4/Update-ARDB_Onions.py
 create mode 100755 update/v1.4/Update-ARDB_Tags.py

diff --git a/OVERVIEW.md b/OVERVIEW.md
index f389a085..7524bea7 100644
--- a/OVERVIEW.md
+++ b/OVERVIEW.md
@@ -67,12 +67,32 @@ Redis and ARDB overview
 | | father | **item father** |
 | | domain | **crawled domain**:**domain port** |
 
+##### Set:
+| Key | Field |
+| ------ | ------ |
+| tag:**item path** | **tag** |
+| | |
+| paste_children:**item path** | **item path** |
+| | |
+| hash_paste:**item path** | **hash** |
+| base64_paste:**item path** | **hash** |
+| hexadecimal_paste:**item path** | **hash** |
+| binary_paste:**item path** | **hash** |
+
+##### Zset:
+| Key | Field | Value |
+| ------ | ------ | ------ |
+| nb_seen_hash:**hash** | **item** | **nb_seen** |
+| base64_hash:**hash** | **item** | **nb_seen** |
+| binary_hash:**hash** | **item** | **nb_seen** |
+| hexadecimal_hash:**hash** | **item** | **nb_seen** |
+
 ## DB9 - Crawler:
 
 ##### Hset:
 | Key | Field | Value |
 | ------ | ------ | ------ |
-| **service type**:**domain** | first_seen | **date** |
+| **service type**_metadata:**domain** | first_seen | **date** |
 | | last_check | **date** |
 | | ports | **port**;**port**;**port** ... |
 | | paste_parent | **parent last crawling (can be auto or manual)** |
diff --git a/update/v1.4/Update-ARDB_Metadata.py b/update/v1.4/Update-ARDB_Metadata.py
new file mode 100755
index 00000000..2e7fbb3d
--- /dev/null
+++ b/update/v1.4/Update-ARDB_Metadata.py
@@ -0,0 +1,228 @@
+#!/usr/bin/env python3
+# -*-coding:UTF-8 -*
+
+import os
+import sys
+import time
+import redis
+import configparser
+
+
+def update_hash_item(has_type):
+    #get all hash items:
+    #all_base64 = r_serv_tag.smembers('infoleak:automatic-detection=\"{}\"'.format(has_type))
+    all_hash_items = r_serv_tag.smembers('infoleak:automatic-detection=\"{}\":20190307'.format(has_type))
+    for item_path in all_hash_items:
+        if PASTES_FOLDER in item_path:
+            base64_key = '{}_paste:{}'.format(has_type, item_path)
+            hash_key = 'hash_paste:{}'.format(item_path)
+
+            ## TODO: catch error
+            if r_serv_metadata.exists(base64_key):
+                res = r_serv_metadata.renamenx(base64_key, base64_key.replace(PASTES_FOLDER, '', 1))
+                ## TODO: key merge
+                if not res:
+                    print('same key, double name: {}'.format(item_path))
+
+            if r_serv_metadata.exists(hash_key):
+                ## TODO: catch error
+                res = r_serv_metadata.renamenx(hash_key, hash_key.replace(PASTES_FOLDER, '', 1))
+                ## TODO: key merge
+                if not res:
+                    print('same key, double name: {}'.format(item_path))
+
+if __name__ == '__main__':
+
+    start_deb = time.time()
+
+    configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
+    if not os.path.exists(configfile):
+        raise Exception('Unable to find the configuration file. \
+                        Did you set environment variables? \
+                        Or activate the virtualenv.')
+    cfg = configparser.ConfigParser()
+    cfg.read(configfile)
+
+    PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes")) + '/'
+
+    r_serv_metadata = redis.StrictRedis(
+        host=cfg.get("ARDB_Metadata", "host"),
+        port=cfg.getint("ARDB_Metadata", "port"),
+        db=cfg.getint("ARDB_Metadata", "db"),
+        decode_responses=True)
+
+    r_serv_tag = redis.StrictRedis(
+        host=cfg.get("ARDB_Tags", "host"),
+        port=cfg.getint("ARDB_Tags", "port"),
+        db=cfg.getint("ARDB_Tags", "db"),
+        decode_responses=True)
+
+    r_serv_onion = redis.StrictRedis(
+        host=cfg.get("ARDB_Onion", "host"),
+        port=cfg.getint("ARDB_Onion", "port"),
+        db=cfg.getint("ARDB_Onion", "db"),
+        decode_responses=True)
+
+    ## Update metadata ##
+    print('Updating ARDB_Metadata ...')
+    index = 0
+    start = time.time()
+
+    # Update base64
+    update_hash_item('base64')
+    # Update binary
+    update_hash_item('binary')
+    # Update hexadecimal
+    update_hash_item('hexadecimal')
+
+    # Update onion metadata
+    #all_crawled_items = r_serv_tag.smembers('infoleak:submission=\"crawler\"')
+    all_crawled_items = r_serv_tag.smembers('infoleak:submission=\"crawler\":20190227')
+    for item_path in all_crawled_items:
+        if PASTES_FOLDER in item_path:
+            item_metadata = 'paste_metadata:{}'.format(item_path)
+            ## TODO: catch error
+            r_serv_metadata.rename(item_metadata, item_metadata.replace(PASTES_FOLDER, '', 1))
+
+    ######################################################################################################################
+    ######################################################################################################################
+    ######################################################################################################################
+    ######################################################################################################################
+    ######################################################################################################################
+    ######################################################################################################################
+    '''
+
+    string_keys_to_rename = ['misp_events:{}*'.format(PASTES_FOLDER), 'hive_cases:{}*'.format(PASTES_FOLDER)]
+    for key_to_rename in string_keys_to_rename:
+
+        keys_to_rename = []
+        for key in r_serv_metadata.scan_iter(key_to_rename):
+            new_key = key.replace(PASTES_FOLDER, '', 1)
+            keys_to_rename.append( (key, new_key) )
+            index = index + 1
+        for key, new_key in keys_to_rename:
+            r_serv_metadata.rename(key, new_key)
+
+        keys_to_rename = None
+
+    set_keys_to_rename = ['tag:{}*'.format(PASTES_FOLDER), 'paste_regular_external_links:{}*'.format(PASTES_FOLDER), 'paste_onion_external_links:{}*'.format(PASTES_FOLDER), 'paste_children:{}*'.format(PASTES_FOLDER)]
+    for key_to_rename in set_keys_to_rename:
+
+        keys_to_remove = []
+        keys_to_rename = []
+        for key in r_serv_metadata.scan_iter(key_to_rename):
+            new_key = key.replace(PASTES_FOLDER, '', 1)
+            # a set with this key already exist
+            if r_serv_metadata.exists(new_key):
+                # save data
+                for new_key_value in r_serv_metadata.smembers(key):
+                    r_serv_metadata.sadd(new_key, new_key_value)
+                keys_to_remove.append(key)
+            else:
+                keys_to_rename.append( (key, new_key) )
+                index = index + 1
+        for key in keys_to_remove:
+            r_serv_metadata.delete(key)
+        for key, new_key in keys_to_rename:
+            r_serv_metadata.rename(key, new_key)
+
+        keys_to_remove = None
+        keys_to_rename = None
+
+
+    zset_keys_to_rename = ['nb_seen_hash:*', 'base64_hash:*', 'binary_hash:*']
+    for key_to_rename in zset_keys_to_rename:
+
+        keys_to_remove = []
+        zkeys_to_remove = []
+        keys_to_add = []
+        for key in r_serv_metadata.scan_iter(key_to_rename):
+            temp = []
+            for zset_key, value in r_serv_metadata.zscan_iter(key, '*{}*'.format(PASTES_FOLDER)):
+                new_key = zset_key.replace(PASTES_FOLDER, '', 1)
+                index = index +1
+                temp.append((key, zset_key))
+                keys_to_add.append((key, new_key, value))
+            if 0 < len(temp) < r_serv_metadata.zcard(key):
+                zkeys_to_remove.extend(temp)
+            else:
+                keys_to_remove.append(key)
+        for key in keys_to_remove:
+            r_serv_metadata.delete(key)
+        for key, zset_key in zkeys_to_remove:
+            r_serv_metadata.zrem(key, zset_key)
+        for key, new_key, value in keys_to_add:
+            r_serv_metadata.zincrby(key, new_key, int(value))
+        keys_to_remove = None
+        zkeys_to_remove = None
+        keys_to_add = None
+
+    set_keys_to_rename = ['paste_children:*']
+    for key_to_rename in set_keys_to_rename:
+        keys_to_remove = []
+        skeys_to_remove = []
+        keys_to_add = []
+        for key in r_serv_metadata.scan_iter(key_to_rename):
+            temp = []
+            for set_key in r_serv_metadata.sscan_iter(key, '*{}*'.format(PASTES_FOLDER)):
+                new_key = set_key.replace(PASTES_FOLDER, '', 1)
+                index = index +1
+                temp.append((key, set_key))
+                keys_to_add.append((key, new_key))
+            if 0 < len(temp) < r_serv_metadata.scard(key):
+                skeys_to_remove.extend(temp)
+            else:
+                keys_to_remove.append(key)
+        for key in keys_to_remove:
+            r_serv_metadata.delete(key)
+        for key, set_key in skeys_to_remove:
+            r_serv_metadata.srem(key, set_key)
+        for key, new_key in keys_to_add:
+            r_serv_metadata.sadd(key, new_key)
+        keys_to_remove = None
+        skeys_to_remove = None
+        keys_to_add = None
+
+    hset_keys_to_rename = ['paste_metadata:{}*'.format(PASTES_FOLDER)]
+    for key_to_rename in hset_keys_to_rename:
+
+        keys_to_rename = []
+        for key in r_serv_metadata.scan_iter(key_to_rename):
+            new_key = key.replace(PASTES_FOLDER, '', 1)
+            # a hset with this key already exist
+            if r_serv_metadata.exists(new_key):
+                pass
+            else:
+                keys_to_rename.append((key, new_key))
+                index = index + 1
+        for key, new_key in keys_to_rename:
+            r_serv_metadata.rename(key, new_key)
+        keys_to_rename = None
+
+    # to verify 120/100 try with scan
+    hset_keys_to_rename = ['paste_metadata:*']
+    for key_to_rename in hset_keys_to_rename:
+        for key in r_serv_metadata.scan_iter(key_to_rename):
+            father = r_serv_metadata.hget(key, 'father')
+            super_father = r_serv_metadata.hget(key, 'super_father')
+
+            if father:
+                if PASTES_FOLDER in father:
+                    index = index + 1
+                    r_serv_metadata.hdel(key, 'father')
+                    r_serv_metadata.hset(key, 'father', father.replace(PASTES_FOLDER, '', 1))
+
+            if super_father:
+                if PASTES_FOLDER in super_father:
+                    index = index + 1
+                    r_serv_metadata.hdel(key, 'super_father')
+                    r_serv_metadata.hset(key, 'super_father', super_father.replace(PASTES_FOLDER, '', 1))
+
+        keys_to_rename = None
+    '''
+
+
+    end = time.time()
+
+    print('Updating ARDB_Metadata Done => {} paths: {} s'.format(index, end - start))
+    print()
diff --git a/update/v1.4/Update-ARDB_Onions.py b/update/v1.4/Update-ARDB_Onions.py
new file mode 100755
index 00000000..0e72dc6f
--- /dev/null
+++ b/update/v1.4/Update-ARDB_Onions.py
@@ -0,0 +1,136 @@
+#!/usr/bin/env python3
+# -*-coding:UTF-8 -*
+
+import os
+import sys
+import time
+import redis
+import datetime
+import configparser
+
+def substract_date(date_from, date_to):
+    date_from = datetime.date(int(date_from[0:4]), int(date_from[4:6]), int(date_from[6:8]))
+    date_to = datetime.date(int(date_to[0:4]), int(date_to[4:6]), int(date_to[6:8]))
+    delta = date_to - date_from # timedelta
+    l_date = []
+    for i in range(delta.days + 1):
+        date = date_from + datetime.timedelta(i)
+        l_date.append( date.strftime('%Y%m%d') )
+    return l_date
+
+def get_date_epoch(date):
+    return int(datetime.datetime(int(date[0:4]), int(date[4:6]), int(date[6:8])).timestamp())
+
+def get_domain_root_from_paste_childrens(item_father, domain):
+    item_children = r_serv_metadata.smembers('paste_children:{}'.format(item_father))
+    domain_root = ''
+    for item_path in item_children:
+        # remove absolute_path
+        if PASTES_FOLDER in item_path:
+            #r_serv_metadata.srem('paste_children:{}'.format(item_father), item_path)
+            item_path = item_path.replace(PASTES_FOLDER, '', 1)
+            #r_serv_metadata.sadd('paste_children:{}'.format(item_father), item_path)
+        if domain in item_path:
+            domain_root = item_path
+    return domain_root
+
+
+if __name__ == '__main__':
+
+    start_deb = time.time()
+
+    configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
+    if not os.path.exists(configfile):
+        raise Exception('Unable to find the configuration file. \
+                        Did you set environment variables? \
+                        Or activate the virtualenv.')
+    cfg = configparser.ConfigParser()
+    cfg.read(configfile)
+
+    PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes")) + '/'
+
+    r_serv_metadata = redis.StrictRedis(
+        host=cfg.get("ARDB_Metadata", "host"),
+        port=cfg.getint("ARDB_Metadata", "port"),
+        db=cfg.getint("ARDB_Metadata", "db"),
+        decode_responses=True)
+
+    r_serv_tag = redis.StrictRedis(
+        host=cfg.get("ARDB_Tags", "host"),
+        port=cfg.getint("ARDB_Tags", "port"),
+        db=cfg.getint("ARDB_Tags", "db"),
+        decode_responses=True)
+
+    r_serv_onion = redis.StrictRedis(
+        host=cfg.get("ARDB_Onion", "host"),
+        port=cfg.getint("ARDB_Onion", "port"),
+        db=cfg.getint("ARDB_Onion", "db"),
+        decode_responses=True)
+
+    ## Update Onion ##
+    print('Updating ARDB_Onion ...')
+    index = 0
+    start = time.time()
+
+    # clean down domain from db
+    date_from = '20180929'
+    date_today = datetime.date.today().strftime("%Y%m%d")
+    for date in substract_date(date_from, date_today):
+
+        onion_down = r_serv_onion.smembers('onion_down:{}'.format(date))
+        #print(onion_down)
+        for onion_domain in onion_down:
+            if not r_serv_onion.sismember('full_onion_up', onion_domain):
+                # delete history
+                all_onion_history = r_serv_onion.lrange('onion_history:{}'.format(onion_domain), 0 ,-1)
+                if all_onion_history:
+                    for date_history in all_onion_history:
+                        pass
+                        #print('onion_history:{}:{}'.format(onion_domain, date_history))
+                        #r_serv_onion.delete('onion_history:{}:{}'.format(onion_domain, date_history))
+                    #r_serv_onion.delete('onion_history:{}'.format(onion_domain))
+
+    # clean up domain
+    all_domain_up = r_serv_onion.smembers('full_onion_up')
+    for onion_domain in all_domain_up:
+        # delete history
+        all_onion_history = r_serv_onion.lrange('onion_history:{}'.format(onion_domain), 0 ,-1)
+        if all_onion_history:
+            for date_history in all_onion_history:
+                print('--------')
+                print('onion_history:{}:{}'.format(onion_domain, date_history))
+                #item_father = r_serv_onion.lpop('onion_history:{}:{}'.format(onion_domain, date_history))
+                item_father = r_serv_onion.lrange('onion_history:{}:{}'.format(onion_domain, date_history), 0, 0)
+                print('item_father: {}'.format(item_father))
+                item_father = item_father[0]
+                #print(item_father)
+                # delete old history
+                #r_serv_onion.delete('onion_history:{}:{}'.format(onion_domain, date_history))
+                # create new history
+                root_key = get_domain_root_from_paste_childrens(item_father, onion_domain)
+                if root_key:
+                    #r_serv_onion.zadd('crawler_history_onion:{}:80'.format(onion_domain), get_date_epoch(date_history), root_key)
+                    print('crawler_history_onion:{}:80 {} {}'.format(onion_domain, get_date_epoch(date_history), root_key))
+                    #update service metadata: paste_parent
+                    #r_serv_onion.hset('onion_metadata:{}'.format(onion_domain), 'paste_parent', root_key)
+
+            #r_serv_onion.delete('onion_history:{}'.format(onion_domain))
+
+        #r_serv_onion.hset('onion_metadata:{}'.format(onion_domain), 'ports', '80')
+        #r_serv_onion.hdel('onion_metadata:{}'.format(onion_domain), 'last_seen')
+
+
+    '''
+    for elem in r_serv_onion.smembers('onion_crawler_queue'):
+        if PASTES_FOLDER in elem:
+            r_serv_onion.srem('onion_crawler_queue', elem)
+            r_serv_onion.sadd('onion_crawler_queue', elem.replace(PASTES_FOLDER, '', 1))
+            index = index +1
+
+    '''
+
+
+    end = time.time()
+    print('Updating ARDB_Onion Done => {} paths: {} s'.format(index, end - start))
+    print()
+    print('Done in {} s'.format(end - start_deb))
diff --git a/update/v1.4/Update-ARDB_Tags.py b/update/v1.4/Update-ARDB_Tags.py
new file mode 100755
index 00000000..4327b9d0
--- /dev/null
+++ b/update/v1.4/Update-ARDB_Tags.py
@@ -0,0 +1,121 @@
+#!/usr/bin/env python3
+# -*-coding:UTF-8 -*
+
+import os
+import sys
+import time
+import redis
+import configparser
+
+if __name__ == '__main__':
+
+    start_deb = time.time()
+
+    configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
+    if not os.path.exists(configfile):
+        raise Exception('Unable to find the configuration file. \
+                        Did you set environment variables? \
+                        Or activate the virtualenv.')
+    cfg = configparser.ConfigParser()
+    cfg.read(configfile)
+
+    PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes")) + '/'
+
+    r_serv_metadata = redis.StrictRedis(
+        host=cfg.get("ARDB_Metadata", "host"),
+        port=cfg.getint("ARDB_Metadata", "port"),
+        db=cfg.getint("ARDB_Metadata", "db"),
+        decode_responses=True)
+
+    r_serv_tag = redis.StrictRedis(
+        host=cfg.get("ARDB_Tags", "host"),
+        port=cfg.getint("ARDB_Tags", "port"),
+        db=cfg.getint("ARDB_Tags", "db"),
+        decode_responses=True)
+
+    r_serv_onion = redis.StrictRedis(
+        host=cfg.get("ARDB_Onion", "host"),
+        port=cfg.getint("ARDB_Onion", "port"),
+        db=cfg.getint("ARDB_Onion", "db"),
+        decode_responses=True)
+
+    r_serv_onion = redis.StrictRedis(
+        host=cfg.get("ARDB_Onion", "host"),
+        port=cfg.getint("ARDB_Onion", "port"),
+        db=cfg.getint("ARDB_Onion", "db"),
+        decode_responses=True)
+
+    r_important_paste_2018 = redis.StrictRedis(
+        host=cfg.get("ARDB_Metadata", "host"),
+        port=cfg.getint("ARDB_Metadata", "port"),
+        db=2018,
+        decode_responses=True)
+
+    r_important_paste_2019 = redis.StrictRedis(
+        host=cfg.get("ARDB_Metadata", "host"),
+        port=cfg.getint("ARDB_Metadata", "port"),
+        db=2019,
+        decode_responses=True)
+
+    print('Updating ARDB_Tags ...')
+    index = 0
+    start = time.time()
+
+    tags_list = r_serv_tag.smembers('list_tags')
+    # create temp tags metadata
+    tag_metadata = {}
+    for tag in tags_list:
+        tag_metadata[tag] = {}
+        tag_metadata[tag]['first_seen'] = r_serv_tag.hget('tag_metadata:{}'.format(tag), 'first_seen')
+        if tag_metadata[tag]['first_seen'] is None:
+            tag_metadata[tag]['first_seen'] = 99999999
+        else:
+            tag_metadata[tag]['first_seen'] = int(tag_metadata[tag]['first_seen'])
+
+        tag_metadata[tag]['last_seen'] = r_serv_tag.hget('tag_metadata:{}'.format(tag), 'last_seen')
+        if tag_metadata[tag]['last_seen'] is None:
+            tag_metadata[tag]['last_seen'] = 0
+        else:
+            tag_metadata[tag]['last_seen'] = int(tag_metadata[tag]['last_seen'])
+
+    for tag in tags_list:
+
+        all_item = r_serv_tag.smembers(tag)
+        for item_path in all_item:
+            splitted_item_path = item_path.split('/')
+            #print(tag)
+            #print(item_path)
+            item_date = int( ''.join([splitted_item_path[-4], splitted_item_path[-3], splitted_item_path[-2]]) )
+
+            # remove absolute path
+            new_path = item_path.replace(PASTES_FOLDER, '', 1)
+            if new_path != item_path:
+                # save in queue absolute path to remove
+                r_serv_tag.sadd('maj:v1.5:absolute_path_to_rename', item_path)
+
+            # update metadata first_seen
+            if item_date < tag_metadata[tag]['first_seen']:
+                tag_metadata[tag]['first_seen'] = item_date
+                r_serv_tag.hset('tag_metadata:{}'.format(tag), 'first_seen', item_date)
+
+            # update metadata last_seen
+            if item_date > tag_metadata[tag]['last_seen']:
+                tag_metadata[tag]['last_seen'] = item_date
+                r_serv_tag.hset('tag_metadata:{}'.format(tag), 'last_seen', item_date)
+
+
+            r_serv_tag.sadd('{}:{}'.format(tag, item_date), new_path)
+            r_serv_tag.hincrby('daily_tags:{}'.format(item_date), tag, 1)
+
+            # clean db
+            r_serv_tag.srem(tag, item_path)
+            index = index + 1
+
+    # flush browse important pastes db
+    r_important_paste_2018.flushdb()
+    r_important_paste_2019.flushdb()
+
+    end = time.time()
+
+
+    print('Updating ARDB_Tags Done => {} paths: {} s'.format(index, end - start))
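
Note (not part of the patch above): the three update scripts share one pattern. They scan ARDB for keys that still embed the absolute PASTES_FOLDER path, rename them to the relative form, and reconcile the data when a key with the new name already exists. The minimal sketch below isolates that pattern for a set-type key; the folder path, port and key pattern are illustrative assumptions rather than values read from the AIL configuration, and only standard redis-py calls (scan_iter, renamenx, smembers, sadd, delete) are used.

    #!/usr/bin/env python3
    # Minimal sketch of the scan -> strip prefix -> rename/merge pattern used by
    # the migration scripts above. The key pattern, folder path and connection
    # parameters are illustrative assumptions, not taken from the patch.
    import redis

    PASTES_FOLDER = '/home/ail/PASTES/'  # assumed absolute prefix to strip

    r = redis.StrictRedis(host='localhost', port=6382, db=0, decode_responses=True)

    def strip_prefix_from_set_keys(pattern):
        # Rename every set key matching `pattern` so it no longer embeds the
        # absolute PASTES_FOLDER prefix; merge members if the target already exists.
        for key in r.scan_iter(pattern):
            new_key = key.replace(PASTES_FOLDER, '', 1)
            if new_key == key:
                continue
            # RENAMENX only succeeds when the target key does not exist yet
            if not r.renamenx(key, new_key):
                # target already exists: merge members, then drop the old key
                for member in r.smembers(key):
                    r.sadd(new_key, member)
                r.delete(key)

    if __name__ == '__main__':
        strip_prefix_from_set_keys('tag:{}*'.format(PASTES_FOLDER))

Using RENAMENX rather than RENAME keeps any data already written under the relative key: on a collision the members are merged instead of overwritten, mirroring the merge done in the commented-out set-rename block of Update-ARDB_Metadata.py and the "## TODO: key merge" left in update_hash_item().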