diff --git a/bin/Crawler.py b/bin/Crawler.py deleted file mode 100755 index 3ec4922f..00000000 --- a/bin/Crawler.py +++ /dev/null @@ -1,457 +0,0 @@ -#!/usr/bin/env python3 -# -*-coding:UTF-8 -* - -import os -import sys -import re -import uuid -import json -import redis -import datetime -import time -import subprocess -import requests - -from collections import deque -from pyfaup.faup import Faup - -sys.path.append(os.environ['AIL_BIN']) -from Helper import Process -from pubsublogger import publisher - -sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib')) -import crawlers - -# ======== FUNCTIONS ======== - -def load_blacklist(service_type): - try: - with open(os.environ['AIL_BIN']+'/torcrawler/blacklist_{}.txt'.format(service_type), 'r') as f: - redis_crawler.delete('blacklist_{}'.format(service_type)) - lines = f.read().splitlines() - for line in lines: - redis_crawler.sadd('blacklist_{}'.format(service_type), line) - except Exception: - pass - -def update_auto_crawler(): - current_epoch = int(time.time()) - list_to_crawl = redis_crawler.zrangebyscore('crawler_auto_queue', '-inf', current_epoch) - for elem_to_crawl in list_to_crawl: - mess, type = elem_to_crawl.rsplit(';', 1) - redis_crawler.sadd('{}_crawler_priority_queue'.format(type), mess) - redis_crawler.zrem('crawler_auto_queue', elem_to_crawl) - -# Extract info form url (url, domain, domain url, ...) -def unpack_url(url): - to_crawl = {} - faup.decode(url) - url_unpack = faup.get() - # # FIXME: # TODO: remove me - try: - to_crawl['domain'] = url_unpack['domain'].decode() - except: - to_crawl['domain'] = url_unpack['domain'] - to_crawl['domain'] = to_crawl['domain'].lower() - - - # force lower case domain/subdomain (rfc4343) - # # FIXME: # TODO: remove me - try: - url_host = url_unpack['host'].decode() - except: - url_host = url_unpack['host'] - - new_url_host = url_host.lower() - url_lower_case = url.replace(url_host, new_url_host, 1) - - if url_unpack['scheme'] is None: - to_crawl['scheme'] = 'http' - url= 'http://{}'.format(url_lower_case) - else: - # # FIXME: # TODO: remove me - try: - scheme = url_unpack['scheme'].decode() - except Exception as e: - scheme = url_unpack['scheme'] - if scheme in default_proto_map: - to_crawl['scheme'] = scheme - url = url_lower_case - else: - redis_crawler.sadd('new_proto', '{} {}'.format(scheme, url_lower_case)) - to_crawl['scheme'] = 'http' - url= 'http://{}'.format(url_lower_case.replace(scheme, '', 1)) - - if url_unpack['port'] is None: - to_crawl['port'] = default_proto_map[to_crawl['scheme']] - else: - # # FIXME: # TODO: remove me - try: - port = url_unpack['port'].decode() - except: - port = url_unpack['port'] - # Verify port number #################### make function to verify/correct port number - try: - int(port) - # Invalid port Number - except Exception as e: - port = default_proto_map[to_crawl['scheme']] - to_crawl['port'] = port - - #if url_unpack['query_string'] is None: - # if to_crawl['port'] == 80: - # to_crawl['url']= '{}://{}'.format(to_crawl['scheme'], url_unpack['host'].decode()) - # else: - # to_crawl['url']= '{}://{}:{}'.format(to_crawl['scheme'], url_unpack['host'].decode(), to_crawl['port']) - #else: - # to_crawl['url']= '{}://{}:{}{}'.format(to_crawl['scheme'], url_unpack['host'].decode(), to_crawl['port'], url_unpack['query_string'].decode()) - - to_crawl['url'] = url - if to_crawl['port'] == 80: - to_crawl['domain_url'] = '{}://{}'.format(to_crawl['scheme'], new_url_host) - else: - to_crawl['domain_url'] = '{}://{}:{}'.format(to_crawl['scheme'], new_url_host, 
to_crawl['port']) - - # # FIXME: # TODO: remove me - try: - to_crawl['tld'] = url_unpack['tld'].decode() - except: - to_crawl['tld'] = url_unpack['tld'] - - return to_crawl - -def get_crawler_config(redis_server, mode, service_type, domain, url=None): - crawler_options = {} - if mode=='auto': - config = redis_server.get('crawler_config:{}:{}:{}:{}'.format(mode, service_type, domain, url)) - else: - config = redis_server.get('crawler_config:{}:{}:{}'.format(mode, service_type, domain)) - if config is None: - config = {} - else: - config = json.loads(config) - for option in default_crawler_config: - if option in config: - crawler_options[option] = config[option] - else: - crawler_options[option] = default_crawler_config[option] - if mode == 'auto': - crawler_options['time'] = int(config['time']) - elif mode == 'manual': - redis_server.delete('crawler_config:{}:{}:{}'.format(mode, service_type, domain)) - return crawler_options - -def load_crawler_config(queue_type, service_type, domain, paste, url, date): - crawler_config = {} - crawler_config['splash_url'] = f'http://{splash_url}' - crawler_config['item'] = paste - crawler_config['service_type'] = service_type - crawler_config['domain'] = domain - crawler_config['date'] = date - - if queue_type and queue_type != 'tor': - service_type = queue_type - - # Auto and Manual Crawling - # Auto ################################################# create new entry, next crawling => here or when ended ? - if paste == 'auto': - crawler_config['crawler_options'] = get_crawler_config(redis_crawler, 'auto', service_type, domain, url=url) - crawler_config['requested'] = True - # Manual - elif paste == 'manual': - crawler_config['crawler_options'] = get_crawler_config(r_cache, 'manual', service_type, domain) - crawler_config['requested'] = True - # default crawler - else: - crawler_config['crawler_options'] = get_crawler_config(redis_crawler, 'default', service_type, domain) - crawler_config['requested'] = False - return crawler_config - -def is_domain_up_day(domain, type_service, date_day): - if redis_crawler.sismember('{}_up:{}'.format(type_service, date_day), domain): - return True - else: - return False - -def set_crawled_domain_metadata(type_service, date, domain, father_item): - # first seen - if not redis_crawler.hexists('{}_metadata:{}'.format(type_service, domain), 'first_seen'): - redis_crawler.hset('{}_metadata:{}'.format(type_service, domain), 'first_seen', date['date_day']) - - redis_crawler.hset('{}_metadata:{}'.format(type_service, domain), 'paste_parent', father_item) - # last check - redis_crawler.hset('{}_metadata:{}'.format(type_service, domain), 'last_check', date['date_day']) - -# Put message back on queue -def on_error_send_message_back_in_queue(type_service, domain, message): - if not redis_crawler.sismember('{}_domain_crawler_queue'.format(type_service), domain): - redis_crawler.sadd('{}_domain_crawler_queue'.format(type_service), domain) - redis_crawler.sadd('{}_crawler_priority_queue'.format(type_service), message) - -def crawl_onion(url, domain, port, type_service, message, crawler_config): - crawler_config['url'] = url - crawler_config['port'] = port - print('Launching Crawler: {}'.format(url)) - - r_cache.hset('metadata_crawler:{}'.format(splash_url), 'crawling_domain', domain) - r_cache.hset('metadata_crawler:{}'.format(splash_url), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S")) - - retry = True - nb_retry = 0 - while retry: - try: - r = requests.get(f'http://{splash_url}' , timeout=30.0) - retry = False 
- except Exception: - # TODO: relaunch docker or send error message - nb_retry += 1 - - if nb_retry == 2: - crawlers.restart_splash_docker(splash_url, splash_name) - time.sleep(20) - - if nb_retry == 6: - on_error_send_message_back_in_queue(type_service, domain, message) - publisher.error('{} SPASH DOWN'.format(splash_url)) - print('--------------------------------------') - print(' \033[91m DOCKER SPLASH DOWN\033[0m') - print(' {} DOWN'.format(splash_url)) - r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'SPLASH DOWN') - nb_retry == 0 - - print(' \033[91m DOCKER SPLASH NOT AVAILABLE\033[0m') - print(' Retry({}) in 10 seconds'.format(nb_retry)) - time.sleep(10) - - if r.status_code == 200: - r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Crawling') - # save config in cash - UUID = str(uuid.uuid4()) - r_cache.set('crawler_request:{}'.format(UUID), json.dumps(crawler_config)) - - process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', UUID], - stdout=subprocess.PIPE) - while process.poll() is None: - time.sleep(1) - - if process.returncode == 0: - output = process.stdout.read().decode() - print(output) - # error: splash:Connection to proxy refused - if 'Connection to proxy refused' in output: - on_error_send_message_back_in_queue(type_service, domain, message) - publisher.error('{} SPASH, PROXY DOWN OR BAD CONFIGURATION'.format(splash_url)) - print('------------------------------------------------------------------------') - print(' \033[91m SPLASH: Connection to proxy refused') - print('') - print(' PROXY DOWN OR BAD CONFIGURATION\033[0m'.format(splash_url)) - print('------------------------------------------------------------------------') - r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Error') - exit(-2) - else: - crawlers.update_splash_manager_connection_status(True) - else: - print(process.stdout.read()) - exit(-1) - else: - on_error_send_message_back_in_queue(type_service, domain, message) - print('--------------------------------------') - print(' \033[91m DOCKER SPLASH DOWN\033[0m') - print(' {} DOWN'.format(splash_url)) - r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Crawling') - exit(1) - -# check external links (full_crawl) -def search_potential_source_domain(type_service, domain): - external_domains = set() - for link in redis_crawler.smembers('domain_{}_external_links:{}'.format(type_service, domain)): - # unpack url - url_data = unpack_url(link) - if url_data['domain'] != domain: - if url_data['tld'] == 'onion' or url_data['tld'] == 'i2p': - external_domains.add(url_data['domain']) - # # TODO: add special tag ? 
- if len(external_domains) >= 20: - redis_crawler.sadd('{}_potential_source'.format(type_service), domain) - print('New potential source found: domain') - redis_crawler.delete('domain_{}_external_links:{}'.format(type_service, domain)) - - -if __name__ == '__main__': - - if len(sys.argv) != 2: - print('usage:', 'Crawler.py', 'splash_url') - exit(1) -################################################## - splash_url = sys.argv[1] - - splash_name = crawlers.get_splash_name_by_url(splash_url) - proxy_name = crawlers.get_splash_proxy(splash_name) - crawler_type = crawlers.get_splash_crawler_type(splash_name) - - print(f'SPLASH Name: {splash_name}') - print(f'Proxy Name: {proxy_name}') - print(f'Crawler Type: {crawler_type}') - - #time.sleep(10) - #sys.exit(0) - - #rotation_mode = deque(['onion', 'regular']) - all_crawler_queues = crawlers.get_crawler_queue_types_by_splash_name(splash_name) - rotation_mode = deque(all_crawler_queues) - print(rotation_mode) - - default_proto_map = {'http': 80, 'https': 443} -######################################################## add ftp ??? - - publisher.port = 6380 - publisher.channel = "Script" - publisher.info("Script Crawler started") - config_section = 'Crawler' - - # Setup the I/O queues - p = Process(config_section) - - print('splash url: {}'.format(splash_url)) - - r_cache = redis.StrictRedis( - host=p.config.get("Redis_Cache", "host"), - port=p.config.getint("Redis_Cache", "port"), - db=p.config.getint("Redis_Cache", "db"), - decode_responses=True) - - redis_crawler = redis.StrictRedis( - host=p.config.get("ARDB_Onion", "host"), - port=p.config.getint("ARDB_Onion", "port"), - db=p.config.getint("ARDB_Onion", "db"), - decode_responses=True) - - faup = crawlers.get_faup() - - # get HAR files - default_crawler_har = p.config.getboolean("Crawler", "default_crawler_har") - if default_crawler_har: - default_crawler_har = True - else: - default_crawler_har = False - - # get PNG files - default_crawler_png = p.config.getboolean("Crawler", "default_crawler_png") - if default_crawler_png: - default_crawler_png = True - else: - default_crawler_png = False - - # Default crawler options - default_crawler_config = {'html': True, - 'har': default_crawler_har, - 'png': default_crawler_png, - 'depth_limit': p.config.getint("Crawler", "crawler_depth_limit"), - 'closespider_pagecount': p.config.getint("Crawler", "default_crawler_closespider_pagecount"), - 'cookiejar_uuid': None, - 'user_agent': p.config.get("Crawler", "default_crawler_user_agent")} - - # Track launched crawler - r_cache.sadd('all_splash_crawlers', splash_url) - r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Waiting') - r_cache.hset('metadata_crawler:{}'.format(splash_url), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S")) - - # update hardcoded blacklist - load_blacklist('onion') - load_blacklist('regular') - - while True: - - update_auto_crawler() - - rotation_mode.rotate() - to_crawl = crawlers.get_elem_to_crawl_by_queue_type(rotation_mode) - if to_crawl: - url_data = unpack_url(to_crawl['url']) - # remove domain from queue - redis_crawler.srem('{}_domain_crawler_queue'.format(to_crawl['type_service']), url_data['domain']) - - print() - print() - print('\033[92m------------------START CRAWLER------------------\033[0m') - print('crawler type: {}'.format(to_crawl['type_service'])) - print('\033[92m-------------------------------------------------\033[0m') - print('url: {}'.format(url_data['url'])) - print('domain: {}'.format(url_data['domain'])) - 
print('domain_url: {}'.format(url_data['domain_url'])) - print() - - # Check blacklist - if not redis_crawler.sismember('blacklist_{}'.format(to_crawl['type_service']), url_data['domain']): - date = {'date_day': datetime.datetime.now().strftime("%Y%m%d"), - 'date_month': datetime.datetime.now().strftime("%Y%m"), - 'epoch': int(time.time())} - - # Update crawler status type - r_cache.hset('metadata_crawler:{}'.format(splash_url), 'type', to_crawl['type_service']) - - crawler_config = load_crawler_config(to_crawl['queue_type'], to_crawl['type_service'], url_data['domain'], to_crawl['paste'], to_crawl['url'], date) - # check if default crawler - if not crawler_config['requested']: - # Auto crawl only if service not up this month - if redis_crawler.sismember('month_{}_up:{}'.format(to_crawl['type_service'], date['date_month']), url_data['domain']): - continue - - set_crawled_domain_metadata(to_crawl['type_service'], date, url_data['domain'], to_crawl['paste']) - - - #### CRAWLER #### - # Manual and Auto Crawler - if crawler_config['requested']: - - ######################################################crawler strategy - # CRAWL domain - crawl_onion(url_data['url'], url_data['domain'], url_data['port'], to_crawl['type_service'], to_crawl['original_message'], crawler_config) - - # Default Crawler - else: - # CRAWL domain - crawl_onion(url_data['domain_url'], url_data['domain'], url_data['port'], to_crawl['type_service'], to_crawl['original_message'], crawler_config) - #if url != domain_url and not is_domain_up_day(url_data['domain'], to_crawl['type_service'], date['date_day']): - # crawl_onion(url_data['url'], url_data['domain'], to_crawl['original_message']) - - - # Save last_status day (DOWN) - if not is_domain_up_day(url_data['domain'], to_crawl['type_service'], date['date_day']): - redis_crawler.sadd('{}_down:{}'.format(to_crawl['type_service'], date['date_day']), url_data['domain']) - - # if domain was UP at least one time - if redis_crawler.exists('crawler_history_{}:{}:{}'.format(to_crawl['type_service'], url_data['domain'], url_data['port'])): - # add crawler history (if domain is down) - if not redis_crawler.zrangebyscore('crawler_history_{}:{}:{}'.format(to_crawl['type_service'], url_data['domain'], url_data['port']), date['epoch'], date['epoch']): - # Domain is down - redis_crawler.zadd('crawler_history_{}:{}:{}'.format(to_crawl['type_service'], url_data['domain'], url_data['port']), int(date['epoch']), int(date['epoch'])) - - ############################ - # extract page content - ############################ - - # update list, last crawled domains - redis_crawler.lpush('last_{}'.format(to_crawl['type_service']), '{}:{};{}'.format(url_data['domain'], url_data['port'], date['epoch'])) - redis_crawler.ltrim('last_{}'.format(to_crawl['type_service']), 0, 15) - - #update crawler status - r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Waiting') - r_cache.hdel('metadata_crawler:{}'.format(splash_url), 'crawling_domain') - - # Update crawler status type - r_cache.hdel('metadata_crawler:{}'.format(splash_url), 'type', to_crawl['type_service']) - - # add next auto Crawling in queue: - if to_crawl['paste'] == 'auto': - redis_crawler.zadd('crawler_auto_queue', int(time.time()+crawler_config['crawler_options']['time']) , '{};{}'.format(to_crawl['original_message'], to_crawl['type_service'])) - # update list, last auto crawled domains - redis_crawler.lpush('last_auto_crawled', '{}:{};{}'.format(url_data['domain'], url_data['port'], date['epoch'])) - 
redis_crawler.ltrim('last_auto_crawled', 0, 9) - else: - print(' Blacklisted Domain') - print() - print() - - else: - time.sleep(1) diff --git a/bin/DB_KVROCKS_MIGRATION.py b/bin/DB_KVROCKS_MIGRATION.py index 2010a837..6da1f7f2 100755 --- a/bin/DB_KVROCKS_MIGRATION.py +++ b/bin/DB_KVROCKS_MIGRATION.py @@ -119,11 +119,11 @@ def core_migration(): # Auto Export Migration ail_misp = r_serv_db.get('ail:misp') if ail_misp != 'True': - ail_misp == 'False' + ail_misp = 'False' r_kvrocks.set('ail:misp', ail_misp) ail_thehive = r_serv_db.get('ail:thehive') if ail_thehive != 'True': - ail_thehive == 'False' + ail_thehive = 'False' r_kvrocks.set('ail:thehive', ail_thehive) @@ -494,7 +494,7 @@ def domain_migration(): domain = Domains.Domain(dom) domain.update_daterange(first_seen) domain.update_daterange(last_check) - domain._set_ports(ports) + domain._set_ports(ports) # TODO ############################################################################ if last_origin: domain.set_last_origin(last_origin) for language in languages: @@ -520,13 +520,13 @@ def domain_migration(): epoch = history['epoch'] # DOMAIN DOWN if not history.get('status'): # domain DOWN - domain.add_history(epoch, port) + domain.add_history(epoch) print(f'DOWN {epoch}') # DOMAIN UP else: root_id = history.get('root') if root_id: - domain.add_history(epoch, port, root_item=root_id) + domain.add_history(epoch, root_item=root_id) print(f'UP {root_id}') crawled_items = get_crawled_items(dom, root_id) for item_id in crawled_items: @@ -534,7 +534,7 @@ def domain_migration(): item_father = get_item_father(item_id) if item_father and url: print(f'{url} {item_id}') - domain.add_crawled_item(url, port, item_id, item_father) + domain.add_crawled_item(url, item_id, item_father) #print() diff --git a/bin/IPAddress.py b/bin/IPAddress.py index 4ec11647..81cf2e14 100755 --- a/bin/IPAddress.py +++ b/bin/IPAddress.py @@ -18,17 +18,18 @@ import time import re import sys from pubsublogger import publisher -from packages import Paste +from lib.objects.Items import Item from Helper import Process from ipaddress import IPv4Network, IPv4Address +# TODO REWRITE ME -> IMPROVE + MIGRATE TO MODULE def search_ip(message): - paste = Paste.Paste(message) - content = paste.get_p_content() + item = Item(message) + content = item.get_content() # regex to find IPs reg_ip = re.compile(r'^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)', flags=re.MULTILINE) - # list of the regex results in the Paste, may be null + # list of the regex results in the Item, may be null results = reg_ip.findall(content) matching_ips = [] @@ -40,14 +41,13 @@ def search_ip(message): matching_ips.append(address) if len(matching_ips) > 0: - print('{} contains {} IPs'.format(paste.p_name, len(matching_ips))) - publisher.warning('{} contains {} IPs'.format(paste.p_name, len(matching_ips))) + print(f'{item.get_id()} contains {len(matching_ips)} IPs') + publisher.warning(f'{item.get_id()} contains {item.get_id()} IPs') - #Tag message with IP - msg = 'infoleak:automatic-detection="ip";{}'.format(message) + # Tag message with IP + msg = f'infoleak:automatic-detection="ip";{item.get_id()}' p.populate_set_out(msg, 'Tags') - #Send to duplicate - p.populate_set_out(message, 'Duplicate') + if __name__ == '__main__': # If you wish to use an other port of channel, do not forget to run a subscriber accordingly (see launch_logs.sh) diff --git a/bin/LAUNCH.sh b/bin/LAUNCH.sh index 46f215cd..111399ec 100755 --- a/bin/LAUNCH.sh +++ b/bin/LAUNCH.sh @@ -40,7 +40,6 @@ 
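A minimal sketch of the IPv4 membership test that search_ip() in the IPAddress.py hunk above relies on, assuming a comma-separated list of monitored networks similar to what the module loads from its configuration (the literal network values below are invented for illustration):

import re
from ipaddress import IPv4Address, IPv4Network

# Hypothetical config value; AIL builds its `networks` list from its own config section.
monitored = "10.0.0.0/8,192.168.0.0/16"
networks = [IPv4Network(net) for net in monitored.split(",")]

def is_monitored(ip_str):
    # Mirrors the `if address in network` check used by search_ip()
    address = IPv4Address(ip_str)
    return any(address in network for network in networks)

assert is_monitored("192.168.1.12")
assert not is_monitored("8.8.8.8")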
is_ail_core=`screen -ls | egrep '[0-9]+.Core_AIL' | cut -d. -f1` is_ail_2_ail=`screen -ls | egrep '[0-9]+.AIL_2_AIL' | cut -d. -f1` isscripted=`screen -ls | egrep '[0-9]+.Script_AIL' | cut -d. -f1` isflasked=`screen -ls | egrep '[0-9]+.Flask_AIL' | cut -d. -f1` -iscrawler=`screen -ls | egrep '[0-9]+.Crawler_AIL' | cut -d. -f1` isfeeded=`screen -ls | egrep '[0-9]+.Feeder_Pystemon' | cut -d. -f1` function helptext { @@ -126,6 +125,8 @@ function launching_logs { screen -S "Logging_AIL" -X screen -t "LogScript" bash -c "cd ${AIL_BIN}; ${AIL_VENV}/bin/log_subscriber -p 6380 -c Script -l ../logs/; read x" sleep 0.1 screen -S "Logging_AIL" -X screen -t "LogScript" bash -c "cd ${AIL_BIN}; ${AIL_VENV}/bin/log_subscriber -p 6380 -c Sync -l ../logs/; read x" + sleep 0.1 + screen -S "Logging_AIL" -X screen -t "LogScript" bash -c "cd ${AIL_BIN}; ${AIL_VENV}/bin/log_subscriber -p 6380 -c Crawler -l ../logs/; read x" } function launching_queues { @@ -174,8 +175,6 @@ function launching_scripts { screen -S "Script_AIL" -X screen -t "JSON_importer" bash -c "cd ${AIL_BIN}/import; ${ENV_PY} ./JSON_importer.py; read x" sleep 0.1 - screen -S "Script_AIL" -X screen -t "Crawler_manager" bash -c "cd ${AIL_BIN}/core; ${ENV_PY} ./Crawler_manager.py; read x" - sleep 0.1 screen -S "Script_AIL" -X screen -t "D4_client" bash -c "cd ${AIL_BIN}/core; ${ENV_PY} ./D4_client.py; read x" sleep 0.1 screen -S "Script_AIL" -X screen -t "DbCleaner" bash -c "cd ${AIL_BIN}/core; ${ENV_PY} ./DbCleaner.py; read x" @@ -202,6 +201,9 @@ function launching_scripts { screen -S "Script_AIL" -X screen -t "SubmitPaste" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./submit_paste.py; read x" sleep 0.1 + screen -S "Script_AIL" -X screen -t "Crawler" bash -c "cd ${AIL_BIN}/crawlers; ${ENV_PY} ./Crawler.py; read x" + sleep 0.1 + screen -S "Script_AIL" -X screen -t "Sync_module" bash -c "cd ${AIL_BIN}/core; ${ENV_PY} ./Sync_module.py; read x" sleep 0.1 @@ -225,8 +227,6 @@ function launching_scripts { sleep 0.1 screen -S "Script_AIL" -X screen -t "Mail" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Mail.py; read x" sleep 0.1 - # screen -S "Script_AIL" -X screen -t "SentimentAnalysis" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./SentimentAnalysis.py; read x" - # sleep 0.1 screen -S "Script_AIL" -X screen -t "ModuleStats" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./ModuleStats.py; read x" sleep 0.1 screen -S "Script_AIL" -X screen -t "Onion" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Onion.py; read x" @@ -265,8 +265,12 @@ function launching_scripts { ################################## # DISABLED MODULES # ################################## - #screen -S "Script_AIL" -X screen -t "Phone" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Phone.py; read x" - #sleep 0.1 + # screen -S "Script_AIL" -X screen -t "Phone" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Phone.py; read x" + # sleep 0.1 + # screen -S "Script_AIL" -X screen -t "SentimentAnalysis" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./SentimentAnalysis.py; read x" + # sleep 0.1 + # screen -S "Script_AIL" -X screen -t "Release" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Release.py; read x" + # sleep 0.1 ################################## # # @@ -285,8 +289,6 @@ function launching_scripts { sleep 0.1 screen -S "Script_AIL" -X screen -t "IPAddress" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./IPAddress.py; read x" - #screen -S "Script_AIL" -X screen -t "Release" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Release.py; read x" - #sleep 0.1 } @@ -476,19 +478,19 @@ function launch_feeder { } function killscript { - if [[ $islogged || 
$isqueued || $is_ail_core || $isscripted || $isflasked || $isfeeded || $iscrawler || $is_ail_2_ail ]]; then + if [[ $islogged || $isqueued || $is_ail_core || $isscripted || $isflasked || $isfeeded || $is_ail_2_ail ]]; then echo -e $GREEN"Killing Script"$DEFAULT - kill $islogged $isqueued $is_ail_core $isscripted $isflasked $isfeeded $iscrawler $is_ail_2_ail + kill $islogged $isqueued $is_ail_core $isscripted $isflasked $isfeeded $is_ail_2_ail sleep 0.2 echo -e $ROSE`screen -ls`$DEFAULT - echo -e $GREEN"\t* $islogged $isqueued $is_ail_core $isscripted $isflasked $isfeeded $iscrawler $is_ail_2_ail killed."$DEFAULT + echo -e $GREEN"\t* $islogged $isqueued $is_ail_core $isscripted $isflasked $isfeeded $is_ail_2_ail killed."$DEFAULT else echo -e $RED"\t* No script to kill"$DEFAULT fi } function killall { - if [[ $isredis || $isardb || $iskvrocks || $islogged || $isqueued || $is_ail_2_ail || $isscripted || $isflasked || $isfeeded || $iscrawler || $is_ail_core || $is_ail_2_ail ]]; then + if [[ $isredis || $isardb || $iskvrocks || $islogged || $isqueued || $is_ail_2_ail || $isscripted || $isflasked || $isfeeded || $is_ail_core || $is_ail_2_ail ]]; then if [[ $isredis ]]; then echo -e $GREEN"Gracefully closing redis servers"$DEFAULT shutting_down_redis; @@ -503,10 +505,10 @@ function killall { shutting_down_kvrocks; fi echo -e $GREEN"Killing all"$DEFAULT - kill $isredis $isardb $iskvrocks $islogged $isqueued $is_ail_core $isscripted $isflasked $isfeeded $iscrawler $is_ail_2_ail + kill $isredis $isardb $iskvrocks $islogged $isqueued $is_ail_core $isscripted $isflasked $isfeeded $is_ail_2_ail sleep 0.2 echo -e $ROSE`screen -ls`$DEFAULT - echo -e $GREEN"\t* $isredis $isardb $iskvrocks $islogged $isqueued $isscripted $is_ail_2_ail $isflasked $isfeeded $iscrawler $is_ail_core killed."$DEFAULT + echo -e $GREEN"\t* $isredis $isardb $iskvrocks $islogged $isqueued $isscripted $is_ail_2_ail $isflasked $isfeeded $is_ail_core killed."$DEFAULT else echo -e $RED"\t* No screen to kill"$DEFAULT fi diff --git a/bin/ModulesInformationV2.py b/bin/ModulesInformationV2.py index def7509f..f10bc8a2 100755 --- a/bin/ModulesInformationV2.py +++ b/bin/ModulesInformationV2.py @@ -15,7 +15,7 @@ import json import redis import psutil from subprocess import PIPE, Popen -from packages import Paste +from lib.objects.Items import Item sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/')) import ConfigLoader @@ -51,7 +51,7 @@ QUEUE_STATUS = {} CPU_TABLE = {} CPU_OBJECT_TABLE = {} -# Path of the current paste for a pid +# Path of the current item for a pid COMPLETE_PASTE_PATH_PER_PID = {} ''' @@ -443,10 +443,10 @@ class Show_paste(Frame): self.label_list[i]._text = "" return - paste = Paste.Paste(COMPLETE_PASTE_PATH_PER_PID[current_selected_value]) - old_content = paste.get_p_content()[0:4000] # Limit number of char to be displayed + item = Item(COMPLETE_PASTE_PATH_PER_PID[current_selected_value]) + old_content = item.get_content()[0:4000] # Limit number of char to be displayed - #Replace unprintable char by ? + # Replace unprintable char by ? 
content = "" for i, c in enumerate(old_content): if ord(c) > 127: # Used to avoid printing unprintable char @@ -456,7 +456,7 @@ class Show_paste(Frame): else: content += c - #Print in the correct label, END or more + # Print in the correct label, END or more to_print = "" i = 0 for line in content.split("\n"): @@ -472,7 +472,7 @@ class Show_paste(Frame): self.label_list[i]._text = "- END of PASTE -" i += 1 - while i 30: - publisher.warning(to_print) - else: - publisher.info(to_print) + to_print = f'Release;{item.get_source()};{item.get_date()};{item.get_basename()};{len(releases)} releases;{item.get_id()}' + print(to_print) + if len(releases) > 30: + publisher.warning(to_print) + else: + publisher.info(to_print) except TimeoutException: p.incr_module_timeout_statistic() - print ("{0} processing timeout".format(paste.p_rel_path)) + print(f"{item.get_id()} processing timeout") continue else: signal.alarm(0) diff --git a/bin/SourceCode.py b/bin/SourceCode.py index f34bb34e..8ad1f269 100644 --- a/bin/SourceCode.py +++ b/bin/SourceCode.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -*-coding:UTF-8 -* import time -from packages import Paste +from lib.objects.Items import Item from pubsublogger import publisher from Helper import Process import re @@ -13,19 +13,19 @@ if __name__ == "__main__": p = Process(config_section) publisher.info("Finding Source Code") - critical = 0 # AS TO BE IMPORTANT, MIGHT BE REMOVED + critical = 0 # AS TO BE IMPORTANT, MIGHT BE REMOVED - #RELEVANTS LANGUAGES - shell = "[a-zA-Z0-9]+@[a-zA-Z0-9\-]+\:\~\$" - c = "\#include\ \<[a-z\/]+.h\>" - php = "\<\?php" - python = "import\ [\w]+" - bash = "#!\/[\w]*\/bash" - javascript = "function\(\)" - ruby = "require \ [\w]+" - adr = "0x[a-f0-9]{2}" + # RELEVANT LANGUAGES + shell = r"[a-zA-Z0-9]+@[a-zA-Z0-9\-]+\:\~\$" + c = r"\#include\ \<[a-z\/]+.h\>" + php = r"\<\?php" + python = r"import\ [\w]+" + bash = r"#!\/[\w]*\/bash" + javascript = r"function\(\)" + ruby = r"require \ [\w]+" + adr = r"0x[a-f0-9]{2}" - #asm = "\"((?s).{1}x[0-9a-f]{2}){3,}" ISSUES WITH FINDALL, pattern like \x54\xaf\x23\.. + # asm = r"\"((?s).{1}x[0-9a-f]{2}){3,}" ISSUES WITH FINDALL, pattern like \x54\xaf\x23\.. 
languages = [shell, c, php, bash, python, javascript, bash, ruby, adr] regex = '|'.join(languages) @@ -41,13 +41,13 @@ if __name__ == "__main__": filepath, count = message.split() - paste = Paste.Paste(filepath) - content = paste.get_p_content() + item = Item(filepath) + content = item.get_content() match_set = set(re.findall(regex, content)) if len(match_set) == 0: continue - to_print = 'SourceCode;{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, message) + to_print = f'SourceCode;{item.get_source()};{item.get_date()};{item.get_basename()};{item.get_id()}' if len(match_set) > critical: publisher.warning(to_print) diff --git a/bin/core/Crawler_manager.py b/bin/core/Crawler_manager.py deleted file mode 100755 index bacaaa71..00000000 --- a/bin/core/Crawler_manager.py +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env python3 -# -*-coding:UTF-8 -* - -import os -import sys -import time - -sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib')) -import ConfigLoader -import crawlers - -config_loader = ConfigLoader.ConfigLoader() -r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata") -config_loader = None - -# # TODO: lauch me in core screen -# # TODO: check if already launched in tor screen - -# # TODO: handle mutltiple splash_manager -if __name__ == '__main__': - - is_manager_connected = crawlers.ping_splash_manager() - if not is_manager_connected: - print('Error, Can\'t connect to Splash manager') - session_uuid = None - else: - print('Splash manager connected') - session_uuid = crawlers.get_splash_manager_session_uuid() - is_manager_connected = crawlers.reload_splash_and_proxies_list() - print(is_manager_connected) - if is_manager_connected: - if crawlers.test_ail_crawlers(): - crawlers.relaunch_crawlers() - last_check = int(time.time()) - - while True: - - # # TODO: avoid multiple ping - - # check if manager is connected - if int(time.time()) - last_check > 60: - is_manager_connected = crawlers.is_splash_manager_connected() - current_session_uuid = crawlers.get_splash_manager_session_uuid() - # reload proxy and splash list - if current_session_uuid and current_session_uuid != session_uuid: - is_manager_connected = crawlers.reload_splash_and_proxies_list() - if is_manager_connected: - print('reload proxies and splash list') - if crawlers.test_ail_crawlers(): - crawlers.relaunch_crawlers() - session_uuid = current_session_uuid - if not is_manager_connected: - print('Error, Can\'t connect to Splash manager') - last_check = int(time.time()) - - # # TODO: lauch crawlers if was never connected - # refresh splash and proxy list - elif False: - crawlers.reload_splash_and_proxies_list() - print('list of splash and proxies refreshed') - else: - time.sleep(5) - - # kill/launch new crawler / crawler manager check if already launched - - - # # TODO: handle mutltiple splash_manager - # catch reload request diff --git a/bin/crawlers/Crawler.py b/bin/crawlers/Crawler.py new file mode 100755 index 00000000..d6aabd72 --- /dev/null +++ b/bin/crawlers/Crawler.py @@ -0,0 +1,331 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +import os +import sys +import time + +sys.path.append(os.environ['AIL_BIN']) +################################## +# Import Project packages +################################## +from modules.abstract_module import AbstractModule +from lib import crawlers +from lib.ConfigLoader import ConfigLoader +from lib.objects.Domains import Domain +from lib.objects import Screenshots + +class Crawler(AbstractModule): + + def __init__(self): + super(Crawler, self, 
).__init__(logger_channel='Crawler') + + # Waiting time in seconds between to message processed + self.pending_seconds = 1 + + config_loader = ConfigLoader() + self.r_log_submit = config_loader.get_redis_conn('Redis_Log_submit') + + self.default_har = config_loader.get_config_boolean('Crawler', 'default_har') + self.default_screenshot = config_loader.get_config_boolean('Crawler', 'default_screenshot') + self.default_depth_limit = config_loader.get_config_int('Crawler', 'default_depth_limit') + + # TODO: LIMIT MAX NUMBERS OF CRAWLED PAGES + + # update hardcoded blacklist + crawlers.load_blacklist() + # update captures cache + crawlers.reload_crawler_captures() + + # LACUS + self.lacus = crawlers.get_lacus() + + # Capture + self.har = None + self.screenshot = None + self.root_item = None + self.har_dir = None + self.items_dir = None + self.domain = None + + # Send module state to logs + self.redis_logger.info('Crawler initialized') + + def print_crawler_start_info(self, url, domain, domain_url): + print() + print() + print('\033[92m------------------START CRAWLER------------------\033[0m') + print(f'crawler type: {domain}') + print('\033[92m-------------------------------------------------\033[0m') + print(f'url: {url}') + print(f'domain: {domain}') + print(f'domain_url: {domain_url}') + print() + + def get_message(self): + # Check if a new Capture can be Launched + if crawlers.get_nb_crawler_captures() < crawlers.get_crawler_max_captures(): + task_row = crawlers.get_crawler_task_from_queue() + if task_row: + print(task_row) + task_uuid, priority = task_row + self.enqueue_capture(task_uuid, priority) + + # Check if a Capture is Done + capture = crawlers.get_crawler_capture() + if capture: + print(capture) + capture_uuid = capture[0][0] + capture_status = self.lacus.get_capture_status(capture_uuid) + if capture_status != crawlers.CaptureStatus.DONE: # TODO ADD GLOBAL TIMEOUT-> Save start time + crawlers.update_crawler_capture(capture_uuid) + print(capture_uuid, capture_status, int(time.time())) + else: + self.compute(capture_uuid) + crawlers.remove_crawler_capture(capture_uuid) + print('capture', capture_uuid, 'completed') + + + time.sleep(self.pending_seconds) + + def enqueue_capture(self, task_uuid, priority): + task = crawlers.get_crawler_task(task_uuid) + print(task) + # task = { + # 'uuid': task_uuid, + # 'url': 'https://foo.be', + # 'domain': 'foo.be', + # 'depth': 1, + # 'har': True, + # 'screenshot': True, + # 'user_agent': crawlers.get_default_user_agent(), + # 'cookiejar': [], + # 'header': '', + # 'proxy': 'force_tor', + # 'parent': 'manual', + # } + url = task['url'] + force = priority != 0 + + # TODO unpack cookiejar + + # TODO HEADER + + capture_uuid = self.lacus.enqueue(url=url, + depth=task['depth'], + user_agent=task['user_agent'], + proxy=task['proxy'], + cookies=[], + force=force, + general_timeout_in_sec=90) + + crawlers.add_crawler_capture(task_uuid, capture_uuid) + print(task_uuid, capture_uuid, 'launched') + return capture_uuid + + # CRAWL DOMAIN + # TODO: CATCH ERRORS + def compute(self, capture_uuid): + + print('saving capture', capture_uuid) + + task_uuid = crawlers.get_crawler_capture_task_uuid(capture_uuid) + task = crawlers.get_crawler_task(task_uuid) + + print(task['domain']) + + self.domain = Domain(task['domain']) + + # TODO CHANGE EPOCH + epoch = int(time.time()) + parent_id = task['parent'] + print(task) + + entries = self.lacus.get_capture(capture_uuid) + print(entries['status']) + self.har = task['har'] + self.screenshot = task['screenshot'] + str_date = 
crawlers.get_current_date(separator=True) + self.har_dir = crawlers.get_date_har_dir(str_date) + self.items_dir = crawlers.get_date_crawled_items_source(str_date) + self.root_item = None + + # Save Capture + self.save_capture_response(parent_id, entries) + + self.domain.update_daterange(str_date.replace('/', '')) + # Origin + History + if self.root_item: + # domain.add_ports(port) + self.domain.set_last_origin(parent_id) + self.domain.add_history(epoch, root_item=self.root_item) + elif self.domain.was_up(): + self.domain.add_history(epoch, root_item=epoch) + + crawlers.update_last_crawled_domain(self.domain.get_domain_type(), self.domain.id, epoch) + crawlers.clear_crawler_task(task_uuid, self.domain.get_domain_type()) + + def save_capture_response(self, parent_id, entries): + print(entries.keys()) + if 'error' in entries: + # TODO IMPROVE ERROR MESSAGE + self.redis_logger.warning(str(entries['error'])) + print(entries['error']) + if entries.get('html'): + print('retrieved content') + # print(entries.get('html')) + + # TODO LOGS IF != domain + if 'last_redirected_url' in entries and entries['last_redirected_url']: + last_url = entries['last_redirected_url'] + unpacked_last_url = crawlers.unpack_url(last_url) + current_domain = unpacked_last_url['domain'] + # REDIRECTION TODO CHECK IF WEB + if current_domain != self.domain.id and not self.root_item: + self.redis_logger.warning(f'External redirection {self.domain.id} -> {current_domain}') + print(f'External redirection {self.domain.id} -> {current_domain}') + if not self.root_item: + self.domain = Domain(current_domain) + # TODO LAST URL + # FIXME + else: + last_url = f'http://{self.domain.id}' + + if 'html' in entries and entries['html']: + item_id = crawlers.create_item_id(self.items_dir, self.domain.id) + print(item_id) + gzip64encoded = crawlers.get_gzipped_b64_item(item_id, entries['html']) + # send item to Global + relay_message = f'{item_id} {gzip64encoded}' + self.send_message_to_queue(relay_message, 'Mixer') + # increase nb of paste by feeder name + self.r_log_submit.hincrby('mixer_cache:list_feeder', 'crawler', 1) + + # Tag + msg = f'infoleak:submission="crawler";{item_id}' + self.send_message_to_queue(msg, 'Tags') + + crawlers.create_item_metadata(item_id, self.domain.id, last_url, parent_id) + if self.root_item is None: + self.root_item = item_id + parent_id = item_id + + # SCREENSHOT + if self.screenshot: + if 'png' in entries and entries['png']: + screenshot = Screenshots.create_screenshot(entries['png'], b64=False) + if screenshot: + # Create Correlations + screenshot.add_correlation('item', '', item_id) + screenshot.add_correlation('domain', '', self.domain.id) + # HAR + if self.har: + if 'har' in entries and entries['har']: + crawlers.save_har(self.har_dir, item_id, entries['har']) + # Next Children + entries_children = entries.get('children') + if entries_children: + for children in entries_children: + self.save_capture_response(parent_id, children) + + +if __name__ == '__main__': + module = Crawler() + module.debug = True + # module.compute(('ooooo', 0)) + module.run() + + +################################## +################################## +################################## +################################## +################################## + + +# from Helper import Process +# from pubsublogger import publisher + + +# ======== FUNCTIONS ======== + + +# def update_auto_crawler(): +# current_epoch = int(time.time()) +# list_to_crawl = redis_crawler.zrangebyscore('crawler_auto_queue', '-inf', current_epoch) +# for 
elem_to_crawl in list_to_crawl: +# mess, type = elem_to_crawl.rsplit(';', 1) +# redis_crawler.sadd('{}_crawler_priority_queue'.format(type), mess) +# redis_crawler.zrem('crawler_auto_queue', elem_to_crawl) + +# Extract info form url (url, domain, domain url, ...) +# def unpack_url(url): +# to_crawl = {} +# faup.decode(url) +# url_unpack = faup.get() +# to_crawl['domain'] = to_crawl['domain'].lower() +# new_url_host = url_host.lower() +# url_lower_case = url.replace(url_host, new_url_host, 1) +# +# if url_unpack['scheme'] is None: +# to_crawl['scheme'] = 'http' +# url= 'http://{}'.format(url_lower_case) +# else: +# try: +# scheme = url_unpack['scheme'].decode() +# except Exception as e: +# scheme = url_unpack['scheme'] +# if scheme in default_proto_map: +# to_crawl['scheme'] = scheme +# url = url_lower_case +# else: +# redis_crawler.sadd('new_proto', '{} {}'.format(scheme, url_lower_case)) +# to_crawl['scheme'] = 'http' +# url= 'http://{}'.format(url_lower_case.replace(scheme, '', 1)) +# +# if url_unpack['port'] is None: +# to_crawl['port'] = default_proto_map[to_crawl['scheme']] +# else: +# try: +# port = url_unpack['port'].decode() +# except: +# port = url_unpack['port'] +# # Verify port number #################### make function to verify/correct port number +# try: +# int(port) +# # Invalid port Number +# except Exception as e: +# port = default_proto_map[to_crawl['scheme']] +# to_crawl['port'] = port +# +# #if url_unpack['query_string'] is None: +# # if to_crawl['port'] == 80: +# # to_crawl['url']= '{}://{}'.format(to_crawl['scheme'], url_unpack['host'].decode()) +# # else: +# # to_crawl['url']= '{}://{}:{}'.format(to_crawl['scheme'], url_unpack['host'].decode(), to_crawl['port']) +# #else: +# # to_crawl['url']= '{}://{}:{}{}'.format(to_crawl['scheme'], url_unpack['host'].decode(), to_crawl['port'], url_unpack['query_string'].decode()) +# +# to_crawl['url'] = url +# if to_crawl['port'] == 80: +# to_crawl['domain_url'] = '{}://{}'.format(to_crawl['scheme'], new_url_host) +# else: +# to_crawl['domain_url'] = '{}://{}:{}'.format(to_crawl['scheme'], new_url_host, to_crawl['port']) +# +# try: +# to_crawl['tld'] = url_unpack['tld'].decode() +# except: +# to_crawl['tld'] = url_unpack['tld'] +# +# return to_crawl + +# ##################################################### add ftp ??? 
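For readability, a hedged, linearised sketch of the capture lifecycle that the new Crawler module drives one step at a time in get_message()/compute(). Every call mirrors one visible in this diff (crawlers.get_lacus, lacus.enqueue, get_capture_status, get_capture, crawlers.CaptureStatus); the URL and proxy values are illustrative only:

import os
import sys
import time

sys.path.append(os.environ['AIL_BIN'])
from lib import crawlers

lacus = crawlers.get_lacus()

# Enqueue a capture with the same keyword arguments used in enqueue_capture()
capture_uuid = lacus.enqueue(url='http://example.onion',
                             depth=1,
                             user_agent=crawlers.get_default_user_agent(),
                             proxy='force_tor',
                             cookies=[],
                             force=True,
                             general_timeout_in_sec=90)

# The module checks this status once per get_message() pass instead of blocking;
# a plain loop is used here only to keep the sketch linear
while lacus.get_capture_status(capture_uuid) != crawlers.CaptureStatus.DONE:
    time.sleep(1)

# Same result dict consumed by save_capture_response(): 'status', 'html', 'png',
# 'har', 'last_redirected_url', 'children', ...
entries = lacus.get_capture(capture_uuid)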
+ # update_auto_crawler() + + # # add next auto Crawling in queue: + # if to_crawl['paste'] == 'auto': + # redis_crawler.zadd('crawler_auto_queue', int(time.time()+crawler_config['crawler_options']['time']) , '{};{}'.format(to_crawl['original_message'], to_crawl['type_service'])) + # # update list, last auto crawled domains + # redis_crawler.lpush('last_auto_crawled', '{}:{};{}'.format(url_data['domain'], url_data['port'], date['epoch'])) + # redis_crawler.ltrim('last_auto_crawled', 0, 9) + # diff --git a/bin/torcrawler/blacklist_onion.txt b/bin/crawlers/blacklist.txt similarity index 100% rename from bin/torcrawler/blacklist_onion.txt rename to bin/crawlers/blacklist.txt diff --git a/bin/helper/CVE_check.py b/bin/helper/CVE_check.py index 63f611de..e6200506 100755 --- a/bin/helper/CVE_check.py +++ b/bin/helper/CVE_check.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -*-coding:UTF-8 -* -from packages import Paste +from lib.objects.Items import Item from Helper import Process import os @@ -12,11 +12,13 @@ import configparser from collections import defaultdict +# TODO FIX ME OR REMOVE ME + def get_dict_cve(list_paste_cve, only_one_same_cve_by_paste=False): dict_keyword = {} for paste_cve in list_paste_cve: - paste_content = Paste.Paste(paste_cve).get_p_content() + paste_content = Item(paste_cve).get_content() cve_list = reg_cve.findall(paste_content) if only_one_same_cve_by_paste: diff --git a/bin/lib/ConfigLoader.py b/bin/lib/ConfigLoader.py index a6d5fef4..04ae8ee8 100755 --- a/bin/lib/ConfigLoader.py +++ b/bin/lib/ConfigLoader.py @@ -35,17 +35,17 @@ class ConfigLoader(object): else: self.cfg.read(default_config_file) - def get_redis_conn(self, redis_name, decode_responses=True): ## TODO: verify redis name - return redis.StrictRedis( host=self.cfg.get(redis_name, "host"), + def get_redis_conn(self, redis_name, decode_responses=True): + return redis.StrictRedis(host=self.cfg.get(redis_name, "host"), port=self.cfg.getint(redis_name, "port"), db=self.cfg.getint(redis_name, "db"), - decode_responses=decode_responses ) + decode_responses=decode_responses) - def get_db_conn(self, db_name, decode_responses=True): ## TODO: verify redis name - return redis.StrictRedis( host=self.cfg.get(db_name, "host"), + def get_db_conn(self, db_name, decode_responses=True): + return redis.StrictRedis(host=self.cfg.get(db_name, "host"), port=self.cfg.getint(db_name, "port"), password=self.cfg.get(db_name, "password"), - decode_responses=decode_responses ) + decode_responses=decode_responses) def get_files_directory(self, key_name): directory_path = self.cfg.get('Directories', key_name) @@ -79,3 +79,33 @@ class ConfigLoader(object): return all_keys_values else: return [] + +# # # # Directory Config # # # # + +config_loader = ConfigLoader() +ITEMS_FOLDER = config_loader.get_config_str("Directories", "pastes") +if ITEMS_FOLDER == 'PASTES': + ITEMS_FOLDER = os.path.join(os.environ['AIL_HOME'], ITEMS_FOLDER) +ITEMS_FOLDER = ITEMS_FOLDER + '/' +ITEMS_FOLDER = os.path.join(os.path.realpath(ITEMS_FOLDER), '') + +HARS_DIR = config_loader.get_files_directory('har') +if HARS_DIR == 'CRAWLED_SCREENSHOT': + HARS_DIR = os.path.join(os.environ['AIL_HOME'], HARS_DIR) + +SCREENSHOTS_FOLDER = config_loader.get_files_directory('screenshot') +if SCREENSHOTS_FOLDER == 'CRAWLED_SCREENSHOT/screenshot': + SCREENSHOTS_FOLDER = os.path.join(os.environ['AIL_HOME'], SCREENSHOTS_FOLDER) +config_loader = None + +def get_hars_dir(): + return HARS_DIR + +def get_items_dir(): + return ITEMS_FOLDER + +def get_screenshots_dir(): + return 
SCREENSHOTS_FOLDER + + + diff --git a/bin/lib/Statistics.py b/bin/lib/Statistics.py index a6d903a6..7d9067e5 100755 --- a/bin/lib/Statistics.py +++ b/bin/lib/Statistics.py @@ -58,7 +58,6 @@ def get_item_stats_nb_by_date(): def _set_item_stats_nb_by_date(date, source): return r_statistics.zrange(f'providers_set_{date}', ) - # # TODO: load ZSET IN CACHE => FAST UPDATE def update_item_stats_size_nb(item_id, source, size, date): # Add/Update in Redis @@ -106,7 +105,7 @@ def update_module_stats(module_name, num, keyword, date): # check if this keyword is eligible for progression keyword_total_sum = 0 - curr_value = r_statistics.hget(date, module+'-'+keyword) + curr_value = r_statistics.hget(date, f'{module_name}-{keyword}') keyword_total_sum += int(curr_value) if curr_value is not None else 0 if r_statistics.zcard(f'top_{module_name}_set_{date}') < PIE_CHART_MAX_CARDINALITY: diff --git a/bin/lib/ail_core.py b/bin/lib/ail_core.py index 5556bcdf..dc22f21b 100755 --- a/bin/lib/ail_core.py +++ b/bin/lib/ail_core.py @@ -22,7 +22,7 @@ def get_ail_uuid(): # # TODO: check change paste => item def get_all_objects(): - return ['domain', 'item', 'pgp', 'cryptocurrency', 'decoded', 'screenshot', 'username'] + return ['cve', 'domain', 'item', 'pgp', 'cryptocurrency', 'decoded', 'screenshot', 'username'] def get_object_all_subtypes(obj_type): if obj_type == 'cryptocurrency': diff --git a/bin/lib/correlations_engine.py b/bin/lib/correlations_engine.py index 1314cc90..432f6c25 100755 --- a/bin/lib/correlations_engine.py +++ b/bin/lib/correlations_engine.py @@ -43,12 +43,13 @@ config_loader = None CORRELATION_TYPES_BY_OBJ = { "cryptocurrency" : ["domain", "item"], - "decoded" : ["domain", "item"], - "domain": ["cryptocurrency", "decoded", "item", "pgp", "username", "screenshot"], - "item": ["cryptocurrency", "decoded", "domain", "pgp", "username", "screenshot"], + "cve": ["domain", "item"], + "decoded": ["domain", "item"], + "domain": ["cve", "cryptocurrency", "decoded", "item", "pgp", "username", "screenshot"], + "item": ["cve", "cryptocurrency", "decoded", "domain", "pgp", "username", "screenshot"], "pgp" : ["domain", "item"], - "username" : ["domain", "item"], - "screenshot" : ["domain", "item"], + "username": ["domain", "item"], + "screenshot": ["domain", "item"], } def get_obj_correl_types(obj_type): diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py index 471e978d..e106d3e4 100755 --- a/bin/lib/crawlers.py +++ b/bin/lib/crawlers.py @@ -8,19 +8,23 @@ API Helper """ import base64 import gzip +import hashlib import json import os +import pickle import re -import redis import sys import time import uuid import subprocess +from enum import IntEnum, unique from datetime import datetime, timedelta from urllib.parse import urlparse, urljoin -from bs4 import BeautifulSoup +#from bs4 import BeautifulSoup + +from pylacus import PyLacus from pyfaup.faup import Faup @@ -28,20 +32,25 @@ from pyfaup.faup import Faup import requests requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning) -sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/')) -import git_status +sys.path.append(os.environ['AIL_BIN']) +################################## +# Import Project packages +################################## +from packages import git_status +from lib.ConfigLoader import ConfigLoader +from lib.objects.Domains import Domain +from core import screen -sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/')) -import ConfigLoader +config_loader = ConfigLoader() +r_db = 
config_loader.get_db_conn("Kvrocks_DB") +r_crawler = config_loader.get_db_conn("Kvrocks_Crawler") +r_cache = config_loader.get_redis_conn("Redis_Cache") -sys.path.append(os.path.join(os.environ['AIL_BIN'], 'core/')) -import screen - -config_loader = ConfigLoader.ConfigLoader() r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata") r_serv_onion = config_loader.get_redis_conn("ARDB_Onion") -r_cache = config_loader.get_redis_conn("Redis_Cache") -PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], config_loader.get_config_str("Directories", "pastes")) + +ITEMS_FOLDER = config_loader.get_config_str("Directories", "pastes") +HAR_DIR = config_loader.get_files_directory('har') activate_crawler = config_loader.get_config_str("Crawler", "activate_crawler") config_loader = None @@ -53,40 +62,77 @@ faup = Faup() # # # # # # # # # # +def gen_uuid(): + return str(uuid.uuid4()) + def generate_uuid(): return str(uuid.uuid4()).replace('-', '') # # TODO: remove me ? -def get_current_date(): - return datetime.now().strftime("%Y%m%d") +def get_current_date(separator=False): + if separator: + return datetime.now().strftime("%Y/%m/%d") + else: + return datetime.now().strftime("%Y%m%d") + +def get_date_crawled_items_source(date): + return os.path.join('crawled', date) + +def get_date_har_dir(date): + return os.path.join(HAR_DIR, date) def is_valid_onion_domain(domain): if not domain.endswith('.onion'): return False domain = domain.replace('.onion', '', 1) - if len(domain) == 16: # v2 address + if len(domain) == 16: # v2 address r_onion = r'[a-z0-9]{16}' if re.match(r_onion, domain): return True - elif len(domain) == 56: # v3 address + elif len(domain) == 56: # v3 address r_onion = r'[a-z0-9]{56}' if re.fullmatch(r_onion, domain): return True return False -# TEMP FIX def get_faup(): return faup +def unpack_url(url): + f = get_faup() + f.decode(url) + url_decoded = f.get() + port = url_decoded['port'] + if not port: + if url_decoded['scheme'] == 'http': + port = 80 + elif url_decoded['scheme'] == 'https': + port = 443 + else: + port = 80 + url_decoded['port'] = port + # decode URL + try: + url = url_decoded['url'].decode() + except AttributeError: + url = url_decoded['url'] + # if not url_decoded['scheme']: + # url = f'http://{url}' + + # Fix case + url_decoded['domain'] = url_decoded['domain'].lower() + url_decoded['url'] = url.replace(url_decoded['host'], url_decoded['host'].lower(), 1) + return url_decoded + # # # # # # # # # # -# FAVICON # +# FAVICON # TODO REWRITE ME # # # # # # # # # # def get_favicon_from_html(html, domain, url): favicon_urls = extract_favicon_from_html(html, url) - # add root favicom + # add root favicon if not favicon_urls: favicon_urls.add(f'{urlparse(url).scheme}://{domain}/favicon.ico') print(favicon_urls) @@ -140,13 +186,13 @@ def extract_favicon_from_html(html, url): # # TODO: handle prefix cookies # # TODO: fill empty fields -def create_cookie_crawler(cookie_dict, domain, crawler_type='regular'): +def create_cookie_crawler(cookie_dict, domain, crawler_type='web'): # check cookie domain filed if not 'domain' in cookie_dict: - cookie_dict['domain'] = '.{}'.format(domain) + cookie_dict['domain'] = f'.{domain}' # tor browser: disable secure cookie - if crawler_type=='onion': + if crawler_type == 'onion': cookie_dict['secure'] = False # force cookie domain @@ -158,7 +204,7 @@ def create_cookie_crawler(cookie_dict, domain, crawler_type='regular'): cookie_dict['expires'] = (datetime.now() + timedelta(days=10)).strftime('%Y-%m-%dT%H:%M:%S') + 'Z' return cookie_dict -def 
load_crawler_cookies(cookiejar_uuid, domain, crawler_type='regular'): +def load_crawler_cookies(cookiejar_uuid, domain, crawler_type='web'): cookies = get_cookiejar_cookies_list(cookiejar_uuid) all_cookies = [] for cookie_dict in cookies: @@ -171,21 +217,21 @@ def get_all_cookiejar(): return r_serv_onion.smembers('cookiejar:all') def get_global_cookiejar(): - res = r_serv_onion.smembers('cookiejar:global') - if not res: - res = [] - return res + cookiejars = r_serv_onion.smembers('cookiejar:global') + if not cookiejars: + cookiejars = [] + return cookiejars def get_user_cookiejar(user_id): - res = r_serv_onion.smembers('cookiejar:user:{}'.format(user_id)) - if not res: - res = [] - return res + cookiejars = r_serv_onion.smembers('cookiejar:user:{}'.format(user_id)) + if not cookiejars: + cookiejars = [] + return cookiejars def exist_cookiejar(cookiejar_uuid): return r_serv_onion.exists('cookiejar_metadata:{}'.format(cookiejar_uuid)) -def _set_cookiejar_date(date): +def _set_cookiejar_date(cookiejar_uuid, date): r_serv_onion.hset(f'cookiejar_metadata:{cookiejar_uuid}', 'date', date) # # TODO: sanitize cookie_uuid @@ -194,7 +240,7 @@ def create_cookiejar(user_id, level=1, description=None, cookiejar_uuid=None): cookiejar_uuid = str(uuid.uuid4()) r_serv_onion.sadd('cookiejar:all', cookiejar_uuid) - if level==0: + if level == 0: r_serv_onion.sadd(f'cookiejar:user:{user_id}', cookiejar_uuid) else: r_serv_onion.sadd('cookiejar:global', cookiejar_uuid) @@ -202,7 +248,7 @@ def create_cookiejar(user_id, level=1, description=None, cookiejar_uuid=None): r_serv_onion.hset(f'cookiejar_metadata:{cookiejar_uuid}', 'user_id', user_id) r_serv_onion.hset(f'cookiejar_metadata:{cookiejar_uuid}', 'level', level) r_serv_onion.hset(f'cookiejar_metadata:{cookiejar_uuid}', 'description', description) - _set_cookiejar_date(datetime.now().strftime("%Y%m%d")) + _set_cookiejar_date(cookiejar_uuid, datetime.now().strftime("%Y%m%d")) # if json_cookies: # json_cookies = json.loads(json_cookies) # # TODO: catch Exception @@ -223,10 +269,10 @@ def delete_cookie_jar(cookiejar_uuid): r_serv_onion.delete('cookiejar_metadata:{}'.format(cookiejar_uuid)) def get_cookiejar_cookies_uuid(cookiejar_uuid): - res = r_serv_onion.smembers('cookiejar:{}:cookies:uuid'.format(cookiejar_uuid)) - if not res: - res = [] - return res + cookies = r_serv_onion.smembers('cookiejar:{}:cookies:uuid'.format(cookiejar_uuid)) + if not cookies: + cookies = [] + return cookies def get_cookiejar_cookies_list(cookiejar_uuid, add_cookie_uuid=False): l_cookiejar = [] @@ -251,10 +297,10 @@ def get_cookiejar_date(cookiejar_uuid): return r_serv_onion.hget('cookiejar_metadata:{}'.format(cookiejar_uuid), 'date') def get_cookiejar_level(cookiejar_uuid): - res = r_serv_onion.hget('cookiejar_metadata:{}'.format(cookiejar_uuid), 'level') - if not res: - res = 1 - return int(res) + level = r_serv_onion.hget('cookiejar_metadata:{}'.format(cookiejar_uuid), 'level') + if not level: + level = 1 + return int(level) def get_cookiejar_metadata(cookiejar_uuid, level=False): dict_cookiejar = {} @@ -296,22 +342,22 @@ def get_cookie_all_keys_name(): return ['name', 'value', 'domain', 'path', 'httpOnly', 'secure'] def exists_cookie(cookie_uuid): - if int(r_serv_onion.scard('cookies:map:cookiejar:{}'.format(cookie_uuid))) > 0: + if int(r_serv_onion.scard(f'cookies:map:cookiejar:{cookie_uuid}')) > 0: return True return False def get_cookie_value(cookie_uuid, name): - return r_serv_onion.hget('cookiejar:cookie:{}'.format(cookie_uuid), name) + return 
r_serv_onion.hget(f'cookiejar:cookie:{cookie_uuid}', name) def set_cookie_value(cookie_uuid, name, value): - r_serv_onion.hset('cookiejar:cookie:{}'.format(cookie_uuid), name, value) + r_serv_onion.hset(f'cookiejar:cookie:{cookie_uuid}', name, value) def delete_cookie_value(cookie_uuid, name): - r_serv_onion.hdel('cookiejar:cookie:{}'.format(cookie_uuid), name) + r_serv_onion.hdel(f'cookiejar:cookie:{cookie_uuid}', name) def get_cookie_dict(cookie_uuid): cookie_dict = {} - for key_name in r_serv_onion.hkeys('cookiejar:cookie:{}'.format(cookie_uuid)): + for key_name in r_serv_onion.hkeys(f'cookiejar:cookie:{cookie_uuid}'): cookie_dict[key_name] = get_cookie_value(cookie_uuid, key_name) return cookie_dict @@ -346,14 +392,14 @@ def delete_all_cookies_from_cookiejar(cookiejar_uuid): delete_cookie_from_cookiejar(cookiejar_uuid, cookie_uuid) def delete_cookie_from_cookiejar(cookiejar_uuid, cookie_uuid): - r_serv_onion.srem('cookiejar:{}:cookies:uuid'.format(cookiejar_uuid), cookie_uuid) - r_serv_onion.srem('cookies:map:cookiejar:{}'.format(cookie_uuid), cookiejar_uuid) + r_serv_onion.srem(f'cookiejar:{cookiejar_uuid}:cookies:uuid', cookie_uuid) + r_serv_onion.srem(f'cookies:map:cookiejar:{cookie_uuid}', cookiejar_uuid) if not exists_cookie(cookie_uuid): - r_serv_onion.delete('cookiejar:cookie:{}'.format(cookie_uuid)) + r_serv_onion.delete(f'cookiejar:cookie:{cookie_uuid}') def edit_cookie(cookiejar_uuid, cookie_uuid, cookie_dict): # delete old keys - for key_name in r_serv_onion.hkeys('cookiejar:cookie:{}'.format(cookie_uuid)): + for key_name in r_serv_onion.hkeys(f'cookiejar:cookie:{cookie_uuid}'): if key_name not in cookie_dict: delete_cookie_value(cookie_uuid, key_name) # add new keys @@ -392,99 +438,489 @@ def misp_cookie_import(misp_object, cookiejar_uuid): #### COOKIEJAR API #### def api_import_cookies_from_json(json_cookies_str, cookiejar_uuid): # # TODO: add catch json_cookies = json.loads(json_cookies_str) - res = import_cookies_from_json(json_cookies, cookiejar_uuid) - if res: - return (res, 400) + resp = import_cookies_from_json(json_cookies, cookiejar_uuid) + if resp: + return resp, 400 #### #### #### COOKIES API #### def api_verify_basic_cookiejar(cookiejar_uuid, user_id): if not exist_cookiejar(cookiejar_uuid): - return ({'error': 'unknow cookiejar uuid', 'cookiejar_uuid': cookiejar_uuid}, 404) + return {'error': 'unknown cookiejar uuid', 'cookiejar_uuid': cookiejar_uuid}, 404 level = get_cookiejar_level(cookiejar_uuid) if level == 0: # # TODO: check if user is admin cookie_owner = get_cookiejar_owner(cookiejar_uuid) if cookie_owner != user_id: - return ({'error': 'The access to this cookiejar is restricted'}, 403) + return {'error': 'The access to this cookiejar is restricted'}, 403 def api_get_cookiejar_cookies(cookiejar_uuid, user_id): - res = api_verify_basic_cookiejar(cookiejar_uuid, user_id) - if res: - return res - res = get_cookiejar_cookies_list(cookiejar_uuid) - return (res, 200) + resp = api_verify_basic_cookiejar(cookiejar_uuid, user_id) + if resp: + return resp + resp = get_cookiejar_cookies_list(cookiejar_uuid) + return resp, 200 def api_edit_cookiejar_description(user_id, cookiejar_uuid, description): - res = api_verify_basic_cookiejar(cookiejar_uuid, user_id) - if res: - return res + resp = api_verify_basic_cookiejar(cookiejar_uuid, user_id) + if resp: + return resp edit_cookiejar_description(cookiejar_uuid, description) - return ({'cookiejar_uuid': cookiejar_uuid}, 200) + return {'cookiejar_uuid': cookiejar_uuid}, 200 def 
api_get_cookiejar_cookies_with_uuid(cookiejar_uuid, user_id): - res = api_verify_basic_cookiejar(cookiejar_uuid, user_id) - if res: - return res - res = get_cookiejar_cookies_list(cookiejar_uuid, add_cookie_uuid=True) - return (res, 200) + resp = api_verify_basic_cookiejar(cookiejar_uuid, user_id) + if resp: + return resp + resp = get_cookiejar_cookies_list(cookiejar_uuid, add_cookie_uuid=True) + return resp, 200 def api_get_cookies_list_select(user_id): l_cookiejar = [] for cookies_uuid in get_global_cookiejar(): - l_cookiejar.append('{} : {}'.format(get_cookiejar_description(cookies_uuid), cookies_uuid)) + l_cookiejar.append(f'{get_cookiejar_description(cookies_uuid)} : {cookies_uuid}') for cookies_uuid in get_user_cookiejar(user_id): - l_cookiejar.append('{} : {}'.format(get_cookiejar_description(cookies_uuid), cookies_uuid)) + l_cookiejar.append(f'{get_cookiejar_description(cookies_uuid)} : {cookies_uuid}') return sorted(l_cookiejar) def api_delete_cookie_from_cookiejar(user_id, cookiejar_uuid, cookie_uuid): - res = api_verify_basic_cookiejar(cookiejar_uuid, user_id) - if res: - return res + resp = api_verify_basic_cookiejar(cookiejar_uuid, user_id) + if resp: + return resp delete_cookie_from_cookiejar(cookiejar_uuid, cookie_uuid) - return ({'cookiejar_uuid': cookiejar_uuid, 'cookie_uuid': cookie_uuid}, 200) + return {'cookiejar_uuid': cookiejar_uuid, 'cookie_uuid': cookie_uuid}, 200 def api_delete_cookie_jar(user_id, cookiejar_uuid): - res = api_verify_basic_cookiejar(cookiejar_uuid, user_id) - if res: - return res + resp = api_verify_basic_cookiejar(cookiejar_uuid, user_id) + if resp: + return resp delete_cookie_jar(cookiejar_uuid) - return ({'cookiejar_uuid': cookiejar_uuid}, 200) + return {'cookiejar_uuid': cookiejar_uuid}, 200 def api_edit_cookie(user_id, cookiejar_uuid, cookie_uuid, cookie_dict): - res = api_verify_basic_cookiejar(cookiejar_uuid, user_id) - if res: - return res + resp = api_verify_basic_cookiejar(cookiejar_uuid, user_id) + if resp: + return resp if 'name' not in cookie_dict or 'value' not in cookie_dict or cookie_dict['name'] == '': - ({'error': 'cookie name or value not provided'}, 400) + return {'error': 'cookie name or value not provided'}, 400 edit_cookie(cookiejar_uuid, cookie_uuid, cookie_dict) - return (get_cookie_dict(cookie_uuid), 200) + return get_cookie_dict(cookie_uuid), 200 def api_create_cookie(user_id, cookiejar_uuid, cookie_dict): - res = api_verify_basic_cookiejar(cookiejar_uuid, user_id) - if res: - return res + resp = api_verify_basic_cookiejar(cookiejar_uuid, user_id) + if resp: + return resp if 'name' not in cookie_dict or 'value' not in cookie_dict or cookie_dict['name'] == '': - ({'error': 'cookie name or value not provided'}, 400) - res = add_cookie_to_cookiejar(cookiejar_uuid, cookie_dict) - return (res, 200) + return {'error': 'cookie name or value not provided'}, 400 + resp = add_cookie_to_cookiejar(cookiejar_uuid, cookie_dict) + return resp, 200 #### #### # # # # # # # # # # -# CRAWLER # +# CRAWLER # ################################################################################### # # # # # # # # # # + +@unique +class CaptureStatus(IntEnum): + """The status of the capture""" + UNKNOWN = -1 + QUEUED = 0 + DONE = 1 + ONGOING = 2 + +def get_default_user_agent(): + return 'Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0' + +def get_blacklist(): + return r_crawler.smembers('blacklist:domain') + +def is_blacklisted_domain(domain): + return r_crawler.sismember('blacklist:domain', domain) + +def blacklist_domain(domain): + 
return r_crawler.sadd('blacklist:domain', domain) + +def load_blacklist(): + try: + with open(os.path.join(os.environ['AIL_BIN'], 'torcrawler/blacklist.txt'), 'r') as f: + r_crawler.delete('blacklist:domain') + lines = f.read().splitlines() + for line in lines: + blacklist_domain(line) + # TODO LOG + except Exception as e: + print(e) + +def get_last_crawled_domains(domain_type): + return r_crawler.lrange(f'last_{domain_type}', 0, -1) + +def update_last_crawled_domain(domain_type, domain, epoch): + # update list, last crawled domains + r_crawler.lpush(f'last_{domain_type}', f'{domain}:{epoch}') + r_crawler.ltrim(f'last_{domain_type}', 0, 15) + +def create_item_metadata(item_id, domain, url, item_father): + r_serv_metadata.hset(f'paste_metadata:{item_id}', 'father', item_father) + r_serv_metadata.hset(f'paste_metadata:{item_id}', 'domain', domain) + r_serv_metadata.hset(f'paste_metadata:{item_id}', 'real_link', url) + # add this item_id to his father + r_serv_metadata.sadd(f'paste_children:{item_father}', item_id) + +def get_gzipped_b64_item(item_id, content): + try: + gzipencoded = gzip.compress(content.encode()) + gzip64encoded = base64.standard_b64encode(gzipencoded).decode() + return gzip64encoded + except: + print(f'file error: {item_id}') + return False + +def get_crawlers_stats_by_day(date, domain_type): + return { + 'date': date[0:4] + '-' + date[4:6] + '-' + date[6:8], + 'up': r_crawler.scard(f'{domain_type}_up:{date}'), + 'down': r_crawler.scard(f'{domain_type}_down:{date}'), + } + + +def get_crawlers_stats(domain_type=None): + stats = {} + date = datetime.now().strftime("%Y%m%d") + if domain_type: + domain_types = [domain_type] + else: + domain_types = get_crawler_all_types() + for domain_type in domain_types: + queue = r_crawler.scard(f'crawler:queue:type:{domain_type}') + up = r_crawler.scard(f'{domain_type}_up:{date}') + down = r_crawler.scard(f'{domain_type}_down:{date}') + crawled = up + down + stats[domain_type] = {'queue': queue, 'up': up, 'down': down, 'crawled': crawled} + return stats + +#### CRAWLER STATE #### + +# TODO SET IN UI OR USE DEFAULT +def get_crawler_max_captures(): + return 10 + +def get_nb_crawler_captures(): + return r_cache.zcard('crawler:captures') + +def get_crawler_captures(): + return r_crawler.zrange('crawler:captures', 0, -1) + +def reload_crawler_captures(): + r_cache.delete('crawler:captures') + for capture in get_crawler_captures(): + r_cache.zadd('crawler:captures', {capture[0]: capture[1]}) + +def get_crawler_capture(): + return r_cache.zpopmin('crawler:captures') + +def update_crawler_capture(capture_uuid): + last_check = int(time.time()) + r_cache.zadd('crawler:captures', {capture_uuid: last_check}) + +def get_crawler_capture_task_uuid(capture_uuid): + return r_crawler.hget('crawler:captures:tasks', capture_uuid) + +def add_crawler_capture(task_uuid, capture_uuid): + launch_time = int(time.time()) + r_crawler.hset(f'crawler:task:{task_uuid}', 'capture', capture_uuid) + r_crawler.hset('crawler:captures:tasks', capture_uuid, task_uuid) + r_crawler.zadd('crawler:captures', {capture_uuid: launch_time}) + r_cache.zadd('crawler:captures', {capture_uuid: launch_time}) + +def remove_crawler_capture(capture_uuid): + r_crawler.zrem('crawler:captures', capture_uuid) + r_crawler.hdel('crawler:captures:tasks', capture_uuid) + +def get_crawler_capture_status(): + status = [] + for capture_uuid in get_crawler_captures(): + task_uuid = get_crawler_capture_task_uuid(capture_uuid) + domain = get_crawler_task_domain(task_uuid) + dom = Domain(domain) + meta = { 
+ 'uuid': task_uuid, + 'domain': dom.get_id(), + 'type': dom.get_domain_type(), + 'start_time': get_crawler_task_start_time(task_uuid), + 'status': 'test', + } + status.append(meta) + return status + +##-- CRAWLER STATE --## + +#### CRAWLER TASK #### + +def get_crawler_task_url(task_uuid): + return r_crawler.hget(f'crawler:task:{task_uuid}', 'url') + +def get_crawler_task_domain(task_uuid): + return r_crawler.hget(f'crawler:task:{task_uuid}', 'domain') + +def get_crawler_task_depth(task_uuid): + depth = r_crawler.hget(f'crawler:task:{task_uuid}', 'depth') + if not depth: + depth = 1 + return int(depth) + +def get_crawler_task_har(task_uuid): + return r_crawler.hget(f'crawler:task:{task_uuid}', 'har') == '1' + +def get_crawler_task_screenshot(task_uuid): + return r_crawler.hget(f'crawler:task:{task_uuid}', 'screenshot') == '1' + +def get_crawler_task_user_agent(task_uuid): + user_agent = r_crawler.hget(f'crawler:task:{task_uuid}', 'user_agent') + if not user_agent: + user_agent = get_default_user_agent() + return user_agent + +def get_crawler_task_cookiejar(task_uuid): + return r_crawler.hget(f'crawler:task:{task_uuid}', 'cookiejar') + +def get_crawler_task_header(task_uuid): + return r_crawler.hget(f'crawler:task:{task_uuid}', 'header') + +def get_crawler_task_proxy(task_uuid): + return r_crawler.hget(f'crawler:task:{task_uuid}', 'proxy') + +def get_crawler_task_parent(task_uuid): + return r_crawler.hget(f'crawler:task:{task_uuid}', 'parent') + +def get_crawler_task_hash(task_uuid): + return r_crawler.hget(f'crawler:task:{task_uuid}', 'hash') + +def get_crawler_task_start_time(task_uuid): + return r_crawler.hget(f'crawler:task:{task_uuid}', 'start_time') + +def get_crawler_task_status(task_uuid): + return r_crawler.hget(f'crawler:task:{task_uuid}', 'status') + +def get_crawler_task_capture(task_uuid): + return r_crawler.hset(f'crawler:task:{task_uuid}', 'capture') + +def get_crawler_task(task_uuid): + meta = { + 'uuid': task_uuid, + 'url': get_crawler_task_url(task_uuid), + 'domain': get_crawler_task_domain(task_uuid), + 'depth': get_crawler_task_depth(task_uuid), + 'har': get_crawler_task_har(task_uuid), + 'screenshot': get_crawler_task_screenshot(task_uuid), + 'user_agent': get_crawler_task_user_agent(task_uuid), + 'cookiejar': get_crawler_task_cookiejar(task_uuid), + 'header': get_crawler_task_header(task_uuid), + 'proxy': get_crawler_task_proxy(task_uuid), + 'parent': get_crawler_task_parent(task_uuid), + } + return meta + +def get_task_status(task_uuid): + domain = get_crawler_task_domain(task_uuid) + dom = Domain(domain) + meta = { + 'uuid': task_uuid, + 'domain': dom.get_id(), + 'domain_type': dom.get_domain_type(), + 'start_time': get_crawler_task_start_time(task_uuid), + 'status': 'test', + } + return meta + +# domain -> uuid +def get_task_hash(url, domain, depth, har, screenshot, priority, proxy, cookiejar, user_agent, header): + to_enqueue = {'domain': domain, 'depth': depth, 'har': har, 'screenshot': screenshot, + 'priority': priority, 'proxy': proxy, 'cookiejar': cookiejar, 'user_agent': user_agent, + 'header': header} + if priority != 0: + to_enqueue['url'] = url + return hashlib.sha512(pickle.dumps(to_enqueue)).hexdigest() + +# TODO STATUS UPDATE +# PRIORITY: discovery = 0/10, feeder = 10, manual = 50, auto = 40, test = 100 +def add_crawler_task(url, depth=1, har=True, screenshot=True, header=None, cookiejar=None, proxy=None, user_agent=None, parent='manual', priority=0): + url_decoded = unpack_url(url) + url = url_decoded['url'] + domain = url_decoded['domain'] + dom = 
Domain(domain) + + # Discovery crawler + if priority == 0: + if is_blacklisted_domain(dom.get_id()): + return None + if not dom.exists(): + priority = 10 + # Domain Crawled today or UP this month + if dom.is_down_today() or dom.is_up_this_month(): + return None + + har = int(har) + screenshot = int(screenshot) + + # TODO SELECT PROXY -> URL TODO SELECT PROXY + if proxy == 'web': + proxy = None + else: + proxy = 'force_tor' + if not user_agent: + user_agent = get_default_user_agent() + + # TODO COOKIEJAR -> UUID + if cookiejar: + pass + + # Check if already in queue + hash_query = get_task_hash(url, domain, depth, har, screenshot, priority, proxy, cookiejar, user_agent, header) + if r_crawler.hexists(f'crawler:queue:hash', hash_query): + return r_crawler.hget(f'crawler:queue:hash', hash_query) + + # TODO ADD TASK STATUS ----- + task_uuid = gen_uuid() # TODO Save hash ??? (just to be safe and remove it) + r_crawler.hset(f'crawler:task:{task_uuid}', 'domain', domain) + r_crawler.hset(f'crawler:task:{task_uuid}', 'url', url) + r_crawler.hset(f'crawler:task:{task_uuid}', 'depth', int(depth)) + r_crawler.hset(f'crawler:task:{task_uuid}', 'har', har) + r_crawler.hset(f'crawler:task:{task_uuid}', 'screenshot', har) + r_crawler.hset(f'crawler:task:{task_uuid}', 'user_agent', user_agent) + r_crawler.hset(f'crawler:task:{task_uuid}', 'proxy', proxy) + if cookiejar: + r_crawler.hset(f'crawler:task:{task_uuid}', 'cookiejar', cookiejar) # TODO + if header: + r_crawler.hset(f'crawler:task:{task_uuid}', 'header', header) + r_crawler.hset(f'crawler:task:{task_uuid}', 'hash', hash_query) + + r_crawler.hset(f'crawler:task:{task_uuid}', 'parent', parent) + + r_crawler.hset('crawler:queue:hash', hash_query, task_uuid) + r_crawler.zadd('crawler:queue', {task_uuid: priority}) + # UI + r_crawler.sadd(f'crawler:queue:type:{dom.get_domain_type()}', task_uuid) + return task_uuid + +def get_crawler_task_from_queue(): + task_uuid = r_crawler.zpopmax('crawler:queue') + if not task_uuid or not task_uuid[0]: + return None + task_uuid, priority = task_uuid[0] + r_crawler.sadd('crawler:queue:queued', task_uuid) + r_crawler.hset(f'crawler:task:{task_uuid}', 'start_time', datetime.now().strftime("%Y/%m/%d - %H:%M.%S")) + return task_uuid, priority + +def clear_crawler_task(task_uuid, domain_type): + hash_query = get_crawler_task_hash(task_uuid) + r_crawler.hdel('crawler:queue:hash', hash_query) + r_crawler.srem(f'crawler:queue:type:{domain_type}', task_uuid) + r_crawler.srem('crawler:queue:queued', task_uuid) + +def get_crawlers_tasks_status(): + tasks_status = [] + tasks = r_crawler.smembers('crawler:queue:queued') + for task_uuid in tasks: + tasks_status.append(get_task_status(task_uuid)) + return tasks_status + +##-- CRAWLER TASK --## + +#### CRAWLER TASK API #### + +# # TODO: ADD user agent +# # TODO: sanitize URL +def api_add_crawler_task(data, user_id=None): + url = data.get('url', None) + if not url or url=='\n': + return ({'status': 'error', 'reason': 'No url supplied'}, 400) + + screenshot = data.get('screenshot', False) + if screenshot: + screenshot = True + else: + screenshot = False + har = data.get('har', False) + if har: + har = True + else: + har = False + depth_limit = data.get('depth_limit', 1) + if depth_limit: + try: + depth_limit = int(depth_limit) + if depth_limit < 0: + depth_limit = 0 + except ValueError: + return ({'error':'invalid depth limit'}, 400) + else: + depth_limit = 0 + + # cookiejar_uuid = data.get('cookiejar_uuid', None) + # if cookiejar_uuid: + # if not exist_cookiejar(cookiejar_uuid): + # 
return ({'error': 'unknow cookiejar uuid', 'cookiejar_uuid': cookiejar_uuid}, 404) + # level = get_cookiejar_level(cookiejar_uuid) + # if level == 0: # # TODO: check if user is admin ###################################################### + # cookie_owner = get_cookiejar_owner(cookiejar_uuid) + # if cookie_owner != user_id: + # return ({'error': 'The access to this cookiejar is restricted'}, 403) + + # if auto_crawler: + # try: + # crawler_delta = int(crawler_delta) + # if crawler_delta < 0: + # return ({'error':'invalid delta between two pass of the crawler'}, 400) + # except ValueError: + # return ({'error':'invalid delta between two pass of the crawler'}, 400) + + proxy = data.get('proxy', None) + if proxy == 'onion' or proxy == 'tor': + proxy = 'force_tor' + else: + # TODO sanitize PROXY + proxy = None + + # TODO ############################################################################################################# + auto_crawler = auto_crawler + crawler_delta = crawler_delta + parent = 'manual' + + return add_crawler_task(url, depth=depth_limit, har=har, screenshot=screenshot, cookiejar=cookiejar_uuid, + proxy=proxy, user_agent=user_agent, parent='manual', priority=90), 200 + + + + + +#### #### + + +################################################################################### +################################################################################### +################################################################################### +################################################################################### + + + + + + + + + #### CRAWLER GLOBAL #### -## TODO: # FIXME: config db, dynamic load +# TODO: # FIXME: config db, dynamic load def is_crawler_activated(): return activate_crawler == 'True' def get_crawler_all_types(): - return ['onion', 'regular'] + return ['onion', 'web'] def sanitize_crawler_types(l_crawler_types): all_crawler_types = get_crawler_all_types() @@ -534,8 +970,8 @@ def get_stats_last_crawled_domains(crawler_types, date): statDomains = {} for crawler_type in crawler_types: stat_type = {} - stat_type['domains_up'] = r_serv_onion.scard('{}_up:{}'.format(crawler_type, date)) - stat_type['domains_down'] = r_serv_onion.scard('{}_down:{}'.format(crawler_type, date)) + stat_type['domains_up'] = r_crawler.scard('{}_up:{}'.format(crawler_type, date)) + stat_type['domains_down'] = r_crawler.scard('{}_down:{}'.format(crawler_type, date)) stat_type['total'] = stat_type['domains_up'] + stat_type['domains_down'] stat_type['domains_queue'] = get_nb_elem_to_crawl_by_type(crawler_type) statDomains[crawler_type] = stat_type @@ -545,7 +981,7 @@ def get_stats_last_crawled_domains(crawler_types, date): def get_splash_crawler_latest_stats(): now = datetime.now() date = now.strftime("%Y%m%d") - return get_stats_last_crawled_domains(['onion', 'regular'], date) + return get_stats_last_crawled_domains(['onion', 'web'], date) def get_nb_crawlers_to_launch_by_splash_name(splash_name): res = r_serv_onion.hget('all_crawlers_to_launch', splash_name) @@ -604,9 +1040,9 @@ def api_set_nb_crawlers_to_launch(dict_splash_name): try: nb_to_launch = int(dict_splash_name.get(splash_name, 0)) if nb_to_launch < 0: - return ({'error':'The number of crawlers to launch is negative'}, 400) + return {'error':'The number of crawlers to launch is negative'}, 400 except: - return ({'error':'invalid number of crawlers to launch'}, 400) + return {'error':'invalid number of crawlers to launch'}, 400 if nb_to_launch > 0: dict_crawlers_to_launch[splash_name] = nb_to_launch @@ -614,10 +1050,7 
@@ def api_set_nb_crawlers_to_launch(dict_splash_name): set_nb_crawlers_to_launch(dict_crawlers_to_launch) return (dict_crawlers_to_launch, 200) else: - return ({'error':'invalid input'}, 400) - -def get_domains_blacklist(domain_type): - return r_serv_onion.smembers(f'blacklist_{domain_type}') + return {'error':'invalid input'}, 400 def add_domain_blacklist(domain_type, domain): r_serv_onion.sadd(f'blacklist_{domain_type}', domain) @@ -629,7 +1062,7 @@ def add_domain_blacklist(domain_type, domain): def get_auto_crawler_all_domain(l_crawler_types=[]): l_crawler_types = sanitize_crawler_types(l_crawler_types) if len(l_crawler_types) == 1: - return r_serv_onion.smembers(f'auto_crawler_url:{crawler_type[0]}') + return r_serv_onion.smembers(f'auto_crawler_url:{l_crawler_types[0]}') else: l_keys_name = [] for crawler_type in l_crawler_types: @@ -637,7 +1070,7 @@ def get_auto_crawler_all_domain(l_crawler_types=[]): return r_serv_onion.sunion(l_keys_name[0], *l_keys_name[1:]) def add_auto_crawler_in_queue(domain, domain_type, port, epoch, delta, message): - r_serv_onion.zadd('crawler_auto_queue', int(time.time() + delta) , f'{message};{domain_type}') + r_serv_onion.zadd('crawler_auto_queue', {f'{message};{domain_type}': int(time.time() + delta)}) # update list, last auto crawled domains r_serv_onion.lpush('last_auto_crawled', f'{domain}:{port};{epoch}') r_serv_onion.ltrim('last_auto_crawled', 0, 9) @@ -656,201 +1089,15 @@ def update_auto_crawler_queue(): ##-- AUTOMATIC CRAWLER --## #### CRAWLER TASK #### -def create_crawler_task(url, screenshot=True, har=True, depth_limit=1, max_pages=100, auto_crawler=False, crawler_delta=3600, crawler_type=None, cookiejar_uuid=None, user_agent=None): - - crawler_config = {} - crawler_config['depth_limit'] = depth_limit - crawler_config['closespider_pagecount'] = max_pages - - if screenshot: - crawler_config['png'] = True - else: - crawler_config['png'] = False - if har: - crawler_config['har'] = True - else: - crawler_config['har'] = False - - if user_agent: - crawler_config['user_agent'] = user_agent - if cookiejar_uuid: - crawler_config['cookiejar_uuid'] = cookiejar_uuid - - if auto_crawler: - crawler_mode = 'auto' - crawler_config['time'] = crawler_delta - else: - crawler_mode = 'manual' - - # get crawler_mode - faup.decode(url) - unpack_url = faup.get() - ## TODO: # FIXME: remove me - try: - domain = unpack_url['domain'].decode() - except: - domain = unpack_url['domain'] - - ## TODO: # FIXME: remove me - try: - tld = unpack_url['tld'].decode() - except: - tld = unpack_url['tld'] - - if crawler_type=='None': - crawler_type = None - - if crawler_type: - if crawler_type=='tor': - crawler_type = 'onion' - else: - if tld == 'onion': - crawler_type = 'onion' - else: - crawler_type = 'regular' - - save_crawler_config(crawler_mode, crawler_type, crawler_config, domain, url=url) - send_url_to_crawl_in_queue(crawler_mode, crawler_type, url) - -def save_crawler_config(crawler_mode, crawler_type, crawler_config, domain, url=None): - if crawler_mode == 'manual': - r_cache.set('crawler_config:{}:{}:{}'.format(crawler_mode, crawler_type, domain), json.dumps(crawler_config)) - elif crawler_mode == 'auto': - r_serv_onion.set('crawler_config:{}:{}:{}:{}'.format(crawler_mode, crawler_type, domain, url), json.dumps(crawler_config)) - -def send_url_to_crawl_in_queue(crawler_mode, crawler_type, url): - print('{}_crawler_priority_queue'.format(crawler_type), '{};{}'.format(url, crawler_mode)) - r_serv_onion.sadd('{}_crawler_priority_queue'.format(crawler_type), '{};{}'.format(url, 
crawler_mode)) - # add auto crawled url for user UI - if crawler_mode == 'auto': - r_serv_onion.sadd('auto_crawler_url:{}'.format(crawler_type), url) - -#### #### -#### CRAWLER TASK API #### - -def api_add_crawler_task(json_dict): - user_id = None ############################################### - user_agent = data.get('user_agent', None) - url = json_dict.get('url', '') - if not is_valid_uuid_v4(investigation_uuid): - return {"status": "error", "reason": f"Invalid Investigation uuid: {investigation_uuid}"}, 400 - - screenshot = json_dict.get('screenshot', True) #### - screenshot = screenshot == True - har = json_dict.get('screenshot', True) #### - har = har == True - - depth_limit = data.get('depth_limit', 1) - try: - depth_limit = int(depth_limit) - if depth_limit < 0: - depth_limit = 0 - except ValueError: - return ({'error':'invalid depth limit'}, 400) - - max_pages = data.get('max_pages', 100) - if max_pages: - try: - max_pages = int(max_pages) - if max_pages < 1: - max_pages = 1 - except ValueError: - return ({'error':'invalid max_pages limit'}, 400) - - auto_crawler = data.get('auto_crawler', False) - auto_crawler = auto_crawler == True - crawler_delta = data.get('crawler_delta', 3600) - if auto_crawler: - try: - crawler_delta = int(crawler_delta) - if crawler_delta < 0: - return ({'error':'invalid delta between two pass of the crawler'}, 400) - except ValueError: - return ({'error':'invalid delta between two pass of the crawler'}, 400) - - - crawler_type = data.get('crawler_type', None) - - cookiejar_uuid = data.get('cookiejar_uuid', None) - if cookiejar_uuid: - if not exist_cookiejar(cookiejar_uuid): - return ({'error': 'unknow cookiejar uuid', 'cookiejar_uuid': cookiejar_uuid}, 404) - level = get_cookiejar_level(cookiejar_uuid) - if level == 0: # # TODO: check if user is admin ###################################################### - cookie_owner = get_cookiejar_owner(cookiejar_uuid) - if cookie_owner != user_id: - return ({'error': 'The access to this cookiejar is restricted'}, 403) +##-- CRAWLER TASK --## - - create_crawler_task(url, screenshot=screenshot, har=har, depth_limit=depth_limit, - max_pages=max_pages, crawler_type=crawler_type, - auto_crawler=auto_crawler, crawler_delta=crawler_delta, - cookiejar_uuid=cookiejar_uuid, user_agent=user_agent) - - -# # TODO: # FIXME: REPLACE ME -def api_create_crawler_task(user_id, url, screenshot=True, har=True, depth_limit=1, max_pages=100, auto_crawler=False, crawler_delta=3600, crawler_type=None, cookiejar_uuid=None, user_agent=None): - # validate url - if url is None or url=='' or url=='\n': - return ({'error':'invalid depth limit'}, 400) - - if depth_limit: - try: - depth_limit = int(depth_limit) - if depth_limit < 0: - depth_limit = 0 - except ValueError: - return ({'error':'invalid depth limit'}, 400) - if max_pages: - try: - max_pages = int(max_pages) - if max_pages < 1: - max_pages = 1 - except ValueError: - return ({'error':'invalid max_pages limit'}, 400) - - if auto_crawler: - try: - crawler_delta = int(crawler_delta) - if crawler_delta < 0: - return ({'error':'invalid delta bettween two pass of the crawler'}, 400) - except ValueError: - return ({'error':'invalid delta bettween two pass of the crawler'}, 400) - - if cookiejar_uuid: - if not exist_cookiejar(cookiejar_uuid): - return ({'error': 'unknow cookiejar uuid', 'cookiejar_uuid': cookiejar_uuid}, 404) - level = get_cookiejar_level(cookiejar_uuid) - if level == 0: # # TODO: check if user is admin - cookie_owner = get_cookiejar_owner(cookiejar_uuid) - if cookie_owner != 
user_id: - return ({'error': 'The access to this cookiejar is restricted'}, 403) - - # # TODO: verify splash name/crawler type - - create_crawler_task(url, screenshot=screenshot, har=har, depth_limit=depth_limit, max_pages=max_pages, - crawler_type=crawler_type, - auto_crawler=auto_crawler, crawler_delta=crawler_delta, cookiejar_uuid=cookiejar_uuid, user_agent=user_agent) - return None - #### #### -#### SPLASH API #### -def is_splash_reachable(splash_url, timeout=1.0): - try: - r = requests.get(splash_url , timeout=timeout) - except Exception: - return False - if r.status_code == 200: - return True - else: - return False -#### #### def is_redirection(domain, last_url): url = urlparse(last_url) @@ -859,46 +1106,6 @@ def is_redirection(domain, last_url): last_domain = '{}.{}'.format(last_domain[-2], last_domain[-1]) return domain != last_domain -# domain up -def create_domain_metadata(domain_type, domain, current_port, date, date_month): - # Add to global set - r_serv_onion.sadd('{}_up:{}'.format(domain_type, date), domain) - r_serv_onion.sadd('full_{}_up'.format(domain_type), domain) - r_serv_onion.sadd('month_{}_up:{}'.format(domain_type, date_month), domain) - - # create onion metadata - if not r_serv_onion.exists('{}_metadata:{}'.format(domain_type, domain)): - r_serv_onion.hset('{}_metadata:{}'.format(domain_type, domain), 'first_seen', date) - r_serv_onion.hset('{}_metadata:{}'.format(domain_type, domain), 'last_check', date) - - # Update domain port number - all_domain_ports = r_serv_onion.hget('{}_metadata:{}'.format(domain_type, domain), 'ports') - if all_domain_ports: - all_domain_ports = all_domain_ports.split(';') - else: - all_domain_ports = [] - if current_port not in all_domain_ports: - all_domain_ports.append(current_port) - r_serv_onion.hset('{}_metadata:{}'.format(domain_type, domain), 'ports', ';'.join(all_domain_ports)) - -def add_last_crawled_domain(domain_type, domain, port, epoch): - # update list, last crawled domains - redis_crawler.lpush(f'last_{domain_type}', f'{domain}:{port};{epoch}') - redis_crawler.ltrim(f'last_{domain_type}', 0, 15) - -# add root_item to history -# if down -> root_item = epoch_date -def add_domain_root_item(root_item, domain_type, domain, epoch_date, port): - # Create/Update crawler history - r_serv_onion.zadd(f'crawler_history_{domain_type}:{domain}:{port}', epoch_date, root_item) - -def create_item_metadata(item_id, domain, url, port, item_father): - r_serv_metadata.hset('paste_metadata:{}'.format(item_id), 'father', item_father) - r_serv_metadata.hset('paste_metadata:{}'.format(item_id), 'domain', '{}:{}'.format(domain, port)) - r_serv_metadata.hset('paste_metadata:{}'.format(item_id), 'real_link', url) - # add this item_id to his father - r_serv_metadata.sadd('paste_children:{}'.format(item_father), item_id) - def create_item_id(item_dir, domain): # remove / domain = domain.replace('/', '_') @@ -908,15 +1115,6 @@ def create_item_id(item_dir, domain): UUID = domain+str(uuid.uuid4()) return os.path.join(item_dir, UUID) -def save_crawled_item(item_id, item_content): - try: - gzipencoded = gzip.compress(item_content.encode()) - gzip64encoded = base64.standard_b64encode(gzipencoded).decode() - return gzip64encoded - except: - print("file error: {}".format(item_id)) - return False - def save_har(har_dir, item_id, har_content): if not os.path.exists(har_dir): os.makedirs(har_dir) @@ -932,7 +1130,7 @@ def api_add_crawled_item(dict_crawled): # create item_id item_id = save_crawled_item(item_id, response.data['html']) - create_item_metadata(item_id, 
domain, 'last_url', port, 'father') + create_item_metadata(item_id, domain, 'last_url', 'father') #### CRAWLER QUEUES #### @@ -962,7 +1160,6 @@ def get_stats_elem_to_crawl_by_queue_type(queue_type): return dict_stats def get_all_queues_stats(): - print(get_all_crawlers_queues_types()) dict_stats = {} for queue_type in get_crawler_all_types(): dict_stats[queue_type] = get_stats_elem_to_crawl_by_queue_type(queue_type) @@ -985,31 +1182,6 @@ def is_item_in_queue(queue_type, url, item_id, queue_name=None): return True return False -def add_item_to_discovery_queue(queue_type, domain, subdomain, url, item_id): - date_month = datetime.now().strftime("%Y%m") - date = datetime.now().strftime("%Y%m%d") - - # check blacklist - if r_serv_onion.sismember(f'blacklist_{queue_type}', domain): - return - - # too many subdomain # # FIXME: move to crawler module ? - if len(subdomain.split('.')) > 3: - subdomain = f'{subdomain[-3]}.{subdomain[-2]}.{queue_type}' - - if not r_serv_onion.sismember(f'month_{queue_type}_up:{date_month}', subdomain) and not r_serv_onion.sismember(f'{queue_type}_down:{date}' , subdomain): - if not r_serv_onion.sismember(f'{queue_type}_domain_crawler_queue', subdomain): - r_serv_onion.sadd(f'{queue_type}_domain_crawler_queue', subdomain) - msg = f'{url};{item_id}' - # First time we see this domain => Add to discovery queue (priority=2) - if not r_serv_onion.hexists(f'{queue_type}_metadata:{subdomain}', 'first_seen'): - r_serv_onion.sadd(f'{queue_type}_crawler_discovery_queue', msg) - print(f'sent to priority queue: {subdomain}') - # Add to default queue (priority=3) - else: - r_serv_onion.sadd(f'{queue_type}_crawler_queue', msg) - print(f'sent to queue: {subdomain}') - def queue_test_clean_up(queue_type, domain, item_id): date_month = datetime.now().strftime("%Y%m") r_serv_onion.srem(f'month_{queue_type}_up:{date_month}', domain) @@ -1052,9 +1224,9 @@ def get_crawler_queue_types_by_splash_name(splash_name): #if not is_splash_used_in_discovery(splash_name) if crawler_type == 'tor': all_domain_type.append('onion') - all_domain_type.append('regular') + all_domain_type.append('web') else: - all_domain_type.append('regular') + all_domain_type.append('web') return all_domain_type def get_crawler_type_by_url(url): @@ -1069,7 +1241,7 @@ def get_crawler_type_by_url(url): if tld == 'onion': crawler_type = 'onion' else: - crawler_type = 'regular' + crawler_type = 'web' return crawler_type @@ -1088,7 +1260,7 @@ def get_elem_to_crawl_by_queue_type(l_queue_type): splitted = message.rsplit(';', 1) if len(splitted) == 2: url, item_id = splitted - item_id = item_id.replace(PASTES_FOLDER+'/', '') + item_id = item_id.replace(ITEMS_FOLDER+'/', '') else: # # TODO: to check/refractor item_id = None @@ -1104,258 +1276,10 @@ def get_elem_to_crawl_by_queue_type(l_queue_type): # SPLASH MANAGER # # # # # # # # # # # # # # # -def get_splash_manager_url(reload=False): # TODO: add in db config - return r_serv_onion.get('crawler:splash:manager:url') - -def get_splash_api_key(reload=False): # TODO: add in db config - return r_serv_onion.get('crawler:splash:manager:key') - -def get_hidden_splash_api_key(): # TODO: add in db config - key = get_splash_api_key() - if key: - if len(key)==41: - return f'{key[:4]}*********************************{key[-4:]}' - -def is_valid_api_key(api_key, search=re.compile(r'[^a-zA-Z0-9_-]').search): - if len(api_key) != 41: - return False - return not bool(search(api_key)) - -def save_splash_manager_url_api(url, api_key): - r_serv_onion.set('crawler:splash:manager:url', url) - 
r_serv_onion.set('crawler:splash:manager:key', api_key) - -def get_splash_url_from_manager_url(splash_manager_url, splash_port): - url = urlparse(splash_manager_url) - host = url.netloc.split(':', 1)[0] - return '{}:{}'.format(host, splash_port) - -# def is_splash_used_in_discovery(splash_name): -# res = r_serv_onion.hget('splash:metadata:{}'.format(splash_name), 'discovery_queue') -# if res == 'True': -# return True -# else: -# return False - -def restart_splash_docker(splash_url, splash_name): - splash_port = splash_url.split(':')[-1] - return _restart_splash_docker(splash_port, splash_name) - -def is_splash_manager_connected(delta_check=30): - last_check = r_cache.hget('crawler:splash:manager', 'last_check') - if last_check: - if int(time.time()) - int(last_check) > delta_check: - ping_splash_manager() - else: - ping_splash_manager() - res = r_cache.hget('crawler:splash:manager', 'connected') - return res == 'True' - -def update_splash_manager_connection_status(is_connected, req_error=None): - r_cache.hset('crawler:splash:manager', 'connected', str(is_connected)) - r_cache.hset('crawler:splash:manager', 'last_check', int(time.time())) - if not req_error: - r_cache.hdel('crawler:splash:manager', 'error') - else: - r_cache.hset('crawler:splash:manager', 'status_code', req_error['status_code']) - r_cache.hset('crawler:splash:manager', 'error', req_error['error']) - -def get_splash_manager_connection_metadata(force_ping=False): - dict_manager={} - if force_ping: - dict_manager['status'] = ping_splash_manager() - else: - dict_manager['status'] = is_splash_manager_connected() - if not dict_manager['status']: - dict_manager['status_code'] = r_cache.hget('crawler:splash:manager', 'status_code') - dict_manager['error'] = r_cache.hget('crawler:splash:manager', 'error') - return dict_manager - - ## API ## -def ping_splash_manager(): - splash_manager_url = get_splash_manager_url() - if not splash_manager_url: - return False - try: - req = requests.get('{}/api/v1/ping'.format(splash_manager_url), headers={"Authorization": get_splash_api_key()}, verify=False) - if req.status_code == 200: - update_splash_manager_connection_status(True) - return True - else: - try: - res = req.json() - if 'reason' in res: - req_error = {'status_code': req.status_code, 'error': res['reason']} - else: - print(req.json()) - req_error = {'status_code': req.status_code, 'error': json.dumps(req.json())} - except json.decoder.JSONDecodeError: - print(req.status_code) - print(req.headers) - req_error = {'status_code': req.status_code, 'error': 'Invalid response'} - update_splash_manager_connection_status(False, req_error=req_error) - return False - except requests.exceptions.ConnectionError: - pass - # splash manager unreachable - req_error = {'status_code': 500, 'error': 'splash manager unreachable'} - update_splash_manager_connection_status(False, req_error=req_error) - return False - -def get_splash_manager_session_uuid(): - try: - req = requests.get('{}/api/v1/get/session_uuid'.format(get_splash_manager_url()), headers={"Authorization": get_splash_api_key()}, verify=False) - if req.status_code == 200: - res = req.json() - if res: - return res['session_uuid'] - else: - print(req.json()) - except (requests.exceptions.ConnectionError, requests.exceptions.MissingSchema): - # splash manager unreachable - update_splash_manager_connection_status(False) - -def get_splash_manager_version(): - splash_manager_url = get_splash_manager_url() - if splash_manager_url: - try: - req = 
requests.get('{}/api/v1/version'.format(splash_manager_url), headers={"Authorization": get_splash_api_key()}, verify=False) - if req.status_code == 200: - return req.json()['message'] - else: - print(req.json()) - except requests.exceptions.ConnectionError: - pass - -def get_all_splash_manager_containers_name(): - req = requests.get('{}/api/v1/get/splash/all'.format(get_splash_manager_url()), headers={"Authorization": get_splash_api_key()}, verify=False) - if req.status_code == 200: - return req.json() - else: - print(req.json()) - -def get_all_splash_manager_proxies(): - req = requests.get('{}/api/v1/get/proxies/all'.format(get_splash_manager_url()), headers={"Authorization": get_splash_api_key()}, verify=False) - if req.status_code == 200: - return req.json() - else: - print(req.json()) - -def _restart_splash_docker(splash_port, splash_name): - dict_to_send = {'port': splash_port, 'name': splash_name} - req = requests.post('{}/api/v1/splash/restart'.format(get_splash_manager_url()), headers={"Authorization": get_splash_api_key()}, verify=False, json=dict_to_send) - if req.status_code == 200: - return req.json() - else: - print(req.json()) - -def api_save_splash_manager_url_api(data): - # unpack json - manager_url = data.get('url', None) - api_key = data.get('api_key', None) - if not manager_url or not api_key: - return ({'status': 'error', 'reason': 'No url or API key supplied'}, 400) - # check if is valid url - try: - result = urlparse(manager_url) - if not all([result.scheme, result.netloc]): - return ({'status': 'error', 'reason': 'Invalid url'}, 400) - except: - return ({'status': 'error', 'reason': 'Invalid url'}, 400) - - # check if is valid key - if not is_valid_api_key(api_key): - return ({'status': 'error', 'reason': 'Invalid API key'}, 400) - - save_splash_manager_url_api(manager_url, api_key) - return ({'url': manager_url, 'api_key': get_hidden_splash_api_key()}, 200) - ## -- ## - - ## SPLASH ## -def get_all_splash(r_list=False): - res = r_serv_onion.smembers('all_splash') - if not res: - res = set() - if r_list: - return list(res) - else: - return res - -def get_splash_proxy(splash_name): - return r_serv_onion.hget('splash:metadata:{}'.format(splash_name), 'proxy') - -def get_splash_all_url(splash_name, r_list=False): - res = r_serv_onion.smembers('splash:url:{}'.format(splash_name)) - if not res: - res = set() - if r_list: - return list(res) - else: - return res - -def get_splash_name_by_url(splash_url): - return r_serv_onion.get('splash:map:url:name:{}'.format(splash_url)) - -def get_splash_crawler_type(splash_name): - return r_serv_onion.hget('splash:metadata:{}'.format(splash_name), 'crawler_type') - -def get_splash_crawler_description(splash_name): - return r_serv_onion.hget('splash:metadata:{}'.format(splash_name), 'description') - -def get_splash_crawler_metadata(splash_name): - dict_splash = {} - dict_splash['proxy'] = get_splash_proxy(splash_name) - dict_splash['type'] = get_splash_crawler_type(splash_name) - dict_splash['description'] = get_splash_crawler_description(splash_name) - return dict_splash - -def get_all_splash_crawler_metadata(): - dict_splash = {} - for splash_name in get_all_splash(): - dict_splash[splash_name] = get_splash_crawler_metadata(splash_name) - return dict_splash - -def get_all_splash_by_proxy(proxy_name, r_list=False): - res = r_serv_onion.smembers('proxy:splash:{}'.format(proxy_name)) - if res: - if r_list: - return list(res) - else: - return res - else: - return [] - -def get_all_splash_name_by_crawler_type(crawler_type): - l_splash_name 
= [] - for splash_name in get_all_splash(): - if get_splash_crawler_type(splash_name) == crawler_type: - l_splash_name.append(splash_name) - return l_splash_name - -def get_all_splash_url_by_crawler_type(crawler_type): - l_splash_url = [] - for splash_name in get_all_splash_name_by_crawler_type(crawler_type): - for splash_url in get_splash_all_url(splash_name, r_list=True): - l_splash_url.append(splash_url) - return l_splash_url - -def delete_all_splash_containers(): - for splash_name in get_all_splash(): - delete_splash_container(splash_name) - -def delete_splash_container(splash_name): - r_serv_onion.srem('proxy:splash:{}'.format(get_splash_proxy(splash_name)), splash_name) - r_serv_onion.delete('splash:metadata:{}'.format(splash_name)) - - for splash_url in get_splash_all_url(splash_name): - r_serv_onion.delete('splash:map:url:name:{}'.format(splash_url), splash_name) - r_serv_onion.srem('splash:url:{}'.format(splash_name), splash_url) - r_serv_onion.srem('all_splash', splash_name) - ## -- ## ## PROXY ## def get_all_proxies(r_list=False): - res = r_serv_onion.smembers('all_proxy') + res = r_serv_onion.smembers('all_proxies') if res: return list(res) else: @@ -1403,170 +1327,168 @@ def delete_proxy(proxy_name): # # TODO: force delete (delete all proxy) #if proxy_splash: # print('error, a splash container is using this proxy') r_serv_onion.delete('proxy:metadata:{}'.format(proxy_name)) - r_serv_onion.srem('all_proxy', proxy_name) + r_serv_onion.srem('all_proxies', proxy_name) ## -- ## - ## LOADER ## -def load_all_splash_containers(): - delete_all_splash_containers() - all_splash_containers_name = get_all_splash_manager_containers_name() - for splash_name in all_splash_containers_name: - r_serv_onion.sadd('all_splash', splash_name) +#### ---- #### - proxy = all_splash_containers_name[splash_name]['proxy'] - if not proxy: - proxy = {'name': 'no_proxy', 'crawler_type': 'web'} - r_serv_onion.sadd('proxy:splash:{}'.format(proxy['name']), splash_name) +# # # # CRAWLER LACUS # # # # - r_serv_onion.hset('splash:metadata:{}'.format(splash_name), 'crawler_type', proxy['crawler_type']) - r_serv_onion.hset('splash:metadata:{}'.format(splash_name), 'proxy', proxy['name']) - description = all_splash_containers_name[splash_name].get('description', None) - if description: - r_serv_onion.hset('splash:metadata:{}'.format(splash_name), 'description', description) +def get_lacus_url(): + return r_db.hget('crawler:lacus', 'url') - for port in all_splash_containers_name[splash_name]['ports']: - splash_url = get_splash_url_from_manager_url(get_splash_manager_url(), port) - r_serv_onion.sadd('splash:url:{}'.format(splash_name), splash_url) - r_serv_onion.set('splash:map:url:name:{}'.format(splash_url), splash_name) +def get_lacus_api_key(reload=False): # TODO: add in db config + return r_db.hget('crawler:lacus', 'key') -def load_all_proxy(): - delete_all_proxies() - all_proxies = get_all_splash_manager_proxies() - for proxy_name in all_proxies: - proxy_dict = all_proxies[proxy_name] - r_serv_onion.hset('proxy:metadata:{}'.format(proxy_name), 'host', proxy_dict['host']) - r_serv_onion.hset('proxy:metadata:{}'.format(proxy_name), 'port', proxy_dict['port']) - r_serv_onion.hset('proxy:metadata:{}'.format(proxy_name), 'type', proxy_dict['type']) - r_serv_onion.hset('proxy:metadata:{}'.format(proxy_name), 'crawler_type', proxy_dict['crawler_type']) - description = all_proxies[proxy_name].get('description', None) - if description: - r_serv_onion.hset('proxy:metadata:{}'.format(proxy_name), 'description', description) 
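# Illustrative sketch, not part of this change set: the Lacus connection
# settings read by get_lacus_url() / get_lacus_api_key() above are two fields
# of a single Kvrocks hash, for example (hypothetical URL):
#   r_db.hset('crawler:lacus', 'url', 'https://lacus.example.org')
#   r_db.hset('crawler:lacus', 'key', '<api key>')
# save_lacus_url_api() below writes exactly these two fields.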
- r_serv_onion.sadd('all_proxy', proxy_name) +# TODO Rewrite with new API key +def get_hidden_lacus_api_key(): # TODO: add in db config + key = get_lacus_api_key() + if key: + if len(key)==41: + return f'{key[:4]}*********************************{key[-4:]}' -def reload_splash_and_proxies_list(): - if ping_splash_manager(): - # LOAD PROXIES containers - load_all_proxy() - # LOAD SPLASH containers - load_all_splash_containers() - return True - else: +# TODO Rewrite with new API key +def is_valid_api_key(api_key, search=re.compile(r'[^a-zA-Z0-9_-]').search): + if len(api_key) != 41: return False - # # TODO: kill crawler screen ? - ## -- ## + return not bool(search(api_key)) - ## SPLASH CONTROLLER ## -def launch_ail_splash_crawler(splash_url, script_options=''): - screen_name = 'Crawler_AIL' - dir_project = os.environ['AIL_HOME'] - script_location = os.path.join(os.environ['AIL_BIN']) - script_name = 'Crawler.py' - screen.create_screen(screen_name) - screen.launch_uniq_windows_script(screen_name, splash_url, dir_project, script_location, script_name, script_options=script_options, kill_previous_windows=True) +def save_lacus_url_api(url, api_key): + r_db.hset('crawler:lacus', 'url', url) + r_db.hset('crawler:lacus', 'key', api_key) + +def is_lacus_connected(delta_check=30): + last_check = r_cache.hget('crawler:lacus', 'last_check') + if last_check: + if int(time.time()) - int(last_check) > delta_check: + ping_lacus() + else: + ping_lacus() + is_connected = r_cache.hget('crawler:lacus', 'connected') + return is_connected == 'True' + +def get_lacus_connection_metadata(force_ping=False): + dict_manager={} + if force_ping: + dict_manager['status'] = ping_lacus() + else: + dict_manager['status'] = is_lacus_connected() + if not dict_manager['status']: + dict_manager['status_code'] = r_cache.hget('crawler:lacus', 'status_code') + dict_manager['error'] = r_cache.hget('crawler:lacus', 'error') + return dict_manager + +def get_lacus(): + url = get_lacus_url() + if url: + return PyLacus(get_lacus_url()) + +# TODO CATCH EXCEPTIONS +def ping_lacus(): + # TODO CATCH EXCEPTION + lacus = get_lacus() + if not lacus: + ping = False + else: + ping = lacus.is_up + update_lacus_connection_status(ping) + return ping + +def update_lacus_connection_status(is_connected, req_error=None): + r_cache.hset('crawler:lacus', 'connected', str(is_connected)) + r_cache.hset('crawler:lacus', 'last_check', int(time.time())) + if not req_error: + r_cache.hdel('crawler:lacus', 'error') + else: + r_cache.hset('crawler:lacus', 'status_code', req_error['status_code']) + r_cache.hset('crawler:lacus', 'error', req_error['error']) + +def api_save_lacus_url_key(data): + # unpack json + manager_url = data.get('url', None) + api_key = data.get('api_key', None) + if not manager_url: # or not api_key: + return {'status': 'error', 'reason': 'No url or API key supplied'}, 400 + # check if is valid url + try: + result = urlparse(manager_url) + if not all([result.scheme, result.netloc]): + return {'status': 'error', 'reason': 'Invalid url'}, 400 + except: + return {'status': 'error', 'reason': 'Invalid url'}, 400 + + # # check if is valid key CURRENTLY DISABLE + # if not is_valid_api_key(api_key): + # return ({'status': 'error', 'reason': 'Invalid API key'}, 400) + + save_lacus_url_api(manager_url, api_key) + return {'url': manager_url, 'api_key': get_hidden_lacus_api_key()}, 200 + + + + + ## PROXY ## + + # TODO SAVE PROXY URL + ADD PROXY TESTS + # -> name + url + + ## PROXY ## def is_test_ail_crawlers_successful(): - return 
r_serv_onion.hget('crawler:tor:test', 'success') == 'True' + return r_db.hget('crawler:tor:test', 'success') == 'True' + def get_test_ail_crawlers_message(): - return r_serv_onion.hget('crawler:tor:test', 'message') + return r_db.hget('crawler:tor:test', 'message') + def save_test_ail_crawlers_result(test_success, message): - r_serv_onion.hset('crawler:tor:test', 'success', bool(test_success)) - r_serv_onion.hset('crawler:tor:test', 'message', message) + r_db.hset('crawler:tor:test', 'success', str(test_success)) + r_db.hset('crawler:tor:test', 'message', message) -# # FIXME: # TODO: stderr CATCH ????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? def test_ail_crawlers(): - # # TODO: test regular domain - if not ping_splash_manager(): - manager_url = get_splash_manager_url() - error_message = f'Error: Can\'t connect to AIL Splash Manager, {manager_url}' + # # TODO: test web domain + if not ping_lacus(): + lacus_url = get_lacus_url() + error_message = f'Error: Can\'t connect to AIL Lacus, {lacus_url}' print(error_message) save_test_ail_crawlers_result(False, error_message) return False - splash_url = get_all_splash_url_by_crawler_type('tor') - if not splash_url: - error_message = f'Error: No Tor Splash Launched' - print(error_message) - save_test_ail_crawlers_result(False, error_message) - return False - splash_url = splash_url[0] + lacus = get_lacus() commit_id = git_status.get_last_commit_id_from_local() - crawler_options = {'html': True, - 'har': False, - 'png': False, - 'depth_limit': 0, - 'closespider_pagecount': 100, - 'cookiejar_uuid': None, - 'user_agent': commit_id + '-AIL SPLASH CRAWLER'} - date = {'date_day': datetime.now().strftime("%Y%m%d"), - 'date_month': datetime.now().strftime("%Y%m"), - 'epoch': int(time.time())} - crawler_config = {'splash_url': f'http://{splash_url}', - 'service_type': 'onion', - 'url': 'http://eswpccgr5xyovsahffkehgleqthrasfpfdblwbs4lstd345dwq5qumqd.onion', - 'domain': 'eswpccgr5xyovsahffkehgleqthrasfpfdblwbs4lstd345dwq5qumqd.onion', - 'port': 80, - 'original_item': None, - 'item': None, - 'crawler_options': crawler_options, - 'date': date, - 'requested': 'test'} - - ## CHECK IF SPLASH AVAILABLE ## - try: - r = requests.get(f'http://{splash_url}' , timeout=30.0) - retry = False - except Exception as e: - error_message = f'Error: Can\'t connect to Splash Docker, http://{splash_url}' - print(error_message) - save_test_ail_crawlers_result(False, error_message) - return False - ## -- ## + user_agent = f'commit_id-AIL LACUS CRAWLER' + domain = 'eswpccgr5xyovsahffkehgleqthrasfpfdblwbs4lstd345dwq5qumqd.onion' + url = 'http://eswpccgr5xyovsahffkehgleqthrasfpfdblwbs4lstd345dwq5qumqd.onion' ## LAUNCH CRAWLER, TEST MODE ## - set_current_crawler_status(splash_url, 'CRAWLER TEST', started_time=True, crawled_domain='TEST DOMAIN', crawler_type='onion') - UUID = str(uuid.uuid4()) - r_cache.set('crawler_request:{}'.format(UUID), json.dumps(crawler_config)) - - ## LAUNCH CRAWLER, TEST MODE ## - tor_crawler_script = os.path.join(os.environ['AIL_BIN'], 'torcrawler/tor_crawler.py') - process = subprocess.Popen(["python", tor_crawler_script, UUID], - stdout=subprocess.PIPE) - while process.poll() is None: + # set_current_crawler_status(splash_url, 'CRAWLER TEST', started_time=True, + # crawled_domain='TEST DOMAIN', crawler_type='onion') + capture_uuid = lacus.enqueue(url=url, depth=0, user_agent=user_agent, proxy='force_tor', + force=True, general_timeout_in_sec=90) + status = 
lacus.get_capture_status(capture_uuid) + launch_time = int(time.time()) # capture timeout + while int(time.time()) - launch_time < 60 and status != CaptureStatus.DONE: + # DEBUG + print(int(time.time()) - launch_time) + print(status) time.sleep(1) + status = lacus.get_capture_status(capture_uuid) - if process.returncode == 0: - # Scrapy-Splash ERRORS - stderr = process.stdout.read().decode() - #print(stderr) - if stderr: - print(f'stderr: {stderr}') - save_test_ail_crawlers_result(False, f'Error: {stderr}') - set_current_crawler_status(splash_url, 'Error') - - output = process.stdout.read().decode() - #print(output) - # error: splash:Connection to proxy refused - if 'Connection to proxy refused' in output: - print('{} SPASH, PROXY DOWN OR BAD CONFIGURATION'.format(splash_url)) - save_test_ail_crawlers_result(False, 'SPASH, PROXY DOWN OR BAD CONFIGURATION') - set_current_crawler_status(splash_url, 'Error') - return False - else: - set_current_crawler_status(splash_url, 'Waiting') - return True - else: - # ERROR - stderr = process.stdout.read().decode() - output = process.stdout.read().decode() - error = f'-stderr-\n{stderr}\n-stdout-\n{output}' - print(error) - save_test_ail_crawlers_result(splash_url, error) + # TODO CRAWLER STATUS OR QUEUED CAPTURE LIST + entries = lacus.get_capture(capture_uuid) + if 'error' in entries: + save_test_ail_crawlers_result(False, entries['error']) return False - return True - ## -- ## + elif 'html' in entries and entries['html']: + mess = 'It works!' + if mess in entries['html']: + save_test_ail_crawlers_result(True, mess) + return True + else: + return False + return False #### ---- #### @@ -1574,14 +1496,19 @@ def test_ail_crawlers(): #### ---- #### + +# TODO MOVE ME +load_blacklist() + if __name__ == '__main__': # res = get_splash_manager_version() # res = test_ail_crawlers() # res = is_test_ail_crawlers_successful() # print(res) # print(get_test_ail_crawlers_message()) - #print(get_all_queues_stats()) + # print(get_all_queues_stats()) - #res = get_auto_crawler_all_domain() - res = get_all_cookiejar() + # res = get_auto_crawler_all_domain() + # res = get_all_cookiejar() + res = unpack_url('http://test.com/') print(res) diff --git a/bin/lib/data_retention_engine.py b/bin/lib/data_retention_engine.py index 55624dd4..6ccc33b7 100755 --- a/bin/lib/data_retention_engine.py +++ b/bin/lib/data_retention_engine.py @@ -8,7 +8,7 @@ sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib')) import ConfigLoader config_loader = ConfigLoader.ConfigLoader() -r_serv_db = config_loader.get_db_conn("Kvrocks_DB") +r_serv_db = config_loader.get_db_conn("Kvrocks_Objects") config_loader = None def get_first_object_date(object_type, subtype, field=''): @@ -24,15 +24,15 @@ def get_last_object_date(object_type, subtype, field=''): return int(last_date) def _set_first_object_date(object_type, subtype, date, field=''): - return r_serv_db.zadd('objs:first_date', f'{object_type}:{subtype}:{field}', date) + return r_serv_db.zadd('objs:first_date', {f'{object_type}:{subtype}:{field}': date}) def _set_last_object_date(object_type, subtype, date, field=''): - return r_serv_db.zadd('objs:last_date', f'{object_type}:{subtype}:{field}', date) + return r_serv_db.zadd('objs:last_date', {f'{object_type}:{subtype}:{field}': float(date)}) def update_first_object_date(object_type, subtype, date, field=''): first_date = get_first_object_date(object_type, subtype, field=field) if int(date) < first_date: - _set_first_object_date(object_typel, subtype, date, field=field) + 
_set_first_object_date(object_type, subtype, date, field=field) return date else: return first_date diff --git a/bin/lib/item_basic.py b/bin/lib/item_basic.py index b9120f6d..b62b0193 100755 --- a/bin/lib/item_basic.py +++ b/bin/lib/item_basic.py @@ -7,15 +7,15 @@ import gzip import magic -sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/')) -import ConfigLoader -import Tag +sys.path.append(os.environ['AIL_BIN']) +################################## +# Import Project packages +################################## +from lib import ConfigLoader +from lib import Tag + config_loader = ConfigLoader.ConfigLoader() -# get and sanityze PASTE DIRECTORY -PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], config_loader.get_config_str("Directories", "pastes")) + '/' -PASTES_FOLDER = os.path.join(os.path.realpath(PASTES_FOLDER), '') - r_cache = config_loader.get_redis_conn("Redis_Cache") r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata") config_loader = None @@ -28,15 +28,15 @@ def exist_item(item_id): return False def get_item_filepath(item_id): - filename = os.path.join(PASTES_FOLDER, item_id) + filename = os.path.join(ConfigLoader.get_items_dir(), item_id) return os.path.realpath(filename) def get_item_date(item_id, add_separator=False): - l_directory = item_id.split('/') + l_dir = item_id.split('/') if add_separator: - return '{}/{}/{}'.format(l_directory[-4], l_directory[-3], l_directory[-2]) + return f'{l_dir[-4]}/{l_dir[-3]}/{l_dir[-2]}' else: - return '{}{}{}'.format(l_directory[-4], l_directory[-3], l_directory[-2]) + return f'{l_dir[-4]}{l_dir[-3]}{l_dir[-2]}' def get_basename(item_id): return os.path.basename(item_id) @@ -53,17 +53,17 @@ def get_item_domain(item_id): return item_id[19:-36] def get_item_content_binary(item_id): - item_full_path = os.path.join(PASTES_FOLDER, item_id) + item_full_path = os.path.join(ConfigLoader.get_items_dir(), item_id) try: with gzip.open(item_full_path, 'rb') as f: item_content = f.read() except Exception as e: print(e) - item_content = '' + item_content = b'' return item_content def get_item_content(item_id): - item_full_path = os.path.join(PASTES_FOLDER, item_id) + item_full_path = os.path.join(ConfigLoader.get_items_dir(), item_id) try: item_content = r_cache.get(item_full_path) except UnicodeDecodeError: @@ -84,7 +84,7 @@ def get_item_content(item_id): def get_item_mimetype(item_id): return magic.from_buffer(get_item_content(item_id), mime=True) -#### TREE CHILD/FATHER #### +# # # # TREE CHILD/FATHER # # # # def is_father(item_id): return r_serv_metadata.exists('paste_children:{}'.format(item_id)) @@ -127,6 +127,18 @@ def is_domain_root(item_id): def get_item_url(item_id): return r_serv_metadata.hget(f'paste_metadata:{item_id}', 'real_link') +def get_item_har(item_id): + har = '/'.join(item_id.rsplit('/')[-4:]) + har = f'{har}.json' + path = os.path.join(ConfigLoader.get_hars_dir(), har) + if os.path.isfile(path): + return har + +def get_item_har_content(har): + with open(har, 'rb') as f: + har_content = f.read() + return har_content + def get_nb_children(item_id): return r_serv_metadata.scard('paste_children:{}'.format(item_id)) @@ -140,14 +152,14 @@ def get_item_children(item_id): # # TODO: handle domain last origin in domain lib def _delete_node(item_id): # only if item isn't deleted - #if is_crawled(item_id): + # if is_crawled(item_id): # r_serv_metadata.hrem('paste_metadata:{}'.format(item_id), 'real_link') for children_id in get_item_children(item_id): r_serv_metadata.hdel('paste_metadata:{}'.format(children_id), 'father') 
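# The item tree is stored as two structures: the hash field
# paste_metadata:<child> -> 'father', and the set paste_children:<father>
# holding the child item ids. The loop above detaches every child from this
# node; the delete below then drops the node's own children set.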
r_serv_metadata.delete('paste_children:{}'.format(item_id)) # delete regular - # simple if leaf + # simple if leaf # delete item node @@ -210,7 +222,7 @@ def _get_dir_source_name(directory, source_name=None, l_sources_name=set(), filt else: for src_name in l_dir: if len(src_name) == 4: - #try: + # try: int(src_name) to_add = os.path.join(source_name) # filter sources, remove first directory @@ -218,7 +230,7 @@ def _get_dir_source_name(directory, source_name=None, l_sources_name=set(), filt to_add = to_add.replace('archive/', '').replace('alerts/', '') l_sources_name.add(to_add) return l_sources_name - #except: + # except: # pass if source_name: src_name = os.path.join(source_name, src_name) @@ -227,7 +239,7 @@ def _get_dir_source_name(directory, source_name=None, l_sources_name=set(), filt def get_all_items_sources(filter_dir=False, r_list=False): - res = _get_dir_source_name(PASTES_FOLDER, filter_dir=filter_dir) + res = _get_dir_source_name(ConfigLoader.get_items_dir(), filter_dir=filter_dir) if res: if r_list: res = list(res) diff --git a/bin/lib/objects/CryptoCurrencies.py b/bin/lib/objects/CryptoCurrencies.py index 3983612e..279e135f 100755 --- a/bin/lib/objects/CryptoCurrencies.py +++ b/bin/lib/objects/CryptoCurrencies.py @@ -52,9 +52,9 @@ class CryptoCurrency(AbstractSubtypeObject): def get_link(self, flask_context=False): if flask_context: - url = url_for('correlation.show_correlation', object_type=self.type, type_id=self.subtype, correlation_id=self.id) + url = url_for('correlation.show_correlation', type=self.type, subtype=self.subtype, id=self.id) else: - url = f'{baseurl}/correlation/show_correlation?object_type={self.type}&type_id={self.subtype}&correlation_id={self.id}' + url = f'{baseurl}/correlation/show?type={self.type}&subtype={self.subtype}&id={self.id}' return url def get_svg_icon(self): @@ -89,7 +89,11 @@ class CryptoCurrency(AbstractSubtypeObject): return obj def get_meta(self, options=set()): - return self._get_meta() + meta = self._get_meta() + meta['id'] = self.id + meta['subtype'] = self.subtype + meta['tags'] = self.get_tags() + return meta diff --git a/bin/lib/objects/Cves.py b/bin/lib/objects/Cves.py new file mode 100755 index 00000000..6290dc8c --- /dev/null +++ b/bin/lib/objects/Cves.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +import os +import sys + +from flask import url_for + +sys.path.append(os.environ['AIL_BIN']) +################################## +# Import Project packages +################################## +from lib.ConfigLoader import ConfigLoader +from lib.objects.abstract_daterange_object import AbstractDaterangeObject +from packages import Date + +config_loader = ConfigLoader() +r_objects = config_loader.get_db_conn("Kvrocks_Objects") +baseurl = config_loader.get_config_str("Notifications", "ail_domain") +config_loader = None + + +################################################################################ +################################################################################ +################################################################################ + +# # TODO: COMPLETE CLASS + +class Cve(AbstractDaterangeObject): + """ + AIL Cve Object. 
+ """ + + def __init__(self, id): + super(Cve, self).__init__('cve', id) + + # def get_ail_2_ail_payload(self): + # payload = {'raw': self.get_gzip_content(b64=True), + # 'compress': 'gzip'} + # return payload + + # # WARNING: UNCLEAN DELETE /!\ TEST ONLY /!\ + def delete(self): + # # TODO: + pass + + def get_link(self, flask_context=False): + if flask_context: + url = url_for('correlation.show_correlation', type=self.type, id=self.id) + else: + url = f'{baseurl}/correlation/show?type={self.type}&id={self.id}' + return url + + # TODO # CHANGE COLOR + def get_svg_icon(self): + return {'style': 'fas', 'icon': '\uf188', 'color': '#1E88E5', 'radius': 5} + + # TODO # TODO # TODO # TODO # TODO # TODO # TODO # TODO + def get_misp_object(self): + pass + + def get_meta(self, options=set()): + meta = self._get_meta(options=options) + meta['id'] = self.id + meta['subtype'] = self.subtype + meta['tags'] = self.get_tags() + return meta + + def add(self, date, item_id): + self._add(date, item_id) + + +# TODO # TODO # TODO # TODO # TODO # TODO # TODO # TODO # TODO # TODO +def get_all_cves(): + cves = [] + return cves + +def get_cves_by_date(date): + # return r_objects.zrange(f'cve:date:{date}', 0, -1) + return set(r_objects.hkeys(f'cve:date:{date}')) + +def get_cves_by_daterange(date_from, date_to): + cves = set() + for date in Date.substract_date(date_from, date_to): + cves |= get_cves_by_date(date) + return cves + +def get_cves_meta(cves_id, options=set()): + dict_cve = {} + for cve_id in cves_id: + cve = Cve(cve_id) + dict_cve[cve_id] = cve.get_meta(options=options) + return dict_cve + +def api_get_cves_meta_by_daterange(date_from, date_to): + date = Date.sanitise_date_range(date_from, date_to) + return get_cves_meta(get_cves_by_daterange(date['date_from'], date['date_to']), options=['sparkline']) + +# if __name__ == '__main__': diff --git a/bin/lib/objects/Decodeds.py b/bin/lib/objects/Decodeds.py index 087cd708..c1bc995b 100755 --- a/bin/lib/objects/Decodeds.py +++ b/bin/lib/objects/Decodeds.py @@ -69,9 +69,9 @@ class Decoded(AbstractObject): def get_link(self, flask_context=False): if flask_context: - url = url_for('correlation.show_correlation', object_type="decoded", correlation_id=self.id) + url = url_for('correlation.show_correlation', type="decoded", id=self.id) else: - url = f'{baseurl}/correlation/show_correlation?object_type={self.type}&correlation_id={self.id}' + url = f'{baseurl}/correlation/show?type={self.type}&id={self.id}' return url def get_svg_icon(self): @@ -90,7 +90,7 @@ class Decoded(AbstractObject): return {'style': 'fas', 'icon': icon, 'color': '#88CCEE', 'radius':5} ''' - Return the estimed type of a given decoded item. + Return the estimated type of a given decoded item.
:param sha1_string: sha1_string ''' @@ -170,8 +170,11 @@ class Decoded(AbstractObject): if date > last_seen: self.set_last_seen(date) - def get_meta(self): - pass + def get_meta(self, options=set()): + meta = {'id': self.id, + 'subtype': self.subtype, + 'tags': self.get_tags()} + return meta def get_meta_vt(self): meta = {} @@ -209,7 +212,7 @@ class Decoded(AbstractObject): def is_seen_this_day(self, date): for decoder in get_decoders_names(): - if r_metadata.zscore(f'{decoder_name}_date:{date}', self.id): + if r_metadata.zscore(f'{decoder}_date:{date}', self.id): return True return False @@ -324,6 +327,9 @@ class Decoded(AbstractObject): ####################################################################################### ####################################################################################### + def is_vt_enabled(self): + return VT_ENABLED + def set_vt_report(self, report): r_metadata.hset(f'metadata_hash:{self.id}', 'vt_report', report) @@ -354,7 +360,6 @@ class Decoded(AbstractObject): print(report) return report elif response.status_code == 403: - Flask_config.vt_enabled = False return 'Virustotal key is incorrect (e.g. for public API not for virustotal intelligence), authentication failed' elif response.status_code == 204: return 'Rate Limited' diff --git a/bin/lib/objects/Domains.py b/bin/lib/objects/Domains.py index d42ac4f7..1c4c389a 100755 --- a/bin/lib/objects/Domains.py +++ b/bin/lib/objects/Domains.py @@ -4,18 +4,31 @@ import os import sys import time +import zipfile +from datetime import datetime from flask import url_for +from io import BytesIO +from pymisp import MISPObject sys.path.append(os.environ['AIL_BIN']) -from lib.ConfigLoader import ConfigLoader +################################## +# Import Project packages +################################## +from lib import ConfigLoader from lib.objects.abstract_object import AbstractObject -from lib.item_basic import get_item_children, get_item_date, get_item_url +from lib.item_basic import get_item_children, get_item_date, get_item_url, get_item_har from lib import data_retention_engine -config_loader = ConfigLoader() -r_onion = config_loader.get_redis_conn("ARDB_Onion") +from packages import Date + +config_loader = ConfigLoader.ConfigLoader() +r_crawler = config_loader.get_db_conn("Kvrocks_Crawler") + +r_metadata = config_loader.get_redis_conn("ARDB_Metadata") ###################################### + +baseurl = config_loader.get_config_str("Notifications", "ail_domain") config_loader = None @@ -42,36 +55,37 @@ class Domain(AbstractObject): if str(self.id).endswith('.onion'): return 'onion' else: - return 'regular' + return 'web' def exists(self): - return r_onion.exists(f'{self.domain_type}_metadata:{self.id}') + return r_crawler.exists(f'domain:meta:{self.id}') def get_first_seen(self, r_int=False, separator=True): - first_seen = r_onion.hget(f'{self.domain_type}_metadata:{self.id}', 'first_seen') + first_seen = r_crawler.hget(f'domain:meta:{self.id}', 'first_seen') if first_seen: - if separator: - first_seen = f'{first_seen[0:4]}/{first_seen[4:6]}/{first_seen[6:8]}' - elif r_int==True: + if r_int: first_seen = int(first_seen) + elif separator: + first_seen = f'{first_seen[0:4]}/{first_seen[4:6]}/{first_seen[6:8]}' return first_seen def get_last_check(self, r_int=False, separator=True): - last_check = r_onion.hget(f'{self.domain_type}_metadata:{self.id}', 'last_check') + last_check = r_crawler.hget(f'domain:meta:{self.id}', 'last_check') if last_check is not None: - if separator: - last_check = 
f'{last_check[0:4]}/{last_check[4:6]}/{last_check[6:8]}' - elif r_format=="int": + if r_int: last_check = int(last_check) + elif separator: + last_check = f'{last_check[0:4]}/{last_check[4:6]}/{last_check[6:8]}' return last_check def _set_first_seen(self, date): - r_onion.hset(f'{self.domain_type}_metadata:{self.id}', 'first_seen', date) + r_crawler.hset(f'domain:meta:{self.id}', 'first_seen', date) def _set_last_check(self, date): - r_onion.hset(f'{self.domain_type}_metadata:{self.id}', 'last_check', date) + r_crawler.hset(f'domain:meta:{self.id}', 'last_check', date) def update_daterange(self, date): + date = int(date) first_seen = self.get_first_seen(r_int=True) last_check = self.get_last_check(r_int=True) if not first_seen: @@ -82,65 +96,101 @@ class Domain(AbstractObject): elif int(last_check) < date: self._set_last_check(date) - def get_last_origin(self): - return r_onion.hget(f'{self.domain_type}_metadata:{self.id}', 'paste_parent') + def get_last_origin(self, obj=False): + origin = {'item': r_crawler.hget(f'domain:meta:{self.id}', 'last_origin')} + if obj and origin['item']: + if origin['item'] != 'manual' and origin['item'] != 'auto': + item_id = origin['item'] + origin['domain'] = r_metadata.hget(f'paste_metadata:{item_id}', 'domain') + origin['url'] = r_metadata.hget(f'paste_metadata:{item_id}', 'url') + return origin def set_last_origin(self, origin_id): - r_onion.hset(f'{self.domain_type}_metadata:{self.id}', 'paste_parent', origin_id) + r_crawler.hset(f'domain:meta:{self.id}', 'last_origin', origin_id) - def is_up(self, ports=[]): - if not ports: - ports = self.get_ports() - for port in ports: - res = r_onion.zrevrange(f'crawler_history_{self.domain_type}:{self.id}:{port}', 0, 0, withscores=True) - if res: - item_core, epoch = res[0] - try: - epoch = int(item_core) - except: - print('True') - return True - print('False') + def is_up(self): + res = r_crawler.zrevrange(f'domain:history:{self.id}', 0, 0, withscores=True) + if res: + item_core, epoch = res[0] + try: + int(item_core) + except ValueError: + return True return False def was_up(self): - return r_onion.hexists(f'{self.domain_type}_metadata:{self.id}', 'ports') + return r_crawler.exists(f'domain:history:{self.id}') + + def is_up_by_month(self, date_month): + # FIXME DIRTY PATCH + if r_crawler.exists(f'month_{self.domain_type}_up:{date_month}'): + return r_crawler.sismember(f'month_{self.domain_type}_up:{date_month}', self.get_id()) + else: + return False + + def is_up_this_month(self): + date_month = datetime.now().strftime("%Y%m") + return self.is_up_by_month(date_month) + + def is_down_by_day(self, date): + # FIXME DIRTY PATCH + if r_crawler.exists(f'{self.domain_type}_down:{date}'): + return r_crawler.sismember(f'{self.domain_type}_down:{date}', self.id) + else: + return False + + def is_down_today(self): + date = datetime.now().strftime("%Y%m%d") + return self.is_down_by_day(date) + + def is_up_by_epoch(self, epoch): + history = r_crawler.zrevrangebyscore(f'domain:history:{self.id}', int(epoch), int(epoch)) + if not history: + return False + else: + history = history[0] + try: + int(history) + return False + except ValueError: + return True def get_ports(self, r_set=False): - l_ports = r_onion.hget(f'{self.domain_type}_metadata:{self.id}', 'ports') + l_ports = r_crawler.hget(f'domain:meta:{self.id}', 'ports') if l_ports: l_ports = l_ports.split(";") - if r_set: - return set(l_ports) - else: - return l_ports - return [] + else: + l_ports = [] + if r_set: + return set(l_ports) + else: + return l_ports def 
_set_ports(self, ports): - ports = ';'.join(ports) - r_onion.hset(f'{self.domain_type}_metadata:{self.id}', 'ports', ports) + ports = ';'.join(str(p) for p in ports) + r_crawler.hset(f'domain:meta:{self.id}', 'ports', ports) def add_ports(self, port): ports = self.get_ports(r_set=True) ports.add(port) self._set_ports(ports) - def get_history_by_port(self, port, status=False, root=False): - ''' + def get_history(self, status=False, root=False): + """ Return . :return: :rtype: list of tuple (item_core, epoch) - ''' - history_tuple = r_onion.zrange(f'crawler_history_{self.domain_type}:{self.id}:{port}', 0, -1, withscores=True) + """ + history_tuple = r_crawler.zrange(f'domain:history:{self.id}', 0, -1, withscores=True) history = [] for root_id, epoch in history_tuple: dict_history = {} - epoch = int(epoch) # force int + epoch = int(epoch) # force int dict_history["epoch"] = epoch - dict_history["date"] = time.strftime('%Y/%m/%d - %H:%M.%S', time.gmtime(epoch_val)) + dict_history["date"] = time.strftime('%Y/%m/%d - %H:%M.%S', time.gmtime(epoch)) try: - int(root_item) + int(root_id) if status: dict_history['status'] = False except ValueError: @@ -152,30 +202,31 @@ class Domain(AbstractObject): return history def get_languages(self): - return r_onion.smembers(f'domain:language:{self.id}') + return r_crawler.smembers(f'domain:language:{self.id}') def get_meta_keys(self): return ['type', 'first_seen', 'last_check', 'last_origin', 'ports', 'status', 'tags', 'languages'] # options: set of optional meta fields def get_meta(self, options=set()): - meta = {} - meta['type'] = self.domain_type - meta['first_seen'] = self.get_first_seen() - meta['last_check'] = self.get_last_check() - meta['tags'] = self.get_tags(r_list=True) - meta['ports'] = self.get_ports() - meta['status'] = self.is_up(ports=meta['ports']) + meta = {'type': self.domain_type, + 'id': self.id, + 'domain': self.id, # TODO Remove me -> Fix templates + 'first_seen': self.get_first_seen(), + 'last_check': self.get_last_check(), + 'tags': self.get_tags(r_list=True), + 'status': self.is_up() + } + # meta['ports'] = self.get_ports() if 'last_origin' in options: - meta['last_origin'] = self.get_last_origin() - #meta['is_tags_safe'] = ################################## + meta['last_origin'] = self.get_last_origin(obj=True) + # meta['is_tags_safe'] = ################################## if 'languages' in options: meta['languages'] = self.get_languages() - #meta['screenshot'] = + # meta['screenshot'] = return meta - # # WARNING: UNCLEAN DELETE /!\ TEST ONLY /!\ def delete(self): # # TODO: @@ -196,12 +247,12 @@ class Domain(AbstractObject): else: style = 'fab' icon = '\uf13b' - return {'style': style, 'icon': icon, 'color':color, 'radius':5} + return {'style': style, 'icon': icon, 'color': color, 'radius': 5} def is_crawled_item(self, item_id): - domain_lenght = len(self.id) - if len(item_id) > (domain_lenght+48): - if item_id[-36-domain_lenght:-36] == self.id: + domain_length = len(self.id) + if len(item_id) > (domain_length+48): + if item_id[-36-domain_length:-36] == self.id: return True return False @@ -215,169 +266,231 @@ class Domain(AbstractObject): for item_id in get_item_children(root_id): if self.is_crawled_item(item_id): crawled_items.append(item_id) - crawled_items.extend(self.get_crawled_items_children(self.id, item_id)) + crawled_items.extend(self.get_crawled_items_children(item_id)) return crawled_items - def get_all_urls(self, date=False): ## parameters to add first_seen/last_seen ?????????????????????????????? 
+ def get_last_item_root(self): + root_item = r_crawler.zrevrange(f'domain:history:{self.id}', 0, 0, withscores=True) + if not root_item: + return None + root_item = root_item[0][0] + try: + int(root_item) + return None + except ValueError: + pass + return root_item + + def get_item_root_by_epoch(self, epoch): + root_item = r_crawler.zrevrangebyscore(f'domain:history:{self.id}', int(epoch), int(epoch), withscores=True) + if not root_item: + return None + root_item = root_item[0][0] + try: + int(root_item) + return None + except ValueError: + pass + return root_item + + def get_crawled_items_by_epoch(self, epoch=None): + if epoch: + root_item = self.get_item_root_by_epoch(epoch) + else: + root_item = self.get_last_item_root() + if root_item: + return self.get_crawled_items(root_item) + + # TODO FIXME + def get_all_urls(self, date=False, epoch=None): if date: urls = {} else: urls = set() - for port in self.get_ports(): - for history in self.get_history_by_port(port, root=True): - if history.get('root'): - for item_id in self.get_crawled_items(history.get('root')): - url = get_item_url(item_id) - if url: - if date: - item_date = int(get_item_date(item_id)) - if url not in urls: - urls[url] = {'first_seen': item_date,'last_seen': item_date} - else: # update first_seen / last_seen - if item_date < urls[url]['first_seen']: - all_url[url]['first_seen'] = item_date - if item_date > urls[url]['last_seen']: - all_url[url]['last_seen'] = item_date - else: - urls.add(url) + + items = self.get_crawled_items_by_epoch(epoch=epoch) + if items: + for item_id in items: + url = get_item_url(item_id) + if url: + if date: + item_date = int(get_item_date(item_id)) + if url not in urls: + urls[url] = {'first_seen': item_date, 'last_seen': item_date} + else: # update first_seen / last_seen + if item_date < urls[url]['first_seen']: + urls[url]['first_seen'] = item_date + if item_date > urls[url]['last_seen']: + urls[url]['last_seen'] = item_date + else: + urls.add(url) return urls - def get_misp_object(self): + def get_misp_object(self, epoch=None): # create domain-ip obj obj_attrs = [] obj = MISPObject('domain-crawled', standalone=True) obj.first_seen = self.get_first_seen() obj.last_seen = self.get_last_check() - obj_attrs.append( obj.add_attribute('domain', value=self.id) ) - urls = self.get_all_urls(date=True) + obj_attrs.append(obj.add_attribute('domain', value=self.id)) + urls = self.get_all_urls(date=True, epoch=epoch) for url in urls: attribute = obj.add_attribute('url', value=url) attribute.first_seen = str(urls[url]['first_seen']) attribute.last_seen = str(urls[url]['last_seen']) - obj_attrs.append( attribute ) + obj_attrs.append(attribute) for obj_attr in obj_attrs: for tag in self.get_tags(): obj_attr.add_tag(tag) return obj + # TODO ADD MISP Event Export + # TODO DOWN DOMAIN + def get_download_zip(self, epoch=None): + hars_dir = ConfigLoader.get_hars_dir() + items_dir = ConfigLoader.get_items_dir() + screenshots_dir = ConfigLoader.get_screenshots_dir() + items = self.get_crawled_items_by_epoch(epoch=epoch) + if not items: + return None + map_file = 'ITEM ID : URL' + # zip buffer + zip_buffer = BytesIO() + with zipfile.ZipFile(zip_buffer, "a") as zf: + for item_id in items: + url = get_item_url(item_id) + basename = os.path.basename(item_id) + # Item + _write_in_zip_buffer(zf, os.path.join(items_dir, item_id), f'{basename}.gz') + map_file = map_file + f'\n{item_id} : {url}' + # HAR + har = get_item_har(item_id) + if har: + print(har) + _write_in_zip_buffer(zf, os.path.join(hars_dir, har), 
f'{basename}.json') + # Screenshot + screenshot = self._get_external_correlation('item', '', item_id, 'screenshot') + if screenshot: + screenshot = screenshot['screenshot'].pop()[1:] + screenshot = os.path.join(screenshot[0:2], screenshot[2:4], screenshot[4:6], screenshot[6:8], + screenshot[8:10], screenshot[10:12], screenshot[12:]) + _write_in_zip_buffer(zf, os.path.join(screenshots_dir, f'{screenshot}.png'), f'{basename}.png') + + zf.writestr('_URL_MAP_', BytesIO(map_file.encode()).getvalue()) + misp_object = self.get_misp_object().to_json().encode() + zf.writestr('misp.json', BytesIO(misp_object).getvalue()) + zip_buffer.seek(0) + return zip_buffer + def add_language(self, language): - r_onion.sadd('all_domains_languages', language) - r_onion.sadd(f'all_domains_languages:{self.domain_type}', language) - r_onion.sadd(f'language:domains:{self.domain_type}:{language}', self.id) - r_onion.sadd(f'domain:language:{self.id}', language) - + r_crawler.sadd('all_domains_languages', language) + r_crawler.sadd(f'all_domains_languages:{self.domain_type}', language) + r_crawler.sadd(f'language:domains:{self.domain_type}:{language}', self.id) + r_crawler.sadd(f'domain:language:{self.id}', language) ############################################################################ ############################################################################ - def create(self, first_seen, last_check, ports, status, tags, languages): + def create(self, first_seen, last_check, status, tags, languages): - r_onion.hset(f'{self.domain_type}_metadata:{self.id}', 'first_seen', first_seen) - r_onion.hset(f'{self.domain_type}_metadata:{self.id}', 'last_check', last_check) + r_crawler.hset(f'domain:meta:{self.id}', 'first_seen', first_seen) + r_crawler.hset(f'domain:meta:{self.id}', 'last_check', last_check) for language in languages: self.add_language(language) - #### CRAWLER #### - # add root_item to history # if domain down -> root_item = epoch - def _add_history_root_item(self, root_item, epoch, port): + def _add_history_root_item(self, root_item, epoch): # Create/Update crawler history - r_onion.zadd(f'crawler_history_{self.domain_type}:{self.id}:{port}', epoch, int(root_item)) + r_crawler.zadd(f'domain:history:{self.id}', {root_item: epoch}) # if domain down -> root_item = epoch - def add_history(self, epoch, port, root_item=None, date=None): + def add_history(self, epoch, root_item=None, date=None): if not date: date = time.strftime('%Y%m%d', time.gmtime(epoch)) try: - int(root_item) - except ValueError: - root_item = None + root_item = int(root_item) + status = False + except (ValueError, TypeError): + status = True data_retention_engine.update_object_date('domain', self.domain_type, date) - update_first_object_date(date, self.domain_type) - update_last_object_date(date, self.domain_type) # UP - if root_item: - r_onion.srem(f'full_{self.domain_type}_down', self.id) - r_onion.sadd(f'full_{self.domain_type}_up', self.id) - r_onion.sadd(f'{self.domain_type}_up:{date}', self.id) # # TODO: -> store first day - r_onion.sadd(f'month_{self.domain_type}_up:{date[0:6]}', self.id) # # TODO: -> store first month - self._add_history_root_item(root_item, epoch, port) + if status: + r_crawler.srem(f'full_{self.domain_type}_down', self.id) + r_crawler.sadd(f'full_{self.domain_type}_up', self.id) + r_crawler.sadd(f'{self.domain_type}_up:{date}', self.id) # # TODO: -> store first day + r_crawler.sadd(f'month_{self.domain_type}_up:{date[0:6]}', self.id) # # TODO: -> store first month + self._add_history_root_item(root_item, epoch) 
else: - if port: - r_onion.sadd(f'{self.domain_type}_down:{date}', self.id) # # TODO: -> store first month - self._add_history_root_item(epoch, epoch, port) + r_crawler.sadd(f'{self.domain_type}_down:{date}', self.id) + if self.was_up(): + self._add_history_root_item(epoch, epoch) else: - r_onion.sadd(f'{self.domain_type}_down:{date}', self.id) - if not self.was_up(): - r_onion.sadd(f'full_{self.domain_type}_down', self.id) + r_crawler.sadd(f'full_{self.domain_type}_down', self.id) - def add_crawled_item(self, url, port, item_id, item_father): + # TODO RENAME PASTE_METADATA + def add_crawled_item(self, url, item_id, item_father): r_metadata.hset(f'paste_metadata:{item_id}', 'father', item_father) - r_metadata.hset(f'paste_metadata:{item_id}', 'domain', f'{self.id}:{port}') + r_metadata.hset(f'paste_metadata:{item_id}', 'domain', self.id) # FIXME REMOVE ME -> extract for real link ????????? r_metadata.hset(f'paste_metadata:{item_id}', 'real_link', url) # add this item_id to his father r_metadata.sadd(f'paste_children:{item_father}', item_id) - ##-- CRAWLER --## +############################################################################ +# In memory zipfile +def _write_in_zip_buffer(zf, path, filename): + with open(path, "rb") as f: + content = f.read() + zf.writestr( filename, BytesIO(content).getvalue()) - ############################################################################ - ############################################################################ +############################################################################ def get_all_domains_types(): - return ['onion', 'regular'] # i2p + return ['onion', 'web'] # i2p def get_all_domains_languages(): - return r_onion.smembers('all_domains_languages') + return r_crawler.smembers('all_domains_languages') def get_domains_up_by_type(domain_type): - return r_onion.smembers(f'full_{domain_type}_up') + return r_crawler.smembers(f'full_{domain_type}_up') def get_domains_down_by_type(domain_type): - return r_onion.smembers(f'full_{domain_type}_down') + return r_crawler.smembers(f'full_{domain_type}_down') -def get_first_object_date(subtype, field=''): - first_date = r_onion.zscore('objs:first_date', f'domain:{subtype}:{field}') - if not first_date: - first_date = 99999999 - return int(first_date) +def get_domains_up_by_date(date, domain_type): + return r_crawler.smembers(f'{domain_type}_up:{date}') -def get_last_object_date(subtype, field=''): - last_date = r_onion.zscore('objs:last_date', f'domain:{subtype}:{field}') - if not last_date: - last_date = 0 - return int(last_date) +def get_domains_down_by_date(date, domain_type): + return r_crawler.smembers(f'{domain_type}_down:{date}') -def _set_first_object_date(date, subtype, field=''): - return r_onion.zadd('objs:first_date', f'domain:{subtype}:{field}', date) - -def _set_last_object_date(date, subtype, field=''): - return r_onion.zadd('objs:last_date', f'domain:{subtype}:{field}', date) - -def update_first_object_date(date, subtype, field=''): - first_date = get_first_object_date(subtype, field=field) - if int(date) < first_date: - _set_first_object_date(date, subtype, field=field) - return date - else: - return first_date - -def update_last_object_date(date, subtype, field=''): - last_date = get_last_object_date(subtype, field=field) - if int(date) > last_date: - _set_last_object_date(date, subtype, field=field) - return date - else: - return last_date +def get_domains_by_daterange(date_from, date_to, domain_type, up=True, down=False): + date_domains = {} + for date in 
Date.substract_date(date_from, date_to): + domains = [] + if up: + domains.extend(get_domains_up_by_date(date, domain_type)) + if down: + domains.extend(get_domains_down_by_date(date, domain_type)) + if domains: + date_domains[date] = list(domains) + return date_domains +def get_domains_meta(domains): + metas = [] + for domain in domains: + dom = Domain(domain) + metas.append(dom.get_meta()) + return metas ################################################################################ ################################################################################ -#if __name__ == '__main__': +if __name__ == '__main__': + dom = Domain('') + dom.get_download_zip() diff --git a/bin/lib/objects/Items.py b/bin/lib/objects/Items.py index 7d54c591..a38fb83d 100755 --- a/bin/lib/objects/Items.py +++ b/bin/lib/objects/Items.py @@ -3,10 +3,10 @@ import base64 import gzip +import magic import os import re import sys -import redis import cld3 import html2text @@ -233,8 +233,9 @@ class Item(AbstractObject): return self.id[19:-36] def get_screenshot(self): - s = r_serv_metadata.hget(f'paste_metadata:{self.id}', 'screenshot') + s = self.get_correlation('screenshot') if s: + s = s['screenshot'].pop()[1:] return os.path.join(s[0:2], s[2:4], s[4:6], s[6:8], s[8:10], s[10:12], s[12:]) def get_har(self): @@ -315,6 +316,11 @@ class Item(AbstractObject): all_languages.append(lang) return all_languages + def get_mimetype(self, content=None): + if not content: + content = self.get_content() + return magic.from_buffer(content, mime=True) + ############################################################################ ############################################################################ diff --git a/bin/lib/objects/Pgps.py b/bin/lib/objects/Pgps.py index f25f34e2..8f58ea47 100755 --- a/bin/lib/objects/Pgps.py +++ b/bin/lib/objects/Pgps.py @@ -41,14 +41,18 @@ class Pgp(AbstractSubtypeObject): pass # # TODO: - def get_meta(self): - return None + def get_meta(self, options=set()): + meta = self._get_meta() + meta['id'] = self.id + meta['subtype'] = self.subtype + meta['tags'] = self.get_tags() + return meta def get_link(self, flask_context=False): if flask_context: - url = url_for('correlation.show_correlation', object_type=self.type, type_id=self.subtype, correlation_id=self.id) + url = url_for('correlation.show_correlation', type=self.type, subtype=self.subtype, id=self.id) else: - url = f'{baseurl}/correlation/show_correlation?object_type={self.type}&type_id={self.subtype}&correlation_id={self.id}' + url = f'{baseurl}/correlation/show?type={self.type}&subtype={self.subtype}&id={self.id}' return url def get_svg_icon(self): diff --git a/bin/lib/objects/Screenshots.py b/bin/lib/objects/Screenshots.py index c4ff940f..9539587d 100755 --- a/bin/lib/objects/Screenshots.py +++ b/bin/lib/objects/Screenshots.py @@ -1,14 +1,18 @@ #!/usr/bin/env python3 # -*-coding:UTF-8 -* +import base64 import os import sys +from hashlib import sha256 from io import BytesIO from flask import url_for sys.path.append(os.environ['AIL_BIN']) -#from lib import Tag +################################## +# Import Project packages +################################## from lib.ConfigLoader import ConfigLoader from lib.objects.abstract_object import AbstractObject @@ -17,14 +21,15 @@ r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata") SCREENSHOT_FOLDER = config_loader.get_files_directory('screenshot') config_loader = None + class Screenshot(AbstractObject): """ AIL Screenshot Object. 
(strings) """ # ID = SHA256 - def __init__(self, id): - super(Screenshot, self).__init__('screenshot', id) + def __init__(self, screenshot_id): + super(Screenshot, self).__init__('screenshot', screenshot_id) # def get_ail_2_ail_payload(self): # payload = {'raw': self.get_gzip_content(b64=True), @@ -41,13 +46,13 @@ class Screenshot(AbstractObject): def get_link(self, flask_context=False): if flask_context: - url = url_for('correlation.show_correlation', object_type=self.type, correlation_id=self.id) + url = url_for('correlation.show_correlation', type=self.type, id=self.id) else: - url = f'{baseurl}/correlation/show_correlation?object_type={self.type}&correlation_id={self.id}' + url = f'{baseurl}/correlation/show?type={self.type}&id={self.id}' return url def get_svg_icon(self): - return {'style': 'fas', 'icon': '\uf03e', 'color': '#E1F5DF', 'radius':5} + return {'style': 'fas', 'icon': '\uf03e', 'color': '#E1F5DF', 'radius': 5} def get_rel_path(self, add_extension=False): rel_path = os.path.join(self.id[0:2], self.id[2:4], self.id[4:6], self.id[6:8], self.id[8:10], self.id[10:12], self.id[12:]) @@ -77,12 +82,11 @@ class Screenshot(AbstractObject): return obj def get_meta(self, options=set()): - meta = {} - meta['id'] = self.id - meta['img'] = get_screenshot_rel_path(self.id) ######### # TODO: Rename ME ?????? + meta = {'id': self.id} + meta['img'] = get_screenshot_rel_path(self.id) ######### # TODO: Rename ME ?????? meta['tags'] = self.get_tags(r_list=True) # TODO: ADD IN ABSTRACT CLASS - #meta['is_tags_safe'] = Tag.is_tags_safe(metadata_dict['tags']) ################## # TODO: ADD IN ABSZTRACT CLASS + #meta['is_tags_safe'] = Tag.is_tags_safe(metadata_dict['tags']) ################## # TODO: ADD IN ABSTRACT CLASS return meta def get_screenshot_dir(): @@ -90,7 +94,7 @@ def get_screenshot_dir(): # get screenshot relative path def get_screenshot_rel_path(sha256_str, add_extension=False): - screenshot_path = os.path.join(sha256_str[0:2], sha256_str[2:4], sha256_str[4:6], sha256_str[6:8], sha256_str[8:10], sha256_str[10:12], sha256_str[12:]) + screenshot_path = os.path.join(sha256_str[0:2], sha256_str[2:4], sha256_str[4:6], sha256_str[6:8], sha256_str[8:10], sha256_str[10:12], sha256_str[12:]) if add_extension: screenshot_path = f'{screenshot_path}.png' return screenshot_path @@ -106,5 +110,22 @@ def get_all_screenshots(): screenshots.append(screenshot_id) return screenshots +# FIXME STR SIZE LIMIT +def create_screenshot(content, size_limit=5000000, b64=True, force=False): + size = (len(content)*3) / 4 + if size <= size_limit or size_limit < 0 or force: + if b64: + content = base64.standard_b64decode(content.encode()) + screenshot_id = sha256(content).hexdigest() + screenshot = Screenshot(screenshot_id) + if not screenshot.exists(): + filepath = screenshot.get_filepath() + dirname = os.path.dirname(filepath) + if not os.path.exists(dirname): + os.makedirs(dirname) + with open(filepath, 'wb') as f: + f.write(content) + return screenshot + return None #if __name__ == '__main__': diff --git a/bin/lib/objects/Usernames.py b/bin/lib/objects/Usernames.py index 3eef0c9f..305e672a 100755 --- a/bin/lib/objects/Usernames.py +++ b/bin/lib/objects/Usernames.py @@ -10,12 +10,14 @@ from pymisp import MISPObject # sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/')) -sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/')) -import ConfigLoader - +sys.path.append(os.environ['AIL_BIN']) +################################## +# Import Project packages +################################## +from 
lib.ConfigLoader import ConfigLoader from lib.objects.abstract_subtype_object import AbstractSubtypeObject, get_all_id -config_loader = ConfigLoader.ConfigLoader() +config_loader = ConfigLoader() config_loader = None @@ -44,9 +46,9 @@ class Username(AbstractSubtypeObject): def get_link(self, flask_context=False): if flask_context: - url = url_for('correlation.show_correlation', object_type=self.type, type_id=self.subtype, correlation_id=self.id) + url = url_for('correlation.show_correlation', type=self.type, subtype=self.subtype, id=self.id) else: - url = f'{baseurl}/correlation/show_correlation?object_type={self.type}&type_id={self.subtype}&correlation_id={self.id}' + url = f'{baseurl}/correlation/show?type={self.type}&subtype={self.subtype}&id={self.id}' return url def get_svg_icon(self): @@ -61,6 +63,13 @@ class Username(AbstractSubtypeObject): icon = '\uf007' return {'style': style, 'icon': icon, 'color': '#4dffff', 'radius':5} + def get_meta(self, options=set()): + meta = self._get_meta() + meta['id'] = self.id + meta['subtype'] = self.subtype + meta['tags'] = self.get_tags() + return meta + def get_misp_object(self): obj_attrs = [] if self.subtype == 'telegram': diff --git a/bin/lib/objects/abstract_daterange_object.py b/bin/lib/objects/abstract_daterange_object.py new file mode 100755 index 00000000..bffa6d88 --- /dev/null +++ b/bin/lib/objects/abstract_daterange_object.py @@ -0,0 +1,139 @@ +# -*-coding:UTF-8 -* +""" +Base Class for AIL Objects +""" + +################################## +# Import External packages +################################## +import os +import sys +from abc import abstractmethod, ABC + +#from flask import url_for + +sys.path.append(os.environ['AIL_BIN']) +################################## +# Import Project packages +################################## +from lib.objects.abstract_object import AbstractObject +from lib.ConfigLoader import ConfigLoader +from lib.item_basic import is_crawled, get_item_domain + +from packages import Date + +# LOAD CONFIG +config_loader = ConfigLoader() +# r_metadata = config_loader.get_redis_conn("ARDB_Metadata") +r_object = config_loader.get_db_conn("Kvrocks_Objects") +config_loader = None + +class AbstractDaterangeObject(AbstractObject, ABC): + """ + Abstract Subtype Object + """ + + def __init__(self, obj_type, id): + """ Abstract for all the AIL object + + :param obj_type: object type (item, ...) 
+ :param id: Object ID + """ + super().__init__(obj_type, id) + + def exists(self): + return r_object.exists(f'{self.type}:meta:{self.id}') + + def get_first_seen(self, r_int=False): + first_seen = r_object.hget(f'{self.type}:meta:{self.id}', 'first_seen') + if r_int: + if first_seen: + return int(first_seen) + else: + return 99999999 + else: + return first_seen + + def get_last_seen(self, r_int=False): + last_seen = r_object.hget(f'{self.type}:meta:{self.id}', 'last_seen') + if r_int: + if last_seen: + return int(last_seen) + else: + return 0 + else: + return last_seen + + def get_nb_seen(self): + return r_object.hget(f'{self.type}:meta:{self.id}', 'nb') + + def get_nb_seen_by_date(self, date): + nb = r_object.hget(f'{self.type}:date:{date}', self.id) + if nb is None: + return 0 + else: + return int(nb) + + def _get_meta(self, options=[]): + meta_dict = {'first_seen': self.get_first_seen(), + 'last_seen': self.get_last_seen(), + 'nb_seen': self.get_nb_seen()} + if 'sparkline' in options: + meta_dict['sparkline'] = self.get_sparkline() + return meta_dict + + def set_first_seen(self, first_seen): + r_object.hset(f'{self.type}:meta:{self.id}', 'first_seen', first_seen) + + def set_last_seen(self, last_seen): + r_object.hset(f'{self.type}:meta:{self.id}', 'last_seen', last_seen) + + def update_daterange(self, date): + date = int(date) + # obj don't exit + if not self.exists(): + self.set_first_seen(date) + self.set_last_seen(date) + else: + first_seen = self.get_first_seen(r_int=True) + last_seen = self.get_last_seen(r_int=True) + if date < first_seen: + self.set_first_seen(date) + if date > last_seen: + self.set_last_seen(date) + + def get_sparkline(self): + sparkline = [] + for date in Date.get_previous_date_list(6): + sparkline.append(self.get_nb_seen_by_date(date)) + return sparkline + + def _add(self, date, item_id): + if not self.exists(): + self.set_first_seen(date) + self.set_last_seen(date) + r_object.sadd(f'{self.type}:all', self.id) + else: + self.update_daterange(date) + + # NB Object seen by day + r_object.hincrby(f'{self.type}:date:{date}', self.id, 1) + r_object.zincrby(f'{self.type}:date:{date}', self.id, 1) # # # # # # # # # # + # NB Object seen + r_object.hincrby(f'{self.type}:meta:{self.id}', 'nb', 1) + + # Correlations + self.add_correlation('item', '', item_id) + if is_crawled(item_id): # Domain + domain = get_item_domain(item_id) + self.add_correlation('domain', '', domain) + + # TODO:ADD objects + Stats + def _create(self, first_seen, last_seen): + self.set_first_seen(first_seen) + self.set_last_seen(last_seen) + r_object.sadd(f'{self.type}:all', self.id) + + # TODO + def _delete(self): + pass diff --git a/bin/lib/objects/abstract_object.py b/bin/lib/objects/abstract_object.py index 63a9c1b6..1cba7e75 100755 --- a/bin/lib/objects/abstract_object.py +++ b/bin/lib/objects/abstract_object.py @@ -144,7 +144,7 @@ class AbstractObject(ABC): pass @abstractmethod - def get_meta(self): + def get_meta(self, options=set()): """ get Object metadata """ @@ -165,6 +165,18 @@ class AbstractObject(ABC): def get_misp_object(self): pass + def _get_external_correlation(self, req_type, req_subtype, req_id, obj_type): + """ + Get object correlation + """ + return get_correlations(req_type, req_subtype, req_id, filter_types=[obj_type]) + + def get_correlation(self, obj_type): + """ + Get object correlation + """ + return get_correlations(self.type, self.subtype, self.id, filter_types=[obj_type]) + def get_correlations(self): """ Get object correlations diff --git 
a/bin/lib/objects/abstract_subtype_object.py b/bin/lib/objects/abstract_subtype_object.py index 188e3702..66435353 100755 --- a/bin/lib/objects/abstract_subtype_object.py +++ b/bin/lib/objects/abstract_subtype_object.py @@ -20,6 +20,8 @@ from lib.objects.abstract_object import AbstractObject from lib.ConfigLoader import ConfigLoader from lib.item_basic import is_crawled, get_item_domain +from packages import Date + # LOAD CONFIG config_loader = ConfigLoader() r_metadata = config_loader.get_redis_conn("ARDB_Metadata") @@ -115,6 +117,11 @@ class AbstractSubtypeObject(AbstractObject): if date > last_seen: self.set_last_seen(date) + def get_sparkline(self): + sparkline = [] + for date in Date.get_previous_date_list(6): + sparkline.append(self.get_nb_seen_by_date(date)) + return sparkline # # HANDLE Others objects ???? # diff --git a/bin/lib/objects/ail_objects.py b/bin/lib/objects/ail_objects.py index 467c9aeb..90b5064c 100755 --- a/bin/lib/objects/ail_objects.py +++ b/bin/lib/objects/ail_objects.py @@ -12,11 +12,15 @@ from flask import url_for sys.path.append(os.environ['AIL_BIN']) ################################## # Import Project packages +################################## from lib.ConfigLoader import ConfigLoader from lib.ail_core import get_all_objects from lib import correlations_engine +from lib import btc_ail +from lib import Tag from lib.objects.CryptoCurrencies import CryptoCurrency +from lib.objects.Cves import Cve from lib.objects.Decodeds import Decoded from lib.objects.Domains import Domain from lib.objects.Items import Item @@ -39,12 +43,11 @@ def is_valid_object_type(obj_type): def sanitize_objs_types(objs): l_types = [] - print('sanitize') - print(objs) - print(get_all_objects()) for obj in objs: if is_valid_object_type(obj): l_types.append(obj) + if not l_types: + l_types = get_all_objects() return l_types def get_object(obj_type, subtype, id): @@ -54,6 +57,8 @@ def get_object(obj_type, subtype, id): return Domain(id) elif obj_type == 'decoded': return Decoded(id) + elif obj_type == 'cve': + return Cve(id) elif obj_type == 'screenshot': return Screenshot(id) elif obj_type == 'cryptocurrency': @@ -63,23 +68,48 @@ def get_object(obj_type, subtype, id): elif obj_type == 'username': return Username(id, subtype) -def exists_obj(obj_type, subtype, id): - object = get_object(obj_type, subtype, id) - return object.exists() +def exists_obj(obj_type, subtype, obj_id): + obj = get_object(obj_type, subtype, obj_id) + if obj: + return obj.exists() + else: + return False def get_object_link(obj_type, subtype, id, flask_context=False): - object = get_object(obj_type, subtype, id) - return object.get_link(flask_context=flask_context) + obj = get_object(obj_type, subtype, id) + return obj.get_link(flask_context=flask_context) def get_object_svg(obj_type, subtype, id): - object = get_object(obj_type, subtype, id) - return object.get_svg_icon() + obj = get_object(obj_type, subtype, id) + return obj.get_svg_icon() -def get_object_meta(obj_type, subtype, id, flask_context=False): - object = get_object(obj_type, subtype, id) - meta = object.get_meta() - meta['icon'] = object.get_svg_icon() - meta['link'] = object.get_link(flask_context=flask_context) +def get_object_meta(obj_type, subtype, id, options=[], flask_context=False): + obj = get_object(obj_type, subtype, id) + meta = obj.get_meta(options=options) + meta['icon'] = obj.get_svg_icon() + meta['link'] = obj.get_link(flask_context=flask_context) + return meta + +def get_objects_meta(objs, options=[], flask_context=False): + metas = [] + for 
obj_dict in objs: + metas.append(get_object_meta(obj_dict['type'], obj_dict['subtype'], obj_dict['id'], options=options, flask_context=flask_context)) + return metas + +def get_object_card_meta(obj_type, subtype, id, related_btc=False): + obj = get_object(obj_type, subtype, id) + meta = obj.get_meta() + meta['icon'] = obj.get_svg_icon() + if subtype or obj_type == 'cve': + meta['sparkline'] = obj.get_sparkline() + if subtype == 'bitcoin' and related_btc: + meta["related_btc"] = btc_ail.get_bitcoin_info(obj.id) + if obj.get_type() == 'decoded': + meta["vt"] = obj.get_meta_vt() + meta["vt"]["status"] = obj.is_vt_enabled() + # TAGS MODAL + if obj.get_type() == 'screenshot' or obj.get_type() == 'decoded': + meta["add_tags_modal"] = Tag.get_modal_add_tags(obj.id, object_type=obj.get_type()) return meta def get_ui_obj_tag_table_keys(obj_type): @@ -203,7 +233,6 @@ def create_correlation_graph_nodes(nodes_set, obj_str_id, flask_context=True): dict_node['style']['node_radius'] = dict_node['style']['radius'] # # TODO: # FIXME: in UI - dict_node['style'] dict_node['text'] = obj_id if node_id == obj_str_id: dict_node["style"]["node_color"] = 'orange' diff --git a/bin/lib/regex_helper.py b/bin/lib/regex_helper.py index 796dd7bb..edaff949 100755 --- a/bin/lib/regex_helper.py +++ b/bin/lib/regex_helper.py @@ -36,17 +36,19 @@ def _regex_findall(redis_key, regex, item_content, r_set): all_items = re.findall(regex, item_content) if r_set: if len(all_items) > 1: - r_serv_cache.sadd(redis_key, *all_items) + for item in all_items: + r_serv_cache.sadd(redis_key, str(item)) r_serv_cache.expire(redis_key, 360) elif all_items: - r_serv_cache.sadd(redis_key, all_items[0]) + r_serv_cache.sadd(redis_key, str(all_items[0])) r_serv_cache.expire(redis_key, 360) else: if len(all_items) > 1: - r_serv_cache.lpush(redis_key, *all_items) + for item in all_items: + r_serv_cache.lpush(redis_key, str(item)) r_serv_cache.expire(redis_key, 360) elif all_items: - r_serv_cache.lpush(redis_key, all_items[0]) + r_serv_cache.lpush(redis_key, str(all_items[0])) r_serv_cache.expire(redis_key, 360) def regex_findall(module_name, redis_key, regex, item_id, item_content, max_time=30, r_set=True): diff --git a/bin/modules/ApiKey.py b/bin/modules/ApiKey.py index 42ea0c7a..400e09c4 100755 --- a/bin/modules/ApiKey.py +++ b/bin/modules/ApiKey.py @@ -11,16 +11,16 @@ Search for API keys on an item content. """ -import re import os +import re import sys -sys.path.append(os.path.join(os.environ['AIL_BIN'])) - -# project packages +sys.path.append(os.environ['AIL_BIN']) +################################## +# Import Project packages +################################## from modules.abstract_module import AbstractModule -from packages.Item import Item -from lib import regex_helper +from lib.objects.Items import Item class ApiKey(AbstractModule): """ApiKey module for AIL framework""" @@ -28,13 +28,11 @@ class ApiKey(AbstractModule): def __init__(self): super(ApiKey, self).__init__() - self.redis_cache_key = regex_helper.generate_redis_cache_key(self.module_name) - # # TODO: ENUM or dict # TODO improve REGEX - #r'(? 
self.criticalNumberToAlert: print(f"========> Found more than 10 credentials in this file : {item.get_id()}") self.redis_logger.warning(to_print) @@ -122,11 +115,11 @@ class Credential(AbstractModule): msg = f'infoleak:automatic-detection="credential";{item.get_id()}' self.send_message_to_queue(msg, 'Tags') - site_occurence = regex_helper.regex_findall(self.module_name, self.redis_cache_key, self.regex_site_for_stats, item.get_id(), item_content, max_time=self.max_execution_time, r_set=False) + site_occurrence = self.regex_findall(self.regex_site_for_stats, item.get_id(), item_content) creds_sites = {} - for site in site_occurence: + for site in site_occurrence: site_domain = site[1:-1].lower() if site_domain in creds_sites.keys(): creds_sites[site_domain] += 1 @@ -136,7 +129,7 @@ class Credential(AbstractModule): for url in all_sites: self.faup.decode(url) domain = self.faup.get()['domain'] - ## TODO: # FIXME: remove me, check faup versionb + # # TODO: # FIXME: remove me, check faup versionb try: domain = domain.decode() except: @@ -159,10 +152,10 @@ class Credential(AbstractModule): date = datetime.now().strftime("%Y%m") nb_tlds = {} for cred in all_credentials: - maildomains = re.findall("@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,20}", cred.lower())[0] + maildomains = re.findall(r"@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,20}", cred.lower())[0] self.faup.decode(maildomains) tld = self.faup.get()['tld'] - ## TODO: # FIXME: remove me + # # TODO: # FIXME: remove me try: tld = tld.decode() except: diff --git a/bin/modules/CreditCards.py b/bin/modules/CreditCards.py index 70a119ae..62843c0f 100755 --- a/bin/modules/CreditCards.py +++ b/bin/modules/CreditCards.py @@ -17,14 +17,13 @@ It apply credit card regexes on item content and warn if a valid card number is import os import re import sys -import time sys.path.append(os.environ['AIL_BIN']) ################################## # Import Project packages ################################## from modules.abstract_module import AbstractModule -from packages.Item import Item +from lib.objects.Items import Item from packages import lib_refine class CreditCards(AbstractModule): @@ -53,15 +52,14 @@ class CreditCards(AbstractModule): # Send module state to logs self.redis_logger.info(f"Module {self.module_name} initialized") - def compute(self, message, r_result=False): - id, score = message.split() - item = Item(id) + item_id, score = message.split() + item = Item(item_id) content = item.get_content() all_cards = re.findall(self.regex, content) if len(all_cards) > 0: - #self.redis_logger.debug(f'All matching {all_cards}') + # self.redis_logger.debug(f'All matching {all_cards}') creditcard_set = set([]) for card in all_cards: @@ -70,9 +68,9 @@ class CreditCards(AbstractModule): self.redis_logger.debug(f'{clean_card} is valid') creditcard_set.add(clean_card) - #pprint.pprint(creditcard_set) + # pprint.pprint(creditcard_set) to_print = f'CreditCard;{item.get_source()};{item.get_date()};{item.get_basename()};' - if (len(creditcard_set) > 0): + if len(creditcard_set) > 0: self.redis_logger.warning(f'{to_print}Checked {len(creditcard_set)} valid number(s);{item.get_id()}') msg = f'infoleak:automatic-detection="credit-card";{item.get_id()}' @@ -83,7 +81,7 @@ class CreditCards(AbstractModule): else: self.redis_logger.info(f'{to_print}CreditCard related;{item.get_id()}') -if __name__ == '__main__': +if __name__ == '__main__': module = CreditCards() module.run() diff --git a/bin/modules/Cve.py b/bin/modules/Cve.py index bb913c9c..42048347 100755 --- a/bin/modules/Cve.py +++ 
b/bin/modules/Cve.py @@ -22,6 +22,7 @@ sys.path.append(os.environ['AIL_BIN']) # Import Project packages ################################## from modules.abstract_module import AbstractModule +from lib.objects import Cves from lib.objects.Items import Item @@ -36,13 +37,12 @@ class Cve(AbstractModule): # regex to find CVE self.reg_cve = re.compile(r'CVE-[1-2]\d{1,4}-\d{1,5}') - # Waiting time in secondes between to message proccessed + # Waiting time in seconds between to message processed self.pending_seconds = 1 # Send module state to logs self.redis_logger.info(f'Module {self.module_name} initialized') - def compute(self, message): item_id, count = message.split() @@ -51,17 +51,23 @@ class Cve(AbstractModule): cves = self.regex_findall(self.reg_cve, item_id, item.get_content()) if cves: + print(cves) + date = item.get_date() + for cve_id in cves: + cve = Cves.Cve(cve_id) + cve.add(date, item_id) + warning = f'{item_id} contains CVEs {cves}' print(warning) self.redis_logger.warning(warning) + msg = f'infoleak:automatic-detection="cve";{item_id}' # Send to Tags Queue self.send_message_to_queue(msg, 'Tags') - - if __name__ == '__main__': module = Cve() - module.run() + # module.run() + module.compute('crawled/2022/09/15/circl.lu1e4f9721-06dc-404f-aabf-3c3bd0b533bd 9') diff --git a/bin/modules/Decoder.py b/bin/modules/Decoder.py index d818072e..85449498 100755 --- a/bin/modules/Decoder.py +++ b/bin/modules/Decoder.py @@ -65,49 +65,45 @@ class Decoder(AbstractModule): #hexStr = ''.join( hex_string.split(" ") ) return bytes(bytearray([int(hexStr[i:i+2], 16) for i in range(0, len(hexStr), 2)])) - # TODO to lambda expr def binary_decoder(self, binary_string): return bytes(bytearray([int(binary_string[i:i+8], 2) for i in range(0, len(binary_string), 8)])) - # TODO to lambda expr def base64_decoder(self, base64_string): return base64.b64decode(base64_string) - def __init__(self): super(Decoder, self).__init__() - regex_binary = '[0-1]{40,}' - #regex_hex = '(0[xX])?[A-Fa-f0-9]{40,}' - regex_hex = '[A-Fa-f0-9]{40,}' - regex_base64 = '(?:[A-Za-z0-9+/]{4}){2,}(?:[A-Za-z0-9+/]{2}[AEIMQUYcgkosw048]=|[A-Za-z0-9+/][AQgw]==)' + regex_binary = r'[0-1]{40,}' + # regex_hex = r'(0[xX])?[A-Fa-f0-9]{40,}' + regex_hex = r'[A-Fa-f0-9]{40,}' + regex_base64 = r'(?:[A-Za-z0-9+/]{4}){2,}(?:[A-Za-z0-9+/]{2}[AEIMQUYcgkosw048]=|[A-Za-z0-9+/][AQgw]==)' cmp_regex_binary = re.compile(regex_binary) cmp_regex_hex = re.compile(regex_hex) cmp_regex_base64 = re.compile(regex_base64) # map decoder function - self.decoder_function = {'binary':self.binary_decoder,'hexadecimal':self.hex_decoder, 'base64':self.base64_decoder} + self.decoder_function = {'binary': self.binary_decoder, 'hexadecimal': self.hex_decoder, 'base64': self.base64_decoder} # list all decoder with regex, decoder_binary = {'name': 'binary', 'regex': cmp_regex_binary, 'encoded_min_size': 300, 'max_execution_time': binary_max_execution_time} decoder_hexadecimal = {'name': 'hexadecimal', 'regex': cmp_regex_hex, 'encoded_min_size': 300, 'max_execution_time': hex_max_execution_time} decoder_base64 = {'name': 'base64', 'regex': cmp_regex_base64, 'encoded_min_size': 40, 'max_execution_time': base64_max_execution_time} - self.decoder_order = [ decoder_base64, decoder_binary, decoder_hexadecimal, decoder_base64] + self.decoder_order = [decoder_base64, decoder_binary, decoder_hexadecimal, decoder_base64] for decoder in self.decoder_order: serv_metadata.sadd('all_decoder', decoder['name']) - # Waiting time in secondes between to message proccessed + # Waiting time in seconds 
between to message processed self.pending_seconds = 1 # Send module state to logs self.redis_logger.info(f'Module {self.module_name} initialized') - def compute(self, message): item = Item(message) @@ -128,10 +124,9 @@ class Decoder(AbstractModule): else: signal.alarm(0) - if(len(encoded_list) > 0): + if len(encoded_list) > 0: content = self.decode_string(content, item.id, date, encoded_list, decoder['name'], decoder['encoded_min_size']) - def decode_string(self, content, item_id, date, encoded_list, decoder_name, encoded_min_size): find = False for encoded in encoded_list: @@ -153,12 +148,12 @@ class Decoder(AbstractModule): save_item_relationship(sha1_string, item_id) ################################ - #remove encoded from item content + # remove encoded from item content content = content.replace(encoded, '', 1) self.redis_logger.debug(f'{item_id} : {decoder_name} - {mimetype}') print(f'{item_id} : {decoder_name} - {mimetype}') - if(find): + if find: self.redis_logger.info(f'{decoder_name} decoded') print(f'{decoder_name} decoded') @@ -169,6 +164,7 @@ class Decoder(AbstractModule): # perf: remove encoded from item content return content + if __name__ == '__main__': # # TODO: TEST ME diff --git a/bin/modules/DomClassifier.py b/bin/modules/DomClassifier.py index 08f202ab..da2cea14 100755 --- a/bin/modules/DomClassifier.py +++ b/bin/modules/DomClassifier.py @@ -15,7 +15,6 @@ the out output of the Global module. ################################## import os import sys -import time import DomainClassifier.domainclassifier sys.path.append(os.environ['AIL_BIN']) @@ -23,11 +22,8 @@ sys.path.append(os.environ['AIL_BIN']) # Import Project packages ################################## from modules.abstract_module import AbstractModule -from packages.Item import Item - -sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib')) -import d4 -import item_basic +from lib.objects.Items import Item +from lib import d4 class DomClassifier(AbstractModule): @@ -38,7 +34,7 @@ class DomClassifier(AbstractModule): def __init__(self): super(DomClassifier, self).__init__() - # Waiting time in secondes between to message proccessed + # Waiting time in seconds between to message processed self.pending_seconds = 1 addr_dns = self.process.config.get("DomClassifier", "dns") @@ -51,11 +47,10 @@ class DomClassifier(AbstractModule): # Send module state to logs self.redis_logger.info(f"Module: {self.module_name} Launched") - def compute(self, message, r_result=False): - host, id = message.split() + host, item_id = message.split() - item = Item(id) + item = Item(item_id) item_basename = item.get_basename() item_date = item.get_date() item_source = item.get_source() @@ -64,7 +59,7 @@ class DomClassifier(AbstractModule): self.c.text(rawtext=host) print(self.c.domain) self.c.validdomain(passive_dns=True, extended=False) - #self.redis_logger.debug(self.c.vdomain) + # self.redis_logger.debug(self.c.vdomain) print(self.c.vdomain) print() diff --git a/bin/modules/Duplicates.py b/bin/modules/Duplicates.py index 169295ae..b9b5a440 100755 --- a/bin/modules/Duplicates.py +++ b/bin/modules/Duplicates.py @@ -12,14 +12,12 @@ Its input comes from other modules, namely: Perform comparisions with ssdeep and tlsh """ -import redis - import os import sys import time -#from datetime import datetime, timedelta +# from datetime import datetime, timedelta import datetime sys.path.append(os.environ['AIL_BIN']) @@ -51,7 +49,6 @@ class Duplicates(AbstractModule): self.redis_logger.info(f"Module: {self.module_name} Launched") - def compute(self, 
message): # IOError: "CRC Checksum Failed on : {id}" @@ -72,7 +69,7 @@ class Duplicates(AbstractModule): self.algos['ssdeep']['hash'] = Duplicate.get_ssdeep_hash(content) self.algos['tlsh']['hash'] = Duplicate.get_tlsh_hash(content) - # TODO: Handle coputed duplicates + # TODO: Handle computed duplicates nb_duplicates = 0 @@ -99,7 +96,7 @@ class Duplicates(AbstractModule): y = time.time() print(f'{item.get_id()} Processed in {y-x} sec') - #self.redis_logger.debug('{}Processed in {} sec'.format(to_print, y-x)) + # self.redis_logger.debug('{}Processed in {} sec'.format(to_print, y-x)) if __name__ == "__main__": diff --git a/bin/modules/Global.py b/bin/modules/Global.py index 503b42cb..a93712fe 100755 --- a/bin/modules/Global.py +++ b/bin/modules/Global.py @@ -31,7 +31,6 @@ import os import sys import time import datetime -import redis from hashlib import md5 from uuid import uuid4 @@ -57,19 +56,18 @@ class Global(AbstractModule): self.processed_item = 0 self.time_last_stats = time.time() - # Get and sanityze ITEM DIRECTORY + # Get and sanitize ITEM DIRECTORY # # TODO: rename PASTE => ITEM self.PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], self.process.config.get("Directories", "pastes")) self.PASTES_FOLDERS = self.PASTES_FOLDER + '/' self.PASTES_FOLDERS = os.path.join(os.path.realpath(self.PASTES_FOLDERS), '') - # Waiting time in secondes between to message proccessed + # Waiting time in seconds between to message processed self.pending_seconds = 0.5 # Send module state to logs self.redis_logger.info(f"Module {self.module_name} initialized") - def computeNone(self): difftime = time.time() - self.time_last_stats if int(difftime) > 30: @@ -80,7 +78,6 @@ class Global(AbstractModule): self.time_last_stats = time.time() self.processed_item = 0 - def compute(self, message, r_result=False): # Recovering the streamed message informations splitted = message.split() @@ -129,7 +126,8 @@ class Global(AbstractModule): item_id = item_id.replace(self.PASTES_FOLDERS, '', 1) self.send_message_to_queue(item_id) - self.processed_item+=1 + self.processed_item += 1 + print(item_id) if r_result: return item_id @@ -137,7 +135,6 @@ class Global(AbstractModule): self.redis_logger.debug(f"Empty Item: {message} not processed") print(f"Empty Item: {message} not processed") - def check_filename(self, filename, new_file_content): """ Check if file is not a duplicated file @@ -181,10 +178,8 @@ class Global(AbstractModule): # File not unzipped filename = None - return filename - def gunzip_file(self, filename): """ Unzip a file @@ -224,7 +219,6 @@ class Global(AbstractModule): return gunzipped_bytes_obj - def rreplace(self, s, old, new, occurrence): li = s.rsplit(old, occurrence) return new.join(li) diff --git a/bin/modules/Hosts.py b/bin/modules/Hosts.py index 9ecb2590..782681ee 100755 --- a/bin/modules/Hosts.py +++ b/bin/modules/Hosts.py @@ -17,7 +17,6 @@ It is looking for Hosts import os import re import sys -import time sys.path.append(os.environ['AIL_BIN']) ################################## @@ -25,9 +24,7 @@ sys.path.append(os.environ['AIL_BIN']) ################################## from modules.abstract_module import AbstractModule from lib.ConfigLoader import ConfigLoader -from lib import regex_helper -#from lib.objects.Items import Item -from packages.Item import Item +from lib.objects.Items import Item class Hosts(AbstractModule): """ @@ -40,12 +37,10 @@ class Hosts(AbstractModule): config_loader = ConfigLoader() self.r_cache = config_loader.get_redis_conn("Redis_Cache") - self.redis_cache_key = 
regex_helper.generate_redis_cache_key(self.module_name) - # regex timeout self.regex_timeout = 30 - # Waiting time in secondes between to message proccessed + # Waiting time in seconds between to message processed self.pending_seconds = 1 self.host_regex = r'\b([a-zA-Z\d-]{,63}(?:\.[a-zA-Z\d-]{,63})+)\b' @@ -53,7 +48,6 @@ class Hosts(AbstractModule): self.redis_logger.info(f"Module: {self.module_name} Launched") - def compute(self, message): item = Item(message) @@ -61,18 +55,16 @@ class Hosts(AbstractModule): # if mimetype.split('/')[0] == "text": content = item.get_content() - - hosts = regex_helper.regex_findall(self.module_name, self.redis_cache_key, self.host_regex, item.get_id(), content) + hosts = self.regex_findall(self.host_regex, item.get_id(), content) if hosts: print(f'{len(hosts)} host {item.get_id()}') for host in hosts: - #print(host) + # print(host) msg = f'{host} {item.get_id()}' self.send_message_to_queue(msg, 'Host') - if __name__ == '__main__': module = Hosts() diff --git a/bin/modules/Iban.py b/bin/modules/Iban.py index 1b80f761..4ecb2178 100755 --- a/bin/modules/Iban.py +++ b/bin/modules/Iban.py @@ -34,7 +34,7 @@ class Iban(AbstractModule): """ _LETTERS_IBAN = chain(enumerate(string.digits + string.ascii_uppercase), - enumerate(string.ascii_lowercase, 10)) + enumerate(string.ascii_lowercase, 10)) LETTERS_IBAN = {ord(d): str(i) for i, d in _LETTERS_IBAN} def __init__(self): @@ -44,7 +44,7 @@ class Iban(AbstractModule): self.pending_seconds = 10 self.regex_timeout = 30 - #iban_regex = re.compile(r'\b[A-Za-z]{2}[0-9]{2}(?:[ ]?[0-9]{4}){4}(?:[ ]?[0-9]{1,2})?\b') + # iban_regex = re.compile(r'\b[A-Za-z]{2}[0-9]{2}(?:[ ]?[0-9]{4}){4}(?:[ ]?[0-9]{1,2})?\b') self.iban_regex = re.compile(r'\b([A-Za-z]{2}[ \-]?[0-9]{2})(?=(?:[ \-]?[A-Za-z0-9]){9,30})((?:[ \-]?[A-Za-z0-9]{3,5}){2,6})([ \-]?[A-Za-z0-9]{1,3})\b') self.iban_regex_verify = re.compile(r'^([A-Z]{2})([0-9]{2})([A-Z0-9]{9,30})$') @@ -90,6 +90,7 @@ class Iban(AbstractModule): msg = f'infoleak:automatic-detection="iban";{item_id}' self.send_message_to_queue(msg, 'Tags') + if __name__ == '__main__': module = Iban() diff --git a/bin/modules/Indexer.py b/bin/modules/Indexer.py index cdb65f16..2b80eeb3 100755 --- a/bin/modules/Indexer.py +++ b/bin/modules/Indexer.py @@ -26,7 +26,7 @@ sys.path.append(os.environ['AIL_BIN']) # Import Project packages ################################## from modules.abstract_module import AbstractModule -from packages.Item import Item +from lib.objects.Items import Item class Indexer(AbstractModule): @@ -57,9 +57,7 @@ class Indexer(AbstractModule): self.ix = None if self.indexertype == "whoosh": - self.schema = Schema(title=TEXT(stored=True), path=ID(stored=True, - unique=True), - content=TEXT) + self.schema = Schema(title=TEXT(stored=True), path=ID(stored=True, unique=True), content=TEXT) if not os.path.exists(self.baseindexpath): os.mkdir(self.baseindexpath) @@ -96,7 +94,6 @@ class Indexer(AbstractModule): self.last_refresh = time_now - def compute(self, message): docpath = message.split(" ", -1)[-1] @@ -109,7 +106,7 @@ class Indexer(AbstractModule): try: # Avoid calculating the index's size at each message - if(time.time() - self.last_refresh > self.TIME_WAIT): + if time.time() - self.last_refresh > self.TIME_WAIT: self.last_refresh = time.time() if self.check_index_size() >= self.INDEX_SIZE_THRESHOLD*(1000*1000): timestamp = int(time.time()) @@ -145,10 +142,8 @@ class Indexer(AbstractModule): cur_sum = 0 for root, dirs, files in os.walk(the_index_name): cur_sum += sum(getsize(join(root, name)) 
for name in files) - return cur_sum - def move_index_into_old_index_folder(self): for cur_file in os.listdir(self.baseindexpath): if not cur_file == "old_index": diff --git a/bin/modules/Keys.py b/bin/modules/Keys.py index 02cc3b12..74d04fba 100755 --- a/bin/modules/Keys.py +++ b/bin/modules/Keys.py @@ -17,7 +17,6 @@ RSA private key, certificate messages ################################## import os import sys -import time from enum import Enum sys.path.append(os.environ['AIL_BIN']) @@ -25,7 +24,7 @@ sys.path.append(os.environ['AIL_BIN']) # Import Project packages ################################## from modules.abstract_module import AbstractModule -from packages.Item import Item +from lib.objects.Items import Item class KeyEnum(Enum): @@ -53,10 +52,9 @@ class Keys(AbstractModule): def __init__(self): super(Keys, self).__init__() - # Waiting time in secondes between to message proccessed + # Waiting time in seconds between to message processed self.pending_seconds = 1 - def compute(self, message): item = Item(message) content = item.get_content() @@ -169,11 +167,12 @@ class Keys(AbstractModule): if get_pgp_content: self.send_message_to_queue(item.get_id(), 'PgpDump') - if find : - #Send to duplicate - self.send_message_to_queue(item.get_id(), 'Duplicate') - self.redis_logger.debug(f'{item.get_id()} has key(s)') - print(f'{item.get_id()} has key(s)') + # if find : + # # Send to duplicate + # self.send_message_to_queue(item.get_id(), 'Duplicate') + # self.redis_logger.debug(f'{item.get_id()} has key(s)') + # print(f'{item.get_id()} has key(s)') + if __name__ == '__main__': diff --git a/bin/modules/Languages.py b/bin/modules/Languages.py index 4e55e083..f775547e 100755 --- a/bin/modules/Languages.py +++ b/bin/modules/Languages.py @@ -11,7 +11,7 @@ sys.path.append(os.environ['AIL_BIN']) from modules.abstract_module import AbstractModule from lib.objects.Domains import Domain from lib.objects.Items import Item -#from lib.ConfigLoader import ConfigLoader +# from lib.ConfigLoader import ConfigLoader class Languages(AbstractModule): """ @@ -31,6 +31,7 @@ class Languages(AbstractModule): for lang in item.get_languages(min_probability=0.8): domain.add_language(lang.language) + if __name__ == '__main__': module = Languages() module.run() diff --git a/bin/modules/LibInjection.py b/bin/modules/LibInjection.py index 18168392..883e2e46 100755 --- a/bin/modules/LibInjection.py +++ b/bin/modules/LibInjection.py @@ -13,12 +13,12 @@ It tries to identify SQL Injections with libinjection. 
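For readers unfamiliar with libinjection: the detection flow that the LibInjection hunk below only tidies up (faup-split the URL, then run pylibinjection on the path and on the query string) looks roughly like this in isolation. This is a sketch, not the module itself; it assumes pylibinjection and pyfaup are installed, that faup.get() may hand back str or bytes depending on the build, and the helper name url_has_sqli is purely illustrative.

import pylibinjection
from pyfaup.faup import Faup

def url_has_sqli(url):
    # Split the URL with faup, then test the path and the query string separately,
    # mirroring what LibInjection.compute() does once the framework plumbing is removed.
    faup = Faup()
    faup.decode(url)
    parsed = faup.get()
    for key in ('resource_path', 'query_string'):
        value = parsed.get(key)
        if value is None:
            continue
        if isinstance(value, str):
            value = value.encode()  # pylibinjection works on bytes
        if pylibinjection.detect_sqli(value)['sqli']:
            return True
    return False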
import os import sys -import redis import urllib.request import pylibinjection from datetime import datetime from pyfaup.faup import Faup +from urllib.parse import unquote sys.path.append(os.environ['AIL_BIN']) @@ -27,7 +27,7 @@ sys.path.append(os.environ['AIL_BIN']) ################################## from modules.abstract_module import AbstractModule from lib.ConfigLoader import ConfigLoader -from packages.Item import Item +from lib.objects.Items import Item class LibInjection(AbstractModule): """docstring for LibInjection module.""" @@ -43,38 +43,38 @@ class LibInjection(AbstractModule): self.redis_logger.info(f"Module: {self.module_name} Launched") def compute(self, message): - url, id = message.split() + url, item_id = message.split() self.faup.decode(url) url_parsed = self.faup.get() - ## TODO: # FIXME: remove me + # # TODO: # FIXME: remove me try: resource_path = url_parsed['resource_path'].encode() except: resource_path = url_parsed['resource_path'] - ## TODO: # FIXME: remove me + # # TODO: # FIXME: remove me try: query_string = url_parsed['query_string'].encode() except: query_string = url_parsed['query_string'] - result_path = {'sqli' : False} - result_query = {'sqli' : False} + result_path = {'sqli': False} + result_query = {'sqli': False} if resource_path is not None: result_path = pylibinjection.detect_sqli(resource_path) - #print(f'path is sqli : {result_path}') + # print(f'path is sqli : {result_path}') if query_string is not None: result_query = pylibinjection.detect_sqli(query_string) - #print(f'query is sqli : {result_query}') + # print(f'query is sqli : {result_query}') if result_path['sqli'] is True or result_query['sqli'] is True: - item = Item(id) + item = Item(item_id) item_id = item.get_id() print(f"Detected (libinjection) SQL in URL: {item_id}") - print(urllib.request.unquote(url)) + print(unquote(url)) to_print = f'LibInjection;{item.get_source()};{item.get_date()};{item.get_basename()};Detected SQL in URL;{item_id}' self.redis_logger.warning(to_print) @@ -86,8 +86,8 @@ class LibInjection(AbstractModule): msg = f'infoleak:automatic-detection="sql-injection";{item_id}' self.send_message_to_queue(msg, 'Tags') - #statistics - ## TODO: # FIXME: remove me + # statistics + # # TODO: # FIXME: remove me try: tld = url_parsed['tld'].decode() except: @@ -96,7 +96,7 @@ class LibInjection(AbstractModule): date = datetime.now().strftime("%Y%m") self.server_statistics.hincrby(f'SQLInjection_by_tld:{date}', tld, 1) -if __name__ == "__main__": +if __name__ == "__main__": module = LibInjection() module.run() diff --git a/bin/modules/Mail.py b/bin/modules/Mail.py index 78742774..7af3ea90 100755 --- a/bin/modules/Mail.py +++ b/bin/modules/Mail.py @@ -13,9 +13,7 @@ It apply mail regexes on item content and warn if above a threshold. import os import re -import redis import sys -import time import datetime import dns.resolver @@ -52,7 +50,7 @@ class Mail(AbstractModule): self.mail_threshold = 10 self.regex_timeout = 30 - self.email_regex = "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}" + self.email_regex = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}" re.compile(self.email_regex) def is_mxdomain_in_cache(self, mxdomain): @@ -64,8 +62,8 @@ class Mail(AbstractModule): def check_mx_record(self, set_mxdomains): """Check if emails MX domains are responding. 
- :param adress_set: -- (set) This is a set of emails domains - :return: (int) Number of adress with a responding and valid MX domains + :param set_mxdomains: -- (set) This is a set of emails domains + :return: (int) Number of address with a responding and valid MX domains """ resolver = dns.resolver.Resolver() @@ -107,7 +105,7 @@ class Mail(AbstractModule): self.redis_logger.debug('SyntaxError: EmptyLabel') print('SyntaxError: EmptyLabel') except dns.resolver.NXDOMAIN: - #save_mxdomain_in_cache(mxdomain) + # save_mxdomain_in_cache(mxdomain) self.redis_logger.debug('The query name does not exist.') print('The query name does not exist.') except dns.name.LabelTooLong: @@ -115,12 +113,12 @@ class Mail(AbstractModule): print('The Label is too long') except dns.exception.Timeout: print('dns timeout') - #save_mxdomain_in_cache(mxdomain) + # save_mxdomain_in_cache(mxdomain) except Exception as e: print(e) return valid_mxdomain - # # TODO: sanityze mails + # # TODO: sanitize mails def compute(self, message): item_id, score = message.split() item = Item(item_id) @@ -134,7 +132,7 @@ class Mail(AbstractModule): mxdomains_email[mxdomain] = set() mxdomains_email[mxdomain].add(mail) - ## TODO: add MAIL trackers + # # TODO: add MAIL trackers valid_mx = self.check_mx_record(mxdomains_email.keys()) print(f'valid_mx: {valid_mx}') @@ -144,7 +142,7 @@ class Mail(AbstractModule): nb_mails = len(mxdomains_email[domain_mx]) num_valid_email += nb_mails - # Create doamin_mail stats + # Create domain_mail stats msg = f'mail;{nb_mails};{domain_mx};{item_date}' self.send_message_to_queue(msg, 'ModuleStats') @@ -159,8 +157,8 @@ class Mail(AbstractModule): for tld in mx_tlds: Statistics.add_module_tld_stats_by_date('mail', item_date, tld, mx_tlds[tld]) + msg = f'Mails;{item.get_source()};{item_date};{item.get_basename()};Checked {num_valid_email} e-mail(s);{item_id}' if num_valid_email > self.mail_threshold: - msg = f'Mails;{item.get_source()};{item_date};{item.get_basename()};Checked {num_valid_email} e-mail(s);{item_id}' print(f'{item_id} Checked {num_valid_email} e-mail(s)') self.redis_logger.warning(msg) # Tags @@ -170,8 +168,6 @@ class Mail(AbstractModule): self.redis_logger.info(msg) - if __name__ == '__main__': module = Mail() - #module.compute('tests/2021/01/01/mails.gz 50') module.run() diff --git a/bin/modules/ModuleStats.py b/bin/modules/ModuleStats.py index 16e18b51..45767e66 100755 --- a/bin/modules/ModuleStats.py +++ b/bin/modules/ModuleStats.py @@ -25,12 +25,11 @@ class ModuleStats(AbstractModule): Module Statistics module for AIL framework """ - def __init__(self): super(ModuleStats, self).__init__() - # Waiting time in secondes between to message proccessed + # Waiting time in seconds between to message processed self.pending_seconds = 20 def compute(self, message): @@ -38,9 +37,10 @@ class ModuleStats(AbstractModule): # MODULE STATS if len(message.split(';')) > 1: module_name, num, keyword, date = message.split(';') - Statisticsupdate_module_stats(module_name, num, keyword, date) + Statistics.update_module_stats(module_name, num, keyword, date) # ITEM STATS else: + item_id = message item = Item(item_id) source = item.get_source() date = item.get_date() diff --git a/bin/modules/Onion.py b/bin/modules/Onion.py index afd417cc..7adbf8a4 100755 --- a/bin/modules/Onion.py +++ b/bin/modules/Onion.py @@ -13,8 +13,6 @@ Requirements *Need running Redis instances. 
(Redis) """ -import time -import datetime import os import sys import re @@ -25,68 +23,8 @@ sys.path.append(os.environ['AIL_BIN']) ################################## from modules.abstract_module import AbstractModule from lib.ConfigLoader import ConfigLoader +from lib.objects.Items import Item from lib import crawlers -from lib import regex_helper -from packages.Item import Item - -## Manually fetch first page if crawler is disabled -# import base64 -# import subprocess -# -# torclient_host = '127.0.0.1' -# torclient_port = 9050 -# -# def fetch(p, r_cache, urls, domains): -# now = datetime.datetime.now() -# path = os.path.join('onions', str(now.year).zfill(4), -# str(now.month).zfill(2), -# str(now.day).zfill(2), -# str(int(time.mktime(now.utctimetuple())))) -# failed = [] -# downloaded = [] -# print('{} Urls to fetch'.format(len(urls))) -# for url, domain in zip(urls, domains): -# if r_cache.exists(url) or url in failed: -# continue -# to_fetch = base64.standard_b64encode(url.encode('utf8')) -# print('fetching url: {}'.format(to_fetch)) -# process = subprocess.Popen(["python", './tor_fetcher.py', to_fetch], -# stdout=subprocess.PIPE) -# while process.poll() is None: -# time.sleep(1) -# -# if process.returncode == 0: -# r_cache.setbit(url, 0, 1) -# r_cache.expire(url, 360000) -# downloaded.append(url) -# print('downloaded : {}'.format(downloaded)) -# '''tempfile = process.stdout.read().strip() -# tempfile = tempfile.decode('utf8') -# #with open(tempfile, 'r') as f: -# filename = path + domain + '.gz' -# fetched = f.read() -# content = base64.standard_b64decode(fetched) -# save_path = os.path.join(os.environ['AIL_HOME'], -# p.config.get("Directories", "pastes"), -# filename) -# dirname = os.path.dirname(save_path) -# if not os.path.exists(dirname): -# os.makedirs(dirname) -# with open(save_path, 'w') as ff: -# ff.write(content) -# p.populate_set_out(save_path, 'Global') -# p.populate_set_out(url, 'ValidOnion') -# p.populate_set_out(fetched, 'FetchedOnion')''' -# yield url -# #os.unlink(tempfile) -# else: -# r_cache.setbit(url, 0, 0) -# r_cache.expire(url, 3600) -# failed.append(url) -# print('Failed at downloading', url) -# print(process.stdout.read()) -# print('Failed:', len(failed), 'Downloaded:', len(downloaded)) - class Onion(AbstractModule): """docstring for Onion module.""" @@ -103,68 +41,63 @@ class Onion(AbstractModule): self.regex_timeout = 30 self.faup = crawlers.get_faup() - self.redis_cache_key = regex_helper.generate_redis_cache_key(self.module_name) # activate_crawler = p.config.get("Crawler", "activate_crawler") - self.url_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)" - self.i2p_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)" - re.compile(self.url_regex) - re.compile(self.i2p_regex) + 
self.onion_regex = r"((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)" + # self.i2p_regex = r"((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)" + re.compile(self.onion_regex) + # re.compile(self.i2p_regex) self.redis_logger.info(f"Module: {self.module_name} Launched") # TEMP var: SAVE I2P Domain (future I2P crawler) - self.save_i2p = config_loader.get_config_boolean("Onion", "save_i2p") + # self.save_i2p = config_loader.get_config_boolean("Onion", "save_i2p") def compute(self, message): - # list of tuples: (url, subdomains, domain) - urls_to_crawl = [] + onion_urls = [] + domains = [] - id, score = message.split() - item = Item(id) + item_id, score = message.split() + item = Item(item_id) item_content = item.get_content() # max execution time on regex - res = regex_helper.regex_findall(self.module_name, self.redis_cache_key, self.url_regex, item.get_id(), item_content) + res = self.regex_findall(self.onion_regex, item.get_id(), item_content) for x in res: # String to tuple x = x[2:-2].replace(" '", "").split("',") url = x[0] - subdomain = x[4].lower() - self.faup.decode(url) - url_unpack = self.faup.get() - try: ## TODO: # FIXME: check faup version - domain = url_unpack['domain'].decode().lower() - except Exception as e: - domain = url_unpack['domain'].lower() + print(url) + # TODO Crawl subdomain + url_unpack = crawlers.unpack_url(url) + domain = url_unpack['domain'] if crawlers.is_valid_onion_domain(domain): - urls_to_crawl.append((url, subdomain, domain)) + domains.append(domain) + onion_urls.append(url) - to_print = f'Onion;{item.get_source()};{item.get_date()};{item.get_basename()};' - if not urls_to_crawl: - self.redis_logger.info(f'{to_print}Onion related;{item.get_id()}') - return + if onion_urls: + if crawlers.is_crawler_activated(): + for domain in domains:# TODO LOAD DEFAULT SCREENSHOT + HAR + task_uuid = crawlers.add_crawler_task(domain, parent=item.get_id()) + if task_uuid: + print(f'{domain} added to crawler queue: {task_uuid}') + else: + to_print = f'Onion;{item.get_source()};{item.get_date()};{item.get_basename()};' + print(f'{to_print}Detected {len(domains)} .onion(s);{item.get_id()}') + self.redis_logger.warning(f'{to_print}Detected {len(domains)} .onion(s);{item.get_id()}') - # TAG Item - msg = f'infoleak:automatic-detection="onion";{item.get_id()}' - self.send_message_to_queue(msg, 'Tags') + # TAG Item + msg = f'infoleak:automatic-detection="onion";{item.get_id()}' + self.send_message_to_queue(msg, 'Tags') - if crawlers.is_crawler_activated(): - for to_crawl in urls_to_crawl: - print(f'{to_crawl[2]} added to crawler queue: {to_crawl[0]}') - crawlers.add_item_to_discovery_queue('onion', to_crawl[2], to_crawl[1], to_crawl[0], item.get_id()) - else: - print(f'{to_print}Detected {len(urls_to_crawl)} 
.onion(s);{item.get_id()}') - self.redis_logger.warning(f'{to_print}Detected {len(urls_to_crawl)} .onion(s);{item.get_id()}') - # keep manual fetcher ???? - ## Manually fetch first page if crawler is disabled - # for url in fetch(p, r_cache, urls, domains_list): - # publisher.info('{}Checked {};{}'.format(to_print, url, PST.p_rel_path)) if __name__ == "__main__": - module = Onion() + # module.compute('submitted/2022/10/10/submitted_705d1d92-7e9a-4a44-8c21-ccd167bfb7db.gz 9') module.run() + + +# 5ajw6aqf3ep7sijnscdzw77t7xq4xjpsy335yb2wiwgouo7yfxtjlmid.onion to debian.org \ No newline at end of file diff --git a/bin/modules/Phone.py b/bin/modules/Phone.py index b918481a..a7d2a87d 100755 --- a/bin/modules/Phone.py +++ b/bin/modules/Phone.py @@ -17,7 +17,6 @@ It apply phone number regexes on item content and warn if above a threshold. import os import re import sys -import time import phonenumbers sys.path.append(os.environ['AIL_BIN']) @@ -25,7 +24,7 @@ sys.path.append(os.environ['AIL_BIN']) # Import Project packages ################################## from modules.abstract_module import AbstractModule -from packages.Item import Item +from lib.objects.Items import Item # # TODO: # FIXME: improve regex / filter false positives class Phone(AbstractModule): @@ -37,14 +36,12 @@ class Phone(AbstractModule): # reg_phone = re.compile(r'(\+\d{1,4}(\(\d\))?\d?|0\d?)(\d{6,8}|([-/\. ]{1}\d{2,3}){3,4})') REG_PHONE = re.compile(r'(\+\d{1,4}(\(\d\))?\d?|0\d?)(\d{6,8}|([-/\. ]{1}\(?\d{2,4}\)?){3,4})') - def __init__(self): super(Phone, self).__init__() - # Waiting time in secondes between to message proccessed + # Waiting time in seconds between to message processed self.pending_seconds = 1 - def compute(self, message): item = Item(message) content = item.get_content() @@ -79,6 +76,5 @@ class Phone(AbstractModule): if __name__ == '__main__': - module = Phone() module.run() diff --git a/bin/modules/SQLInjectionDetection.py b/bin/modules/SQLInjectionDetection.py index 25450a89..17d52741 100755 --- a/bin/modules/SQLInjectionDetection.py +++ b/bin/modules/SQLInjectionDetection.py @@ -14,11 +14,11 @@ It test different possibility to makes some sqlInjection. 
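Unlike LibInjection above, the SQLInjectionDetection module below is a plain regular-expression heuristic over the percent-decoded URL. Stripped of the framework plumbing it amounts to the following sketch; the SQLI_REGEX shown here is only a placeholder, since the module's real pattern is a class attribute defined outside this hunk.

import re
from urllib.parse import unquote

# Placeholder pattern: the real module defines its own SQLI_REGEX class attribute.
SQLI_REGEX = r"(union\s+select|select.+from|insert\s+into|drop\s+table|or\s+1\s*=\s*1)"

def is_sql_injection(url):
    # Percent-decode first so encoded payloads (%27, %20, ...) are caught as well.
    line = unquote(url)
    return re.search(SQLI_REGEX, line, re.I) is not None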
import os import sys import re -import redis import urllib.request from datetime import datetime from pyfaup.faup import Faup +from urllib.parse import unquote sys.path.append(os.environ['AIL_BIN']) ################################## @@ -26,7 +26,7 @@ sys.path.append(os.environ['AIL_BIN']) ################################## from modules.abstract_module import AbstractModule from lib.ConfigLoader import ConfigLoader -from packages.Item import Item +from lib.objects.Items import Item class SQLInjectionDetection(AbstractModule): """docstring for SQLInjectionDetection module.""" @@ -46,13 +46,13 @@ class SQLInjectionDetection(AbstractModule): self.redis_logger.info(f"Module: {self.module_name} Launched") def compute(self, message): - url, id = message.split() + url, item_id = message.split() if self.is_sql_injection(url): self.faup.decode(url) url_parsed = self.faup.get() - item = Item(id) + item = Item(item_id) item_id = item.get_id() print(f"Detected SQL in URL: {item_id}") print(urllib.request.unquote(url)) @@ -69,7 +69,7 @@ class SQLInjectionDetection(AbstractModule): # statistics tld = url_parsed['tld'] if tld is not None: - ## TODO: # FIXME: remove me + # # TODO: # FIXME: remove me try: tld = tld.decode() except: @@ -77,15 +77,13 @@ class SQLInjectionDetection(AbstractModule): date = datetime.now().strftime("%Y%m") self.server_statistics.hincrby(f'SQLInjection_by_tld:{date}', tld, 1) - # Try to detect if the url passed might be an sql injection by appliying the regex + # Try to detect if the url passed might be an sql injection by applying the regex # defined above on it. def is_sql_injection(self, url_parsed): - line = urllib.request.unquote(url_parsed) - + line = unquote(url_parsed) return re.search(SQLInjectionDetection.SQLI_REGEX, line, re.I) is not None if __name__ == "__main__": - module = SQLInjectionDetection() module.run() diff --git a/bin/modules/SentimentAnalysis.py b/bin/modules/SentimentAnalysis.py index ff0b8142..68b9edf2 100755 --- a/bin/modules/SentimentAnalysis.py +++ b/bin/modules/SentimentAnalysis.py @@ -34,9 +34,8 @@ sys.path.append(os.environ['AIL_BIN']) # Import Project packages ################################## from modules.abstract_module import AbstractModule -from packages import Paste -sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/')) -import ConfigLoader +from lib.objects.Items import Item +from lib import ConfigLoader class TimeoutException(Exception): @@ -53,12 +52,10 @@ class SentimentAnalysis(AbstractModule): SentimentAnalysis module for AIL framework """ - # Config Variables accepted_Mime_type = ['text/plain'] line_max_length_threshold = 1000 - def __init__(self): super(SentimentAnalysis, self).__init__() @@ -75,7 +72,6 @@ class SentimentAnalysis(AbstractModule): # Send module state to logs self.redis_logger.info(f"Module {self.module_name} initialized") - def compute(self, message): # Max time to compute one entry signal.alarm(60) @@ -87,16 +83,31 @@ class SentimentAnalysis(AbstractModule): else: signal.alarm(0) + def get_p_content_with_removed_lines(self, threshold, item_content): + num_line_removed = 0 + line_length_threshold = threshold + string_content = "" + f = item_content + for line_id, line in enumerate(f): + length = len(line) + + if length < line_length_threshold: + string_content += line + else: + num_line_removed += 1 + + return num_line_removed, string_content def analyse(self, message): - paste = Paste.Paste(message) + item = Item(message) # get content with removed line + number of them - num_line_removed, p_content = 
paste.get_p_content_with_removed_lines(SentimentAnalysis.line_max_length_threshold) - provider = paste.p_source - p_date = str(paste._get_p_date()) - p_MimeType = paste._get_p_encoding() + num_line_removed, p_content = self.get_p_content_with_removed_lines(SentimentAnalysis.line_max_length_threshold, + item.get_content()) + provider = item.get_source() + p_date = item.get_date() + p_MimeType = item.get_mimetype() # Perform further analysis if p_MimeType == "text/plain": diff --git a/bin/modules/Tags.py b/bin/modules/Tags.py index 9a14b14d..6300a1d1 100755 --- a/bin/modules/Tags.py +++ b/bin/modules/Tags.py @@ -20,7 +20,7 @@ sys.path.append(os.environ['AIL_BIN']) # Import Project packages ################################## from modules.abstract_module import AbstractModule -from packages.Item import Item +from lib.objects.Items import Item from lib import Tag @@ -32,13 +32,12 @@ class Tags(AbstractModule): def __init__(self): super(Tags, self).__init__() - # Waiting time in secondes between to message proccessed + # Waiting time in seconds between to message processed self.pending_seconds = 10 # Send module state to logs self.redis_logger.info(f'Module {self.module_name} initialized') - def compute(self, message): # Extract item ID and tag from message mess_split = message.split(';') @@ -62,6 +61,5 @@ class Tags(AbstractModule): if __name__ == '__main__': - module = Tags() module.run() diff --git a/bin/modules/Telegram.py b/bin/modules/Telegram.py index a44d14e6..4717e3db 100755 --- a/bin/modules/Telegram.py +++ b/bin/modules/Telegram.py @@ -16,7 +16,7 @@ sys.path.append(os.environ['AIL_BIN']) # Import Project packages ################################## from modules.abstract_module import AbstractModule -from packages.Item import Item +from lib.objects.Items import Item from lib import regex_helper from lib import telegram @@ -78,7 +78,7 @@ class Telegram(AbstractModule): # CREATE TAG if invite_code_found: - #tags + # tags msg = f'infoleak:automatic-detection="telegram-invite-hash";{item.get_id()}' self.send_message_to_queue(msg, 'Tags') diff --git a/bin/modules/Urls.py b/bin/modules/Urls.py index 57e53da1..636a00d6 100755 --- a/bin/modules/Urls.py +++ b/bin/modules/Urls.py @@ -13,7 +13,6 @@ This module extract URLs from an item and send them to others modules. 
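A pattern that recurs across these hunks, and that the new Urls.compute() below relies on, is the defensive decode of faup results: pyfaup may return bytes or str depending on the build, so the value is decoded inside a try/except AttributeError. In isolation (sketch only, example URL is arbitrary):

from pyfaup.faup import Faup

faup = Faup()
faup.decode('http://example.com/index.php?id=1')
url_decoded = faup.get()
# pyfaup may return bytes or str for the 'url' field; normalize to str.
try:
    url = url_decoded['url'].decode()
except AttributeError:
    url = url_decoded['url']
print(url)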
# Import External packages ################################## import os -import re import sys from pyfaup.faup import Faup @@ -23,8 +22,7 @@ sys.path.append(os.environ['AIL_BIN']) # Import Project packages ################################## from modules.abstract_module import AbstractModule -from packages.Item import Item -from lib import regex_helper +from lib.objects.Items import Item # # TODO: Faup packages: Add new binding: Check TLD @@ -40,7 +38,6 @@ class Urls(AbstractModule): super(Urls, self).__init__() self.faup = Faup() - self.redis_cache_key = regex_helper.generate_redis_cache_key(self.module_name) # Protocol file path protocolsfile_path = os.path.join(os.environ['AIL_HOME'], @@ -58,21 +55,26 @@ class Urls(AbstractModule): # Send module state to logs self.redis_logger.info(f"Module {self.module_name} initialized") - def compute(self, message): """ Search for Web links from given message """ # Extract item - id, score = message.split() + item_id, score = message.split() - item = Item(id) + item = Item(item_id) item_content = item.get_content() - l_urls = regex_helper.regex_findall(self.module_name, self.redis_cache_key, self.url_regex, item.get_id(), item_content) + # TODO Handle invalid URL + l_urls = self.regex_findall(self.url_regex, item.get_id(), item_content) for url in l_urls: self.faup.decode(url) - unpack_url = self.faup.get() + url_decoded = self.faup.get() + # decode URL + try: + url = url_decoded['url'].decode() + except AttributeError: + url = url_decoded['url'] to_send = f"{url} {item.get_id()}" print(to_send) @@ -83,7 +85,7 @@ class Urls(AbstractModule): to_print = f'Urls;{item.get_source()};{item.get_date()};{item.get_basename()};' self.redis_logger.info(f'{to_print}Detected {len(l_urls)} URL;{item.get_id()}') -if __name__ == '__main__': +if __name__ == '__main__': module = Urls() module.run() diff --git a/bin/modules/Zerobins.py b/bin/modules/Zerobins.py index 3a9b1fa7..5fcf9355 100755 --- a/bin/modules/Zerobins.py +++ b/bin/modules/Zerobins.py @@ -11,9 +11,8 @@ This module spots zerobins-like services for further processing ################################## import os import sys -import time -import pdb import re + sys.path.append(os.environ['AIL_BIN']) ################################## # Import Project packages @@ -42,33 +41,31 @@ class Zerobins(AbstractModule): # Send module state to logs self.redis_logger.info(f'Module {self.module_name} initialized') - def computeNone(self): """ Compute when no message in queue """ self.redis_logger.debug("No message in queue") - def compute(self, message): - """regex_helper.regex_findall(self.module_name, self.redis_cache_key + """ Compute a message in queue """ - print(message) - url, id = message.split() + url, item_id = message.split() # Extract zerobins addresses - matching_binz = self.regex_findall(self.regex, id, url) + matching_binz = self.regex_findall(self.regex, item_id, url) if len(matching_binz) > 0: - for bin in matching_binz: - print("send {} to crawler".format(bin)) - crawlers.create_crawler_task(bin, screenshot=False, har=False, depth_limit=1, max_pages=1, auto_crawler=False, crawler_delta=3600, crawler_type=None, cookiejar_uuid=None, user_agent=None) + for bin_url in matching_binz: + print(f'send {bin_url} to crawler') + crawlers.add_crawler_task(bin_url, depth=0, har=False, screenshot=False, proxy='force_tor', + parent='manual', priority=10) self.redis_logger.debug("Compute message in queue") +# TODO TEST ME if __name__ == '__main__': - module = Zerobins() - module.run() \ No newline at end of file + 
module.run() diff --git a/bin/modules/abstract_module.py b/bin/modules/abstract_module.py index 32c60041..52b299b2 100644 --- a/bin/modules/abstract_module.py +++ b/bin/modules/abstract_module.py @@ -59,6 +59,9 @@ class AbstractModule(ABC): # Setup the I/O queues self.process = Process(self.queue_name) + # Debug Mode + self.debug = False + def get_message(self): """ Get message from the Redis Queue (QueueIn) @@ -104,6 +107,8 @@ class AbstractModule(ABC): # Module processing with the message from the queue self.compute(message) except Exception as err: + if self.debug: + raise err trace = traceback.format_tb(err.__traceback__) trace = ''.join(trace) self.redis_logger.critical(f"Error in module {self.module_name}: {err}") diff --git a/bin/modules/submit_paste.py b/bin/modules/submit_paste.py index 7777b301..9afc55b2 100755 --- a/bin/modules/submit_paste.py +++ b/bin/modules/submit_paste.py @@ -16,7 +16,6 @@ import os import sys import gzip import io -import redis import base64 import datetime import time @@ -51,6 +50,7 @@ class SubmitPaste(AbstractModule): """ super(SubmitPaste, self).__init__(queue_name='submit_paste') + # TODO KVROCKS self.r_serv_db = ConfigLoader.ConfigLoader().get_redis_conn("ARDB_DB") self.r_serv_log_submit = ConfigLoader.ConfigLoader().get_redis_conn("Redis_Log_submit") self.r_serv_tags = ConfigLoader.ConfigLoader().get_redis_conn("ARDB_Tags") @@ -61,7 +61,6 @@ class SubmitPaste(AbstractModule): self.PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], ConfigLoader.ConfigLoader().get_config_str("Directories", "pastes")) + '/' - def compute(self, uuid): """ Main method of the Module to implement @@ -129,7 +128,6 @@ class SubmitPaste(AbstractModule): self.redis_logger.debug(f'{self.module_name}, waiting for new message, Idling {self.pending_seconds}s') time.sleep(self.pending_seconds) - def _manage_text(self, uuid, paste_content, ltags, ltagsgalaxies, source): """ Create a paste for given text @@ -141,7 +139,6 @@ class SubmitPaste(AbstractModule): else: self.abord_file_submission(uuid, f'Text size is over {SubmitPaste.TEXT_MAX_SIZE} bytes') - def _manage_file(self, uuid, file_full_path, ltags, ltagsgalaxies, source): """ Create a paste for given file @@ -230,7 +227,6 @@ class SubmitPaste(AbstractModule): else: self.abord_file_submission(uuid, "Server Error, the archive can't be found") - def _is_compressed_type(self, file_type): """ Check if file type is in the list of compressed file extensions format @@ -239,7 +235,6 @@ class SubmitPaste(AbstractModule): return file_type in compressed_type - def remove_submit_uuid(self, uuid): # save temp value on disk self.r_serv_db.delete(f'{uuid}:ltags') @@ -262,7 +257,6 @@ class SubmitPaste(AbstractModule): self.redis_logger.debug(f'{uuid} all file submitted') print(f'{uuid} all file submitted') - def create_paste(self, uuid, paste_content, ltags, ltagsgalaxies, name, source=None): # # TODO: Use Item create @@ -272,8 +266,8 @@ class SubmitPaste(AbstractModule): source = source if source else 'submitted' save_path = source + '/' + now.strftime("%Y") + '/' + now.strftime("%m") + '/' + now.strftime("%d") + '/submitted_' + name + '.gz' - full_path = filename = os.path.join(os.environ['AIL_HOME'], - self.process.config.get("Directories", "pastes"), save_path) + full_path = os.path.join(os.environ['AIL_HOME'], + self.process.config.get("Directories", "pastes"), save_path) self.redis_logger.debug(f'file path of the paste {full_path}') @@ -281,7 +275,7 @@ class SubmitPaste(AbstractModule): # file not exists in AIL paste directory 
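The compress-and-encode step used just below is the usual gzip + base64 pairing (the change in this hunk mainly threads the submission uuid through so failures can be reported). As a standalone round-trip, with illustrative helper names rather than the module's own:

import gzip
import base64

def compress_encode(content: bytes) -> str:
    # gzip the raw bytes, then base64-encode so the result can travel through Redis as text.
    return base64.standard_b64encode(gzip.compress(content)).decode()

def decode_decompress(gzip64encoded: str) -> bytes:
    return gzip.decompress(base64.standard_b64decode(gzip64encoded))

assert decode_decompress(compress_encode(b'hello AIL')) == b'hello AIL'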
self.redis_logger.debug(f"new paste {paste_content}") - gzip64encoded = self._compress_encode_content(paste_content) + gzip64encoded = self._compress_encode_content(paste_content, uuid) if gzip64encoded: @@ -321,36 +315,30 @@ class SubmitPaste(AbstractModule): return result - - def _compress_encode_content(self, content): + def _compress_encode_content(self, content, uuid): gzip64encoded = None - try: gzipencoded = gzip.compress(content) gzip64encoded = base64.standard_b64encode(gzipencoded).decode() except: self.abord_file_submission(uuid, "file error") - return gzip64encoded - def addError(self, uuid, errorMessage): self.redis_logger.debug(errorMessage) print(errorMessage) error = self.r_serv_log_submit.get(f'{uuid}:error') - if error != None: + if error is not None: self.r_serv_log_submit.set(f'{uuid}:error', error + '
<br></br>
' + errorMessage) - self.r_serv_log_submit.incr(f'{uuid}:nb_end') - def abord_file_submission(self, uuid, errorMessage): self.redis_logger.debug(f'abord {uuid}, {errorMessage}') self.addError(uuid, errorMessage) self.r_serv_log_submit.set(f'{uuid}:end', 1) curr_date = datetime.date.today() - self.serv_statistics.hincrby(curr_date.strftime("%Y%m%d"),'submit_abord', 1) + self.serv_statistics.hincrby(curr_date.strftime("%Y%m%d"), 'submit_abord', 1) self.remove_submit_uuid(uuid) # # TODO: use Item function @@ -358,14 +346,13 @@ class SubmitPaste(AbstractModule): l_directory = item_filename.split('/') return f'{l_directory[-4]}{l_directory[-3]}{l_directory[-2]}' - def verify_extention_filename(self, filename): if not '.' in filename: return True else: file_type = filename.rsplit('.', 1)[1] - #txt file + # txt file if file_type in SubmitPaste.ALLOWED_EXTENSIONS: return True else: @@ -373,6 +360,5 @@ class SubmitPaste(AbstractModule): if __name__ == '__main__': - module = SubmitPaste() module.run() diff --git a/bin/packages/Date.py b/bin/packages/Date.py index ba234532..86646399 100644 --- a/bin/packages/Date.py +++ b/bin/packages/Date.py @@ -153,6 +153,9 @@ def sanitise_date_range(date_from, date_to, separator='', date_type='str'): date_from = date_to elif not date_to and date_from: date_to = date_from + elif not date_to and not date_from: + date = datetime.date.today().strftime("%Y%m%d") + return {"date_from": date, "date_to": date} if date_type=='str': # remove separators diff --git a/bin/packages/HiddenServices.py b/bin/packages/HiddenServices.py index 7b0c444a..6cd59d51 100755 --- a/bin/packages/HiddenServices.py +++ b/bin/packages/HiddenServices.py @@ -320,39 +320,6 @@ class HiddenServices(object): har_path = os.path.join(self.screenshot_directory, item_path) + '.json' return har_path - def create_domain_basic_archive(self, l_pastes): - all_har = self.get_all_har(l_pastes, filename=True) - all_screenshot = self.get_all_domain_screenshot(l_pastes, filename=True) - all_items = self.get_all_item_full_path(l_pastes, filename=True) - - # try: - - # zip buffer - zip_buffer = BytesIO() - - with zipfile.ZipFile(zip_buffer, "a") as zf: - - #print(all_har) - self.write_in_zip_buffer(zf, all_har) - self.write_in_zip_buffer(zf, all_screenshot) - self.write_in_zip_buffer(zf, all_items) - - # write map url - map_file_content = self.get_metadata_file(l_pastes).encode() - zf.writestr( '_URL_MAP_', BytesIO(map_file_content).getvalue()) - - zip_buffer.seek(0) - return zip_buffer - - # except Exception as e: - # print(e) - # return 'Server Error' - - def write_in_zip_buffer(self, zf, list_file): - for file_path, file_name in list_file: - with open(file_path, "rb") as f: - har_content = f.read() - zf.writestr( file_name, BytesIO(har_content).getvalue()) def get_metadata_file(self, list_items): file_content = '' diff --git a/bin/template.py b/bin/template.py index 88f17cbd..50714d63 100755 --- a/bin/template.py +++ b/bin/template.py @@ -20,7 +20,7 @@ sys.path.append(os.environ['AIL_BIN']) # Import Project packages ################################## from modules.abstract_module import AbstractModule - +# from lib.objects.Items import Item class Template(AbstractModule): """ @@ -36,19 +36,20 @@ class Template(AbstractModule): # Send module state to logs self.redis_logger.info(f'Module {self.module_name} initialized') - def computeNone(self): """ - Compute when no message in queue + Do something when there is no message in the queue """ self.redis_logger.debug("No message in queue") - def compute(self, message): 
""" - Compute a message in queue + Compute a message in queue / process the message (item_id, ...) """ self.redis_logger.debug("Compute message in queue") + # # if message is an item_id: + # item = Item(message) + # content = item.get_content() if __name__ == '__main__': diff --git a/bin/tor_fetcher.py b/bin/tor_fetcher.py deleted file mode 100644 index 67a2f4f8..00000000 --- a/bin/tor_fetcher.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 -# -*-coding:UTF-8 -* - -import socks -import socket -import urllib.request -import io -import gzip -import base64 -import sys -import tempfile - -# Max size in Mb -max_size = 5 - -def create_connection(address, timeout=None, source_address=None): - sock = socks.socksocket() - sock.connect(address) - return sock - - -def get_page(url, torclient_host='127.0.0.1', torclient_port=9050): - - request = urllib.request.Request(url) - # UA of the Tor browser bundle - request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0') - return urllib.request.urlopen(request, timeout=5).read(max_size * 100000) - -#FIXME don't work at all -def makegzip64(s): - - out = io.BytesIO() - - with gzip.GzipFile(fileobj=out, mode='ab') as fo: - fo.write(base64.standard_b64encode(s)) - - return out.getvalue() - - -if __name__ == "__main__": - - if len(sys.argv) != 2: - print('usage:', 'tor_fetcher.py', 'URL (base64 encoded)') - exit(1) - - try: - url = base64.standard_b64decode(sys.argv[1]).decode('utf8') - print(url) - except: - print('unable to decode') - exit(1) - - torclient_host = '127.0.0.1' - torclient_port = 9050 - # Setup Proxy - socks.set_default_proxy(socks.SOCKS5, torclient_host, torclient_port, True) - socket.socket = socks.socksocket - socket.create_connection = create_connection - - try: - page = get_page(url) - except: - print('unable to fetch') - exit(1) - - to_write = makegzip64(page) - t, path = tempfile.mkstemp() - #with open(path, 'w') as f: - #f.write(to_write) - print(path) - exit(0) diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py deleted file mode 100644 index 5d319c69..00000000 --- a/bin/torcrawler/TorSplashCrawler.py +++ /dev/null @@ -1,328 +0,0 @@ -#!/usr/bin/env python3 -# -*-coding:UTF-8 -* - -import os -import sys -import uuid -import datetime -import redis -import json -import time - -from hashlib import sha256 - -from scrapy.spidermiddlewares.httperror import HttpError -from twisted.internet.error import DNSLookupError -from twisted.internet.error import TimeoutError -from twisted.web._newclient import ResponseNeverReceived - -from scrapy import Spider -from scrapy.linkextractors import LinkExtractor -from scrapy.crawler import CrawlerProcess, Crawler - -from scrapy_splash import SplashRequest, SplashJsonResponse - -sys.path.append(os.environ['AIL_BIN']) -from Helper import Process - -sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib')) -import ConfigLoader -import Screenshot -import crawlers - -script_cookie = """ -function main(splash, args) - -- Default values - splash.js_enabled = true - splash.private_mode_enabled = true - splash.images_enabled = true - splash.webgl_enabled = true - splash.media_source_enabled = true - - -- Force enable things - splash.plugins_enabled = true - splash.request_body_enabled = true - splash.response_body_enabled = true - - splash.indexeddb_enabled = true - splash.html5_media_enabled = true - splash.http2_enabled = true - - -- User Agent - splash:set_user_agent(args.user_agent) - - -- User defined - splash.resource_timeout 
= args.resource_timeout - splash.timeout = args.timeout - - -- Allow to pass cookies - splash:init_cookies(args.cookies) - - -- Run - ok, reason = splash:go{args.url} - if not ok and not reason:find("http") then - return { - error = reason, - last_url = splash:url() - } - end - if reason == "http504" then - splash:set_result_status_code(504) - return '' - end - - splash:wait{args.wait} - -- Page instrumentation - -- splash.scroll_position = {y=1000} - -- splash:wait{args.wait} - -- Response - return { - har = splash:har(), - html = splash:html(), - png = splash:png{render_all=true}, - cookies = splash:get_cookies(), - last_url = splash:url(), - } -end -""" - -class TorSplashCrawler(): - - def __init__(self, splash_url, crawler_options): - self.process = CrawlerProcess({'LOG_ENABLED': True}) - self.crawler = Crawler(self.TorSplashSpider, { - 'USER_AGENT': crawler_options['user_agent'], # /!\ overwritten by lua script - 'SPLASH_URL': splash_url, - 'ROBOTSTXT_OBEY': False, - 'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723, - 'scrapy_splash.SplashMiddleware': 725, - 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810, - 'scrapy_splash.SplashDeduplicateArgsMiddleware': 100, - }, - 'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,}, - 'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter', - 'HTTPERROR_ALLOW_ALL': True, - 'RETRY_TIMES': 2, - 'CLOSESPIDER_PAGECOUNT': crawler_options['closespider_pagecount'], - 'DEPTH_LIMIT': crawler_options['depth_limit'], - 'SPLASH_COOKIES_DEBUG': False - }) - - def crawl(self, splash_url, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item): - self.process.crawl(self.crawler, splash_url=splash_url, type=type, crawler_options=crawler_options, date=date, requested_mode=requested_mode, url=url, domain=domain, port=port, cookies=cookies, original_item=original_item) - self.process.start() - - class TorSplashSpider(Spider): - name = 'TorSplashSpider' - - def __init__(self, splash_url, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item, *args, **kwargs): - self.splash_url = splash_url - self.domain_type = type - self.requested_mode = requested_mode - self.original_item = original_item - self.root_key = None - self.start_urls = url - self.domains = [domain] - self.port = str(port) - date_str = '{}/{}/{}'.format(date['date_day'][0:4], date['date_day'][4:6], date['date_day'][6:8]) - self.full_date = date['date_day'] - self.date_month = date['date_month'] - self.date_epoch = int(date['epoch']) - - self.user_agent = crawler_options['user_agent'] - self.png = crawler_options['png'] - self.har = crawler_options['har'] - self.cookies = cookies - - config_section = 'Crawler' - self.p = Process(config_section) - self.item_dir = os.path.join(self.p.config.get("Directories", "crawled"), date_str ) - - config_loader = ConfigLoader.ConfigLoader() - self.har_dir = os.path.join(config_loader.get_files_directory('har') , date_str ) - config_loader = None - - self.r_serv_log_submit = redis.StrictRedis( - host=self.p.config.get("Redis_Log_submit", "host"), - port=self.p.config.getint("Redis_Log_submit", "port"), - db=self.p.config.getint("Redis_Log_submit", "db"), - decode_responses=True) - - self.root_key = None - - def build_request_arg(self, cookies): - return {'wait': 10, - 'resource_timeout': 30, # /!\ Weird behaviour if timeout < resource_timeout /!\ - 'timeout': 30, - 'user_agent': self.user_agent, - 'cookies': cookies, - 
'lua_source': script_cookie - } - - def start_requests(self): - l_cookies = self.build_request_arg(self.cookies) - yield SplashRequest( - self.start_urls, - self.parse, - errback=self.errback_catcher, - endpoint='execute', - meta={'father': self.original_item, 'current_url': self.start_urls}, - args=l_cookies - ) - - # # TODO: remove duplicate and anchor - def parse(self,response): - #print(response.headers) - #print(response.status) - #print(response.meta) - #print(response.data) # # TODO: handle lua script error - #{'type': 'ScriptError', 'info': {'error': "'}' expected (to close '{' at line 47) near 'error_retry'", - #'message': '[string "..."]:53: \'}\' expected (to close \'{\' at line 47) near \'error_retry\'', - #'type': 'LUA_INIT_ERROR', 'source': '[string "..."]', 'line_number': 53}, - #'error': 400, 'description': 'Error happened while executing Lua script'} - if response.status == 504: - # no response - #print('504 detected') - pass - - # LUA ERROR # # TODO: logs errors - elif 'error' in response.data: - if(response.data['error'] == 'network99'): - ## splash restart ## - error_retry = response.meta.get('error_retry', 0) - if error_retry < 3: - error_retry += 1 - url = response.data['last_url'] - father = response.meta['father'] - - self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url) - time.sleep(10) - if 'cookies' in response.data: - all_cookies = response.data['cookies'] # # TODO: use initial cookie ????? - else: - all_cookies = [] - l_cookies = self.build_request_arg(all_cookies) - yield SplashRequest( - url, - self.parse, - errback=self.errback_catcher, - endpoint='execute', - dont_filter=True, - meta={'father': father, 'current_url': url, 'error_retry': error_retry}, - args=l_cookies - ) - else: - if self.requested_mode == 'test': - crawlers.save_test_ail_crawlers_result(False, 'Connection to proxy refused') - print('Connection to proxy refused') - elif response.data['error'] == 'network3': - if self.requested_mode == 'test': - crawlers.save_test_ail_crawlers_result(False, 'HostNotFoundError: the remote host name was not found (invalid hostname)') - print('HostNotFoundError: the remote host name was not found (invalid hostname)') - else: - if self.requested_mode == 'test': - crawlers.save_test_ail_crawlers_result(False, response.data['error']) - print(response.data['error']) - - elif response.status != 200: - print('other response: {}'.format(response.status)) - # detect connection to proxy refused - error_log = (json.loads(response.body.decode())) - print(error_log) - #elif crawlers.is_redirection(self.domains[0], response.data['last_url']): - # pass # ignore response - else: - ## TEST MODE ## - if self.requested_mode == 'test': - if 'It works!' 
in response.data['html']: - crawlers.save_test_ail_crawlers_result(True, 'It works!') - else: - print('TEST ERROR') - crawlers.save_test_ail_crawlers_result(False, 'TEST ERROR') - return - ## -- ## - - item_id = crawlers.create_item_id(self.item_dir, self.domains[0]) - self.save_crawled_item(item_id, response.data['html']) - crawlers.create_item_metadata(item_id, self.domains[0], response.data['last_url'], self.port, response.meta['father']) - - if self.root_key is None: - self.root_key = item_id - crawlers.add_domain_root_item(item_id, self.domain_type, self.domains[0], self.date_epoch, self.port) - crawlers.create_domain_metadata(self.domain_type, self.domains[0], self.port, self.full_date, self.date_month) - - if 'cookies' in response.data: - all_cookies = response.data['cookies'] - else: - all_cookies = [] - - # SCREENSHOT - if 'png' in response.data and self.png: - sha256_string = Screenshot.save_crawled_screeshot(response.data['png'], 5000000, f_save=self.requested_mode) - if sha256_string: - Screenshot.save_item_relationship(sha256_string, item_id) - Screenshot.save_domain_relationship(sha256_string, self.domains[0]) - # HAR - if 'har' in response.data and self.har: - crawlers.save_har(self.har_dir, item_id, response.data['har']) - - le = LinkExtractor(allow_domains=self.domains, unique=True) - for link in le.extract_links(response): - l_cookies = self.build_request_arg(all_cookies) - yield SplashRequest( - link.url, - self.parse, - errback=self.errback_catcher, - endpoint='execute', - meta={'father': item_id, 'current_url': link.url}, - args=l_cookies - ) - - def errback_catcher(self, failure): - # catch all errback failures, - self.logger.error(repr(failure)) - - if failure.check(ResponseNeverReceived): - ## DEBUG ## - self.logger.error(failure.request) - if failure.value.response: - self.logger.error(failure.value.response) - ## ----- ## - - # Extract request metadata - url = failure.request.meta['current_url'] - father = failure.request.meta['father'] - l_cookies = self.build_request_arg(failure.request.meta['splash']['args']['cookies']) - - # Check if Splash restarted - if not crawlers.is_splash_reachable(self.splash_url): - self.logger.error('Splash, ResponseNeverReceived for %s, retry in 30s ...', url) - time.sleep(30) - - yield SplashRequest( - url, - self.parse, - errback=self.errback_catcher, - endpoint='execute', - meta={'father': father, 'current_url': url}, - args=l_cookies - ) - - else: - self.logger.error(failure.type) - self.logger.error(failure.getErrorMessage()) - - def save_crawled_item(self, item_id, item_content): - gzip64encoded = crawlers.save_crawled_item(item_id, item_content) - - # Send item to queue - # send paste to Global - relay_message = "{0} {1}".format(item_id, gzip64encoded) - self.p.populate_set_out(relay_message, 'Mixer') - - # increase nb of paste by feeder name - self.r_serv_log_submit.hincrby("mixer_cache:list_feeder", "crawler", 1) - - # tag crawled paste - msg = 'infoleak:submission="crawler";{}'.format(item_id) - self.p.populate_set_out(msg, 'Tags') diff --git a/bin/torcrawler/launch_splash_crawler.sh b/bin/torcrawler/launch_splash_crawler.sh deleted file mode 100755 index 87884a57..00000000 --- a/bin/torcrawler/launch_splash_crawler.sh +++ /dev/null @@ -1,80 +0,0 @@ -#!/bin/bash - -issplashed=`screen -ls | egrep '[0-9]+.Docker_Splash' | cut -d. 
-f1` - -usage() { echo "Usage: sudo $0 [-f ] [-p ] [-n ]" 1>&2; - echo " -f: absolute path to splash docker proxy-profiles directory (used for proxy configuration)"; - echo " -p: number of the first splash server port number. This number is incremented for the others splash server"; - echo " -n: number of splash servers to start"; - echo ""; - echo " -options:"; - echo " -u: max unbound in-memory cache (Mb, Restart Splash when full, default=3000 Mb)"; - echo ""; - echo "example:"; - echo "sudo ./launch_splash_crawler.sh -f /home/my_user/AIL-framework/configs/docker/splash_onion/etc/splash/proxy-profiles/ -p 8050 -n 3"; - exit 1; - } - -while getopts ":p:f:n:u:" o; do - case "${o}" in - p) - p=${OPTARG} - ;; - f) - f=${OPTARG} - ;; - n) - n=${OPTARG} - ;; - u) - u=${OPTARG} - ;; - *) - usage - ;; - esac -done -shift $((OPTIND-1)) - -if [ -z "${p}" ] || [ -z "${f}" ] || [ -z "${n}" ]; then - usage; -fi - -RED="\\033[1;31m" -DEFAULT="\\033[0;39m" -GREEN="\\033[1;32m" -WHITE="\\033[0;02m" - -if [ "$EUID" -ne 0 ]; then - echo -e $RED"\t* Please run as root or sudo.\n"$DEFAULT - exit 1 -fi - -if [ ! -d "${f}" ]; then - printf "$RED\n Error -f, proxy-profiles directory: $WHITE${f}$RED not found\n$DEFAULT Please check if you enter the correct path\n" - exit 1 -fi - -if [ ! -f "${f}default.ini" ]; then - printf "$RED\n Error -f, proxy configuration file:$WHITE default.ini$RED not found\n$DEFAULT Please check if you enter the correct path\n" - exit 1 -fi - -if [[ $issplashed ]]; then - echo -e $RED"\t* A screen is already launched, please kill it before creating another one."$DEFAULT - exit 1 -fi - -if [ -z "${u}" ]; then - u=3000; -fi - -screen -dmS "Docker_Splash" -sleep 0.1 - -for ((i=0;i<=$((${n} - 1));i++)); do - port_number=$((${p} + $i)) - screen -S "Docker_Splash" -X screen -t "docker_splash:$port_number" bash -c 'sudo docker run -d -p '$port_number':8050 --restart=always --cpus=1 --memory=2G -v '$f':/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash --maxrss '$u'; read x' - sleep 0.1 - printf "$GREEN Splash server launched on port $port_number$DEFAULT\n" -done diff --git a/bin/torcrawler/tor_crawler.py b/bin/torcrawler/tor_crawler.py deleted file mode 100755 index 3f493b84..00000000 --- a/bin/torcrawler/tor_crawler.py +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env python3 -# -*-coding:UTF-8 -* - -import os -import sys -import json -import redis -from TorSplashCrawler import TorSplashCrawler - -sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/')) -import ConfigLoader -import crawlers - -if __name__ == '__main__': - - if len(sys.argv) != 2: - print('usage:', 'tor_crawler.py', 'uuid') - exit(1) - - - config_loader = ConfigLoader.ConfigLoader() - redis_cache = config_loader.get_redis_conn("Redis_Cache") - config_loader = None - - # get crawler config key - uuid = sys.argv[1] - - # get configs - crawler_json = json.loads(redis_cache.get('crawler_request:{}'.format(uuid))) - - splash_url = crawler_json['splash_url'] - service_type = crawler_json['service_type'] - url = crawler_json['url'] - domain = crawler_json['domain'] - port = crawler_json['port'] - original_item = crawler_json['item'] - crawler_options = crawler_json['crawler_options'] - date = crawler_json['date'] - requested_mode = crawler_json['requested'] - - if crawler_options['cookiejar_uuid']: - cookies = crawlers.load_crawler_cookies(crawler_options['cookiejar_uuid'], domain, crawler_type=service_type) - else: - cookies = [] - - redis_cache.delete('crawler_request:{}'.format(uuid)) - - try: - crawler = 
TorSplashCrawler(splash_url, crawler_options) - crawler.crawl(splash_url, service_type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item) - except Exception as e: - print(e) - print(e, file=sys.stderr) diff --git a/bin/trackers/Retro_Hunt.py b/bin/trackers/Retro_Hunt.py index 037520fc..434eaae0 100755 --- a/bin/trackers/Retro_Hunt.py +++ b/bin/trackers/Retro_Hunt.py @@ -10,7 +10,6 @@ The Retro_Hunt trackers module # Import External packages ################################## import os -import re import sys import time import yara @@ -20,15 +19,15 @@ sys.path.append(os.environ['AIL_BIN']) # Import Project packages ################################## from modules.abstract_module import AbstractModule -from packages.Item import Item -from packages.Item import Date +from lib.objects.Items import Item +from packages import Date from lib import Tracker import NotificationHelper # # TODO: refractor class Retro_Hunt(AbstractModule): - #mail_body_template = "AIL Framework,\nNew YARA match: {}\nitem id: {}\nurl: {}{}" + # mail_body_template = "AIL Framework,\nNew YARA match: {}\nitem id: {}\nurl: {}{}" """ Retro_Hunt module for AIL framework @@ -39,9 +38,6 @@ class Retro_Hunt(AbstractModule): self.full_item_url = self.process.config.get("Notifications", "ail_domain") + "/object/item?id=" - self.refresh_deleta = 10 - self.last_refresh = 0 - # reset on each loop self.task_uuid = None self.date_from = 0 @@ -49,13 +45,12 @@ class Retro_Hunt(AbstractModule): self.nb_src_done = 0 self.progress = 0 self.item = None + self.tags = [] self.redis_logger.info(f"Module: {self.module_name} Launched") - # # TODO: send mails - # # TODO: # start_time - # end_time + # # TODO: # start_time # end_time def compute(self, task_uuid): self.redis_logger.warning(f'{self.module_name}, starting Retro hunt task {task_uuid}') @@ -75,7 +70,7 @@ class Retro_Hunt(AbstractModule): self.tags = Tracker.get_retro_hunt_task_tags(task_uuid) curr_date = Tracker.get_retro_hunt_task_current_date(task_uuid) self.nb_src_done = Tracker.get_retro_hunt_task_nb_src_done(task_uuid, sources=sources) - self.progress = self.update_progress(sources, curr_date) + self.update_progress(sources, curr_date) # iterate on date filter_last = True while int(curr_date) <= int(self.date_to): @@ -91,14 +86,15 @@ class Retro_Hunt(AbstractModule): self.redis_logger.debug(f'{self.module_name}, Retro Hunt searching in directory {dir}') l_obj = Tracker.get_items_to_analyze(dir) for id in l_obj: - #print(f'{dir} / {id}') + # print(f'{dir} / {id}') self.item = Item(id) # save current item in cache Tracker.set_cache_retro_hunt_task_id(task_uuid, id) self.redis_logger.debug(f'{self.module_name}, Retro Hunt rule {task_uuid}, searching item {id}') - yara_match = rule.match(data=self.item.get_content(), callback=self.yara_rules_match, which_callbacks=yara.CALLBACK_MATCHES, timeout=timeout) + yara_match = rule.match(data=self.item.get_content(), callback=self.yara_rules_match, + which_callbacks=yara.CALLBACK_MATCHES, timeout=timeout) # save last item if nb_id % 10 == 0: # # TODO: Add nb before save in DB @@ -110,7 +106,7 @@ class Retro_Hunt(AbstractModule): self.update_progress(sources, curr_date) if Tracker.check_retro_hunt_pause(task_uuid): Tracker.set_retro_hunt_last_analyzed(task_uuid, id) - #self.update_progress(sources, curr_date, save_db=True) + # self.update_progress(sources, curr_date, save_db=True) Tracker.pause_retro_hunt_task(task_uuid) Tracker.clear_retro_hunt_task_cache(task_uuid) return None @@ -142,7 +138,7 @@ class 
Retro_Hunt(AbstractModule): def yara_rules_match(self, data): id = self.item.get_id() - #print(data) + # print(data) task_uuid = data['namespace'] self.redis_logger.info(f'{self.module_name}, Retro hunt {task_uuid} match found: {id}') @@ -177,9 +173,9 @@ class Retro_Hunt(AbstractModule): if task_uuid: # Module processing with the message from the queue self.redis_logger.debug(task_uuid) - #try: + # try: self.compute(task_uuid) - #except Exception as err: + # except Exception as err: # self.redis_logger.error(f'Error in module {self.module_name}: {err}') # # Remove uuid ref # self.remove_submit_uuid(uuid) diff --git a/bin/trackers/Tracker_Regex.py b/bin/trackers/Tracker_Regex.py index a5d4b4e9..0a939b77 100755 --- a/bin/trackers/Tracker_Regex.py +++ b/bin/trackers/Tracker_Regex.py @@ -9,7 +9,6 @@ It processes every item coming from the global module and test the regex """ import os -import re import sys import time import requests @@ -19,10 +18,9 @@ sys.path.append(os.environ['AIL_BIN']) # Import Project packages ################################## from modules.abstract_module import AbstractModule -from packages.Item import Item +from lib.objects.Items import Item from packages import Term from lib import Tracker -from lib import regex_helper import NotificationHelper @@ -42,8 +40,6 @@ class Tracker_Regex(AbstractModule): self.full_item_url = self.process.config.get("Notifications", "ail_domain") + "/object/item?id=" - self.redis_cache_key = regex_helper.generate_redis_cache_key(self.module_name) - # refresh Tracked Regex self.dict_regex_tracked = Term.get_regex_tracked_words_dict() self.last_refresh = time.time() @@ -63,7 +59,7 @@ class Tracker_Regex(AbstractModule): item_content = item.get_content() for regex in self.dict_regex_tracked: - matched = regex_helper.regex_search(self.module_name, self.redis_cache_key, self.dict_regex_tracked[regex], item_id, item_content, max_time=self.max_execution_time) + matched = self.regex_findall(self.dict_regex_tracked[regex], item_id, item_content) if matched: self.new_tracker_found(regex, 'regex', item) @@ -92,8 +88,8 @@ class Tracker_Regex(AbstractModule): if mail_to_notify: mail_subject = Tracker.get_email_subject(tracker_uuid) mail_body = Tracker_Regex.mail_body_template.format(tracker, item_id, self.full_item_url, item_id) - for mail in mail_to_notify: - NotificationHelper.sendEmailNotification(mail, mail_subject, mail_body) + for mail in mail_to_notify: + NotificationHelper.sendEmailNotification(mail, mail_subject, mail_body) # Webhook webhook_to_post = Term.get_term_webhook(tracker_uuid) diff --git a/bin/trackers/Tracker_Term.py b/bin/trackers/Tracker_Term.py index c23b1077..b4ed5a87 100755 --- a/bin/trackers/Tracker_Term.py +++ b/bin/trackers/Tracker_Term.py @@ -22,7 +22,7 @@ sys.path.append(os.environ['AIL_BIN']) ################################## from modules.abstract_module import AbstractModule import NotificationHelper -from packages.Item import Item +from lib.objects.Items import Item from packages import Term from lib import Tracker @@ -96,7 +96,7 @@ class Tracker_Term(AbstractModule): # Term.create_token_statistics(item_date, word, dict_words_freq[word]) # check solo words - ####### # TODO: check if source needed ####### + # ###### # TODO: check if source needed ####### for word in self.list_tracked_words: if word in dict_words_freq: self.new_term_found(word, 'word', item) @@ -136,10 +136,10 @@ class Tracker_Term(AbstractModule): if mail_to_notify: mail_subject = Tracker.get_email_subject(term_uuid) mail_body = 
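Tracker_Regex drops the standalone regex_helper and calls the regex_findall() helper inherited from AbstractModule instead. That helper is not shown in this diff; functionally it amounts to a findall over the item content (the real one presumably still guards against catastrophic patterns with a timeout, as regex_helper did). An illustrative stand-in:

import re

def regex_findall_stub(pattern, obj_id, content):
    # Illustrative only: no timeout protection, unlike the real helper.
    matches = re.findall(pattern, content)
    if matches:
        print(f'{obj_id}: {len(matches)} match(es) for {pattern}')
    return matches

tracked = {r'\b4[0-9]{12}(?:[0-9]{3})?\b': 'visa-like card number'}
content = 'dump: 4024007132849695 ...'
for pattern, label in tracked.items():
    if regex_findall_stub(pattern, 'tests/2021/01/01/credit_cards.gz', content):
        print('tracker hit:', label)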
Tracker_Term.mail_body_template.format(term, item_id, self.full_item_url, item_id) - for mail in mail_to_notify: - self.redis_logger.debug(f'Send Mail {mail_subject}') - print(f'S print(item_content)end Mail {mail_subject}') - NotificationHelper.sendEmailNotification(mail, mail_subject, mail_body) + for mail in mail_to_notify: + self.redis_logger.debug(f'Send Mail {mail_subject}') + print(f'S print(item_content)end Mail {mail_subject}') + NotificationHelper.sendEmailNotification(mail, mail_subject, mail_body) # Webhook webhook_to_post = Term.get_term_webhook(term_uuid) @@ -162,7 +162,6 @@ class Tracker_Term(AbstractModule): self.redis_logger.error(f"Webhook request failed for {webhook_to_post}\nReason: Something went wrong") - if __name__ == '__main__': module = Tracker_Term() module.run() diff --git a/bin/trackers/Tracker_Yara.py b/bin/trackers/Tracker_Yara.py index 3b23fbee..e5194178 100755 --- a/bin/trackers/Tracker_Yara.py +++ b/bin/trackers/Tracker_Yara.py @@ -8,7 +8,6 @@ # Import External packages ################################## import os -import re import sys import time import yara @@ -20,10 +19,10 @@ sys.path.append(os.environ['AIL_BIN']) ################################## from modules.abstract_module import AbstractModule from packages import Term -from packages.Item import Item +from lib.objects.Items import Item from lib import Tracker -import NotificationHelper # # TODO: refactor +import NotificationHelper # # TODO: refactor class Tracker_Yara(AbstractModule): @@ -46,7 +45,6 @@ class Tracker_Yara(AbstractModule): self.redis_logger.info(f"Module: {self.module_name} Launched") - def compute(self, item_id): # refresh YARA list if self.last_refresh < Tracker.get_tracker_last_updated_by_type('yara'): @@ -58,7 +56,8 @@ class Tracker_Yara(AbstractModule): self.item = Item(item_id) item_content = self.item.get_content() try: - yara_match = self.rules.match(data=item_content, callback=self.yara_rules_match, which_callbacks=yara.CALLBACK_MATCHES, timeout=60) + yara_match = self.rules.match(data=item_content, callback=self.yara_rules_match, + which_callbacks=yara.CALLBACK_MATCHES, timeout=60) if yara_match: self.redis_logger.info(f'{self.item.get_id()}: {yara_match}') print(f'{self.item.get_id()}: {yara_match}') @@ -91,10 +90,10 @@ class Tracker_Yara(AbstractModule): if mail_to_notify: mail_subject = Tracker.get_email_subject(tracker_uuid) mail_body = Tracker_Yara.mail_body_template.format(data['rule'], item_id, self.full_item_url, item_id) - for mail in mail_to_notify: - self.redis_logger.debug(f'Send Mail {mail_subject}') - print(f'Send Mail {mail_subject}') - NotificationHelper.sendEmailNotification(mail, mail_subject, mail_body) + for mail in mail_to_notify: + self.redis_logger.debug(f'Send Mail {mail_subject}') + print(f'Send Mail {mail_subject}') + NotificationHelper.sendEmailNotification(mail, mail_subject, mail_body) # Webhook webhook_to_post = Term.get_term_webhook(tracker_uuid) @@ -116,7 +115,6 @@ class Tracker_Yara(AbstractModule): except: self.redis_logger.error(f"Webhook request failed for {webhook_to_post}\nReason: Something went wrong") - return yara.CALLBACK_CONTINUE diff --git a/configs/core.cfg.sample b/configs/core.cfg.sample index 962cd2ff..05db26e1 100644 --- a/configs/core.cfg.sample +++ b/configs/core.cfg.sample @@ -262,14 +262,10 @@ db = 0 [Crawler] activate_crawler = False -crawler_depth_limit = 1 -default_crawler_har = True -default_crawler_png = True -default_crawler_closespider_pagecount = 50 -default_crawler_user_agent = Mozilla/5.0 (Windows NT 10.0; 
rv:78.0) Gecko/20100101 Firefox/78.0 -splash_url = http://127.0.0.1 -splash_port = 8050-8052 -domain_proxy = onion.foundation +default_depth_limit = 1 +default_har = True +default_screenshot = True +onion_proxy = onion.foundation [IP] # list of comma-separated CIDR that you wish to be alerted for. e.g: diff --git a/installing_deps.sh b/installing_deps.sh index 9b8f4d32..c1513c39 100755 --- a/installing_deps.sh +++ b/installing_deps.sh @@ -94,7 +94,7 @@ DEFAULT_HOME=$(pwd) #### KVROCKS #### test ! -d kvrocks/ && git clone https://github.com/apache/incubator-kvrocks.git kvrocks pushd kvrocks -./build.sh build +./x.py build popd DEFAULT_KVROCKS_DATA=$DEFAULT_HOME/DATA_KVROCKS diff --git a/requirements.txt b/requirements.txt index f2ebbc1c..0361a083 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,12 @@ pyail +pylacus pymisp>=2.4.144 d4-pyclient>=0.1.6 thehive4py # Core -redis==2.10.6 +redis==3.0.0 python-magic>0.4.15 yara-python>4.0.2 diff --git a/tests/test_modules.py b/tests/test_modules.py index e65edcc7..0731b26d 100644 --- a/tests/test_modules.py +++ b/tests/test_modules.py @@ -40,6 +40,7 @@ class Test_Module_ApiKey(unittest.TestCase): def setUp(self): self.module_obj = ApiKey() + self.module_obj.debug = True def test_module(self): item_id = 'tests/2021/01/01/api_keys.gz' @@ -56,6 +57,7 @@ class Test_Module_Categ(unittest.TestCase): def setUp(self): self.module_obj = Categ() + self.module_obj.debug = True def test_module(self): item_id = 'tests/2021/01/01/categ.gz' @@ -69,14 +71,15 @@ class Test_Module_CreditCards(unittest.TestCase): def setUp(self): self.module_obj = CreditCards() + self.module_obj.debug = True def test_module(self): item_id = 'tests/2021/01/01/credit_cards.gz 7' - test_cards = ['341039324930797', # American Express - '6011613905509166', # Discover Card - '3547151714018657', # Japan Credit Bureau (JCB) - '5492981206527330', # 16 digits MasterCard - '4024007132849695', # '4532525919781' # 16-digit VISA, with separators + test_cards = ['341039324930797', # American Express + '6011613905509166', # Discover Card + '3547151714018657', # Japan Credit Bureau (JCB) + '5492981206527330', # 16 digits MasterCard + '4024007132849695', # '4532525919781' # 16-digit VISA, with separators ] result = self.module_obj.compute(item_id, r_result=True) @@ -86,6 +89,7 @@ class Test_Module_DomClassifier(unittest.TestCase): def setUp(self): self.module_obj = DomClassifier() + self.module_obj.debug = True def test_module(self): test_host = 'foo.be' @@ -98,6 +102,7 @@ class Test_Module_Global(unittest.TestCase): def setUp(self): self.module_obj = Global() + self.module_obj.debug = True def test_module(self): # # TODO: delete item @@ -138,6 +143,7 @@ class Test_Module_Keys(unittest.TestCase): def setUp(self): self.module_obj = Keys() + self.module_obj.debug = True def test_module(self): item_id = 'tests/2021/01/01/keys.gz' @@ -148,6 +154,7 @@ class Test_Module_Onion(unittest.TestCase): def setUp(self): self.module_obj = Onion() + self.module_obj.debug = True def test_module(self): item_id = 'tests/2021/01/01/onion.gz' @@ -157,7 +164,7 @@ class Test_Module_Onion(unittest.TestCase): self.module_obj.compute(f'{item_id} 3') if crawlers.is_crawler_activated(): - ## check domain queues + # # check domain queues # all domains queue self.assertTrue(crawlers.is_domain_in_queue('onion', domain_1)) # all url/item queue @@ -177,11 +184,13 @@ class Test_Module_Telegram(unittest.TestCase): def setUp(self): self.module_obj = Telegram() + self.module_obj.debug = True def test_module(self): item_id 
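The [Crawler] section of core.cfg.sample shown above shrinks to four options: default_depth_limit, default_har, default_screenshot and onion_proxy. A quick standard-library check of a local core.cfg against the new names (the path and fallback values are illustrative):

import configparser
import os

cfg = configparser.ConfigParser()
cfg.read(os.path.join(os.environ.get('AIL_HOME', '.'), 'configs', 'core.cfg'))

crawler = cfg['Crawler']
print('activate_crawler   :', crawler.getboolean('activate_crawler', fallback=False))
print('default_depth_limit:', crawler.getint('default_depth_limit', fallback=1))
print('default_har        :', crawler.getboolean('default_har', fallback=True))
print('default_screenshot :', crawler.getboolean('default_screenshot', fallback=True))
print('onion_proxy        :', crawler.get('onion_proxy', fallback='onion.foundation'))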
= 'tests/2021/01/01/keys.gz' # # TODO: check results result = self.module_obj.compute(item_id) + if __name__ == '__main__': unittest.main() diff --git a/bin/empty_queue.py b/tools/empty_queue.py similarity index 95% rename from bin/empty_queue.py rename to tools/empty_queue.py index 5b763a32..51f1b633 100755 --- a/bin/empty_queue.py +++ b/tools/empty_queue.py @@ -12,10 +12,8 @@ Requirements: """ -import redis import os import time -from packages import Paste from pubsublogger import publisher from Helper import Process diff --git a/tools/extract_cryptocurrency.py b/tools/extract_cryptocurrency.py index da5f4d00..a5c601dd 100755 --- a/tools/extract_cryptocurrency.py +++ b/tools/extract_cryptocurrency.py @@ -37,8 +37,9 @@ def get_object_correlation_json(correlation_id, subtype, max_nodes): object_type = 'cryptocurrency' max_nodes = sanitise_nb_max_nodes(max_nodes) + # FIXME # ALL correlations - correlation_names = Correlate_object.sanitise_correlation_names('') + #correlation_names = Correlate_object.sanitise_correlation_names('') #correlation_objects = Correlate_object.sanitise_correlation_objects('') correlation_objects = ['domain'] diff --git a/var/www/Flask_server.py b/var/www/Flask_server.py index ae7a95ab..01833bb4 100755 --- a/var/www/Flask_server.py +++ b/var/www/Flask_server.py @@ -29,12 +29,10 @@ sys.path.append(os.environ['AIL_BIN']) ################################## # Import Project packages ################################## +from lib.ConfigLoader import ConfigLoader from lib.Users import User from lib import Tag -sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/')) -import ConfigLoader - # Import config import Flask_config @@ -50,14 +48,14 @@ from blueprints.hunters import hunters from blueprints.old_endpoints import old_endpoints from blueprints.ail_2_ail_sync import ail_2_ail_sync from blueprints.settings_b import settings_b +from blueprints.objects_cve import objects_cve from blueprints.objects_decoded import objects_decoded -from blueprints.objects_range import objects_range Flask_dir = os.environ['AIL_FLASK'] # CONFIG # -config_loader = ConfigLoader.ConfigLoader() +config_loader = ConfigLoader() baseUrl = config_loader.get_config_str("Flask", "baseurl") host = config_loader.get_config_str("Flask", "host") baseUrl = baseUrl.replace('/', '') @@ -111,8 +109,8 @@ app.register_blueprint(hunters, url_prefix=baseUrl) app.register_blueprint(old_endpoints, url_prefix=baseUrl) app.register_blueprint(ail_2_ail_sync, url_prefix=baseUrl) app.register_blueprint(settings_b, url_prefix=baseUrl) +app.register_blueprint(objects_cve, url_prefix=baseUrl) app.register_blueprint(objects_decoded, url_prefix=baseUrl) -app.register_blueprint(objects_range, url_prefix=baseUrl) # ========= =========# # ========= Cookie name ======== @@ -162,33 +160,32 @@ for root, dirs, files in os.walk(os.path.join(Flask_dir, 'modules')): if name == 'Flask_config.py': continue name = name.strip('.py') - #print('importing {}'.format(name)) importlib.import_module(name) elif name == 'header_{}.html'.format(module_name): with open(join(root, name), 'r') as f: to_add_to_header_dico[module_name] = f.read() -#create header.html +# create header.html complete_header = "" with open(os.path.join(Flask_dir, 'templates', 'header_base.html'), 'r') as f: complete_header = f.read() modified_header = complete_header -#Add the header in the supplied order +# Add the header in the supplied order for module_name, txt in list(to_add_to_header_dico.items()): to_replace = ''.format(module_name) if to_replace in 
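Flask_server.py swaps the objects_range blueprint for the new objects_cve one; the wiring is the usual import plus register_blueprint with the configured base URL. A stripped-down sketch of that pattern, without the login decorators and config loading AIL adds around it:

from flask import Blueprint, Flask

# In AIL this Blueprint lives in var/www/blueprints/objects_cve.py
objects_cve = Blueprint('objects_cve', __name__)

@objects_cve.route('/objects/cve')
def objects_cves():
    return 'CVE daterange view'

app = Flask(__name__)
# Flask_server.py passes url_prefix=baseUrl, taken from the [Flask] section of the config.
app.register_blueprint(objects_cve)

if __name__ == '__main__':
    app.run(port=7000)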
complete_header: modified_header = modified_header.replace(to_replace, txt) del to_add_to_header_dico[module_name] -#Add the header for no-supplied order +# Add the header for no-supplied order to_add_to_header = [] for module_name, txt in to_add_to_header_dico.items(): to_add_to_header.append(txt) modified_header = modified_header.replace('', '\n'.join(to_add_to_header)) -#Write the header.html file +# Write the header.html file with open(os.path.join(Flask_dir, 'templates', 'header.html'), 'w') as f: f.write(modified_header) @@ -250,6 +247,7 @@ def page_not_found(e): # avoid endpoint enumeration return render_template('error/404.html'), 404 + # ========== INITIAL taxonomies ============ default_taxonomies = ["infoleak", "gdpr", "fpf", "dark-web"] diff --git a/var/www/blueprints/correlation.py b/var/www/blueprints/correlation.py index f66e2630..6dc6fa5c 100644 --- a/var/www/blueprints/correlation.py +++ b/var/www/blueprints/correlation.py @@ -26,22 +26,6 @@ sys.path.append(os.environ['AIL_BIN']) ################################## from lib.objects import ail_objects -################################################################################ - - -sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib')) -import Correlate_object -import Domain -import Screenshot -import btc_ail -import Username - -sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages')) -import Cryptocurrency -import Pgp -import Decoded -import Tag - bootstrap_label = Flask_config.bootstrap_label vt_enabled = Flask_config.vt_enabled @@ -74,77 +58,15 @@ def sanitise_nb_max_nodes(nb_max_nodes): nb_max_nodes = 300 return nb_max_nodes -def sanitise_correlation_names(correlation_names): - ''' - correlation_names ex = 'pgp,crypto' - ''' - all_correlation_names = Correlate_object.get_all_correlation_names() - if correlation_names is None: - return all_correlation_names - else: - l_correlation_names = [] - for correl in correlation_names.split(','): - if correl in all_correlation_names: - l_correlation_names.append(correl) - if l_correlation_names: - return l_correlation_names - else: - return all_correlation_names - -def sanitise_correlation_objects(correlation_objects): - ''' - correlation_objects ex = 'domain,decoded' - ''' - all_correlation_objects = Correlate_object.get_all_correlation_objects() - if correlation_objects is None: - return all_correlation_objects - else: - l_correlation_objects = [] - for correl in correlation_objects.split(','): - if correl in all_correlation_objects: - l_correlation_objects.append(correl) - if l_correlation_objects: - return l_correlation_objects - else: - return all_correlation_objects - -def get_card_metadata(object_type, correlation_id, type_id=None, expand_card=False): - card_dict = {} - if object_type == 'cryptocurrency': - card_dict["sparkline"] = Cryptocurrency.cryptocurrency.get_list_nb_previous_correlation_object(type_id, correlation_id, 6) - card_dict["icon"] = Correlate_object.get_correlation_node_icon(object_type, type_id) - if type_id == 'bitcoin' and expand_card: - card_dict["related_btc"] = btc_ail.get_bitcoin_info(correlation_id) - elif object_type == 'pgp': - card_dict["sparkline"] = Pgp.pgp.get_list_nb_previous_correlation_object(type_id, correlation_id, 6) - card_dict["icon"] = Correlate_object.get_correlation_node_icon(object_type, type_id) - elif object_type == 'username': - card_dict["sparkline"] = Username.correlation.get_list_nb_previous_correlation_object(type_id, correlation_id, 6) - card_dict["icon"] = 
Correlate_object.get_correlation_node_icon(object_type, type_id) - elif object_type == 'decoded': - card_dict["sparkline"] = Decoded.get_list_nb_previous_hash(correlation_id, 6) - card_dict["icon"] = Correlate_object.get_correlation_node_icon(object_type, value=correlation_id) - card_dict["vt"] = Decoded.get_decoded_vt_report(correlation_id) - card_dict["vt"]["status"] = vt_enabled - card_dict["add_tags_modal"] = Tag.get_modal_add_tags(correlation_id, object_type='decoded') - elif object_type == 'domain': - card_dict["icon"] = Correlate_object.get_correlation_node_icon(object_type, value=correlation_id) - card_dict["tags"] = Domain.get_domain_tags(correlation_id) - elif object_type == 'screenshot': - card_dict["add_tags_modal"] = Tag.get_modal_add_tags(correlation_id, object_type='image') - elif object_type == 'paste': - card_dict["icon"] = Correlate_object.get_correlation_node_icon(object_type, value=correlation_id) - return card_dict - # ============= ROUTES ============== -@correlation.route('/correlation/show_correlation', methods=['GET', 'POST']) # GET + POST +@correlation.route('/correlation/show', methods=['GET', 'POST']) # GET + POST @login_required @login_read_only def show_correlation(): if request.method == 'POST': - object_type = request.form.get('object_type') - type_id = request.form.get('type_id') - correlation_id = request.form.get('correlation_id') + object_type = request.form.get('obj_type') + subtype = request.form.get('subtype') + obj_id = request.form.get('obj_id') max_nodes = request.form.get('max_nb_nodes_in') mode = request.form.get('mode') if mode: @@ -153,73 +75,71 @@ def show_correlation(): mode = 'union' ## get all selected correlations - correlation_names = [] - correlation_objects = [] - #correlation_names + filter_types = [] + correl_option = request.form.get('CveCheck') + if correl_option: + filter_types.append('cve') correl_option = request.form.get('CryptocurrencyCheck') if correl_option: - correlation_names.append('cryptocurrency') + filter_types.append('cryptocurrency') correl_option = request.form.get('PgpCheck') if correl_option: - correlation_names.append('pgp') + filter_types.append('pgp') correl_option = request.form.get('UsernameCheck') if correl_option: - correlation_names.append('username') + filter_types.append('username') correl_option = request.form.get('DecodedCheck') if correl_option: - correlation_names.append('decoded') + filter_types.append('decoded') correl_option = request.form.get('ScreenshotCheck') if correl_option: - correlation_names.append('screenshot') + filter_types.append('screenshot') # correlation_objects correl_option = request.form.get('DomainCheck') if correl_option: - correlation_objects.append('domain') - correl_option = request.form.get('PasteCheck') + filter_types.append('domain') + correl_option = request.form.get('ItemCheck') if correl_option: - correlation_objects.append('item') + filter_types.append('item') # list as params - correlation_names = ",".join(correlation_names) - correlation_objects = ",".join(correlation_objects) + filter_types = ",".join(filter_types) # redirect to keep history and bookmark - return redirect(url_for('correlation.show_correlation', object_type=object_type, type_id=type_id, correlation_id=correlation_id, mode=mode, - max_nodes=max_nodes, correlation_names=correlation_names, correlation_objects=correlation_objects)) + return redirect(url_for('correlation.show_correlation', type=object_type, subtype=subtype, id=obj_id, mode=mode, + max_nodes=max_nodes, filter=filter_types)) # 
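The correlation view is reworked around shorter query parameters: type, subtype, id, filter (a comma-separated list of object types), max_nodes and mode; the POST handler simply redirects to the GET endpoint with those names. Building such a URL client-side, with placeholder values:

from urllib.parse import urlencode

params = {
    'type': 'decoded',
    'subtype': '',
    'id': 'placeholder-object-id',
    'mode': 'union',
    'max_nodes': 300,
    'filter': ','.join(['domain', 'item', 'pgp', 'cryptocurrency']),
}
print('/correlation/show?' + urlencode(params))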
request.method == 'GET' else: - object_type = request.args.get('object_type') - type_id = request.args.get('type_id') - correlation_id = request.args.get('correlation_id') + obj_type = request.args.get('type') + subtype = request.args.get('subtype', '') + obj_id = request.args.get('id') max_nodes = sanitise_nb_max_nodes(request.args.get('max_nodes')) mode = sanitise_graph_mode(request.args.get('mode')) - expand_card = request.args.get('expand_card') + related_btc = bool(request.args.get('expand_card', False)) - correlation_names = ail_objects.sanitize_objs_types(request.args.get('correlation_names', '').split(',')) - correlation_objects = ail_objects.sanitize_objs_types(request.args.get('correlation_objects', '').split(',')) + filter_types = ail_objects.sanitize_objs_types(request.args.get('filter', '').split(',')) # # TODO: remove me, rename screenshot to image - if object_type == 'image': - object_type == 'screenshot' + if obj_type == 'image': + obj_type = 'screenshot' - # check if correlation_id exist - if not Correlate_object.exist_object(object_type, correlation_id, type_id=type_id): + # check if obj_id exist + if not ail_objects.exists_obj(obj_type, subtype, obj_id): abort(404) # return 404 - # oject exist + # object exist else: - dict_object = {"object_type": object_type, "correlation_id": correlation_id} - dict_object["max_nodes"] = max_nodes - dict_object["mode"] = mode - dict_object["correlation_names"] = correlation_names - dict_object["correlation_names_str"] = ",".join(correlation_names) - dict_object["correlation_objects"] = correlation_objects - dict_object["correlation_objects_str"] = ",".join(correlation_objects) - dict_object["metadata"] = Correlate_object.get_object_metadata(object_type, correlation_id, type_id=type_id) - if type_id: - dict_object["metadata"]['type_id'] = type_id - dict_object["metadata_card"] = get_card_metadata(object_type, correlation_id, type_id=type_id, expand_card=expand_card) + dict_object = {"object_type": obj_type, + "correlation_id": obj_id, + "max_nodes": max_nodes, "mode": mode, + "filter": filter_types, "filter_str": ",".join(filter_types), + "metadata": ail_objects.get_object_meta(obj_type, subtype, obj_id, flask_context=True) + } + print(dict_object) + if subtype: + dict_object["metadata"]['type_id'] = subtype + dict_object["metadata_card"] = ail_objects.get_object_card_meta(obj_type, subtype, obj_id, related_btc=related_btc) return render_template("show_correlation.html", dict_object=dict_object, bootstrap_label=bootstrap_label) @correlation.route('/correlation/get/description') @@ -254,19 +174,17 @@ def get_description(): @login_required @login_read_only def graph_node_json(): - obj_id = request.args.get('correlation_id') #######################3 - subtype = request.args.get('type_id') ####################### - obj_type = request.args.get('object_type') ####################### + obj_id = request.args.get('id') + subtype = request.args.get('subtype') + obj_type = request.args.get('type') max_nodes = sanitise_nb_max_nodes(request.args.get('max_nodes')) - correlation_names = ail_objects.sanitize_objs_types(request.args.get('correlation_names', '').split(',')) - correlation_objects = ail_objects.sanitize_objs_types(request.args.get('correlation_objects', '').split(',')) + filter_types = ail_objects.sanitize_objs_types(request.args.get('filter', '').split(',')) # # TODO: remove me, rename screenshot if obj_type == 'image': - obj_type == 'screenshot' + obj_type = 'screenshot' - filter_types = correlation_names + correlation_objects json_graph 
= ail_objects.get_correlations_graph_node(obj_type, subtype, obj_id, filter_types=filter_types, max_nodes=max_nodes, level=2, flask_context=True) #json_graph = Correlate_object.get_graph_node_object_correlation(obj_type, obj_id, 'union', correlation_names, correlation_objects, requested_correl_type=subtype, max_nodes=max_nodes) return jsonify(json_graph) diff --git a/var/www/blueprints/crawler_splash.py b/var/www/blueprints/crawler_splash.py index 5a6de5af..bd9b43d1 100644 --- a/var/www/blueprints/crawler_splash.py +++ b/var/www/blueprints/crawler_splash.py @@ -6,11 +6,13 @@ ''' import os -import sys import json import random +import sys +import time +from datetime import datetime -from flask import Flask, render_template, jsonify, request, Blueprint, redirect, url_for, Response, make_response +from flask import render_template, jsonify, request, Blueprint, redirect, url_for, Response, send_file, abort from flask_login import login_required, current_user, login_user, logout_user sys.path.append('modules') @@ -19,15 +21,6 @@ import Flask_config # Import Role_Manager from Role_Manager import login_admin, login_analyst, login_read_only -sys.path.append(os.environ['AIL_BIN']) -################################## -# Import Project packages -################################## - - -sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages')) -import Tag - sys.path.append(os.environ['AIL_BIN']) ################################## @@ -36,6 +29,10 @@ sys.path.append(os.environ['AIL_BIN']) from lib import crawlers from lib import Language from lib.objects import Domains +from lib.objects.Items import Item +from lib import Tag + +from packages import Date from lib import Domain # # # # # # # # # # # # # # # # TODO: @@ -50,9 +47,9 @@ crawler_splash = Blueprint('crawler_splash', __name__, template_folder=os.path.j # ============ FUNCTIONS ============ -def api_validator(api_response): - if api_response: - return Response(json.dumps(api_response[0], indent=2, sort_keys=True), mimetype='application/json'), api_response[1] +def api_validator(message, code): + if message and code: + return Response(json.dumps(message, indent=2, sort_keys=True), mimetype='application/json'), code def create_json_response(data, status_code): return Response(json.dumps(data, indent=2, sort_keys=True), mimetype='application/json'), status_code @@ -62,26 +59,26 @@ def create_json_response(data, status_code): @login_required @login_read_only def crawlers_dashboard(): - # # TODO: get splash manager status - is_manager_connected = crawlers.get_splash_manager_connection_metadata() - all_splash_crawler_status = crawlers.get_all_spash_crawler_status() - splash_crawlers_latest_stats = crawlers.get_splash_crawler_latest_stats() + is_manager_connected = crawlers.get_lacus_connection_metadata() + crawlers_status = crawlers.get_crawler_capture_status() + print(crawlers_status) + crawlers_latest_stats = crawlers.get_crawlers_stats() + print(crawlers_latest_stats) date = crawlers.get_current_date() - - return render_template("dashboard_splash_crawler.html", all_splash_crawler_status = all_splash_crawler_status, - is_manager_connected=is_manager_connected, date=date, - splash_crawlers_latest_stats=splash_crawlers_latest_stats) + return render_template("dashboard_crawler.html", date=date, + is_manager_connected=is_manager_connected, + crawlers_status=crawlers_status, + crawlers_latest_stats=crawlers_latest_stats) @crawler_splash.route("/crawlers/crawler_dashboard_json", methods=['GET']) @login_required @login_read_only def 
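crawler_dashboard_json() now returns two keys, crawlers_status (from crawlers.get_crawler_capture_status()) and stats (from crawlers.get_crawlers_stats()). Polling it from a script, assuming the default self-signed HTTPS setup on port 7000 and an already authenticated session (login handling is omitted):

import requests

session = requests.Session()
session.verify = False          # AIL's bundled certificate is self-signed
resp = session.get('https://127.0.0.1:7000/crawlers/crawler_dashboard_json')
if resp.ok:
    data = resp.json()
    print(data.get('crawlers_status'))
    print(data.get('stats'))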
crawler_dashboard_json(): + crawlers_status = crawlers.get_crawler_capture_status() + crawlers_latest_stats = crawlers.get_crawlers_stats() - all_splash_crawler_status = crawlers.get_all_spash_crawler_status() - splash_crawlers_latest_stats = crawlers.get_splash_crawler_latest_stats() - - return jsonify({'all_splash_crawler_status': all_splash_crawler_status, - 'splash_crawlers_latest_stats':splash_crawlers_latest_stats}) + return jsonify({'crawlers_status': crawlers_status, + 'stats': crawlers_latest_stats}) @crawler_splash.route("/crawlers/manual", methods=['GET']) @login_required @@ -89,12 +86,12 @@ def crawler_dashboard_json(): def manual(): user_id = current_user.get_id() l_cookiejar = crawlers.api_get_cookies_list_select(user_id) - all_crawlers_types = crawlers.get_all_crawlers_queues_types() - all_splash_name = crawlers.get_all_crawlers_to_launch_splash_name() + crawlers_types = crawlers.get_crawler_all_types() + proxies = [] # TODO HANDLE PROXIES return render_template("crawler_manual.html", - is_manager_connected=crawlers.get_splash_manager_connection_metadata(), - all_crawlers_types=all_crawlers_types, - all_splash_name=all_splash_name, + is_manager_connected=crawlers.get_lacus_connection_metadata(), + crawlers_types=crawlers_types, + proxies=proxies, l_cookiejar=l_cookiejar) @crawler_splash.route("/crawlers/send_to_spider", methods=['POST']) @@ -106,17 +103,16 @@ def send_to_spider(): # POST val url = request.form.get('url_to_crawl') crawler_type = request.form.get('crawler_queue_type') - splash_name = request.form.get('splash_name') - auto_crawler = request.form.get('crawler_type') - crawler_delta = request.form.get('crawler_epoch') + proxy = request.form.get('proxy_name') + auto_crawler = request.form.get('crawler_type') # TODO Auto Crawler + crawler_delta = request.form.get('crawler_epoch') # TODO Auto Crawler screenshot = request.form.get('screenshot') har = request.form.get('har') depth_limit = request.form.get('depth_limit') - max_pages = request.form.get('max_pages') cookiejar_uuid = request.form.get('cookiejar') - if splash_name: - crawler_type = splash_name + if crawler_type == 'onion': + proxy = 'force_tor' if cookiejar_uuid: if cookiejar_uuid == 'None': @@ -125,13 +121,55 @@ def send_to_spider(): cookiejar_uuid = cookiejar_uuid.rsplit(':') cookiejar_uuid = cookiejar_uuid[-1].replace(' ', '') - res = crawlers.api_create_crawler_task(user_id, url, screenshot=screenshot, har=har, depth_limit=depth_limit, max_pages=max_pages, - crawler_type=crawler_type, - auto_crawler=auto_crawler, crawler_delta=crawler_delta, cookiejar_uuid=cookiejar_uuid) - if res: + data = {'url': url, 'depth': depth_limit, 'har': har, 'screenshot': screenshot} + if proxy: + data['proxy'] = proxy + if cookiejar_uuid: + data['cookiejar'] = cookiejar_uuid + res = crawlers.api_add_crawler_task(data, user_id=user_id) + + if res[1] != 200: return create_json_response(res[0], res[1]) return redirect(url_for('crawler_splash.manual')) +@crawler_splash.route("/crawlers/last/domains", methods=['GET']) +@login_required +@login_read_only +def crawlers_last_domains(): + domain_type = request.args.get('type') + if domain_type not in crawlers.get_crawler_all_types(): + return jsonify({'error': 'Invalid domain type'}), 400 + + # TODO STAT by EPOCH + domains = [] + for domain_row in crawlers.get_last_crawled_domains(domain_type): + domain, epoch = domain_row.split(':', 1) + dom = Domains.Domain(domain) + meta = dom.get_meta() + meta['epoch'] = epoch + meta['status_epoch'] = dom.is_up_by_epoch(epoch) + 
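send_to_spider() now builds a plain dict and hands it to crawlers.api_add_crawler_task(), which returns an (answer, status_code) pair; url, depth, har and screenshot are always set, proxy and cookiejar only when provided, and the onion queue forces the 'force_tor' proxy. An illustrative direct call, assuming the AIL environment is loaded and using placeholder values:

import os
import sys

sys.path.append(os.environ['AIL_BIN'])
from lib import crawlers

data = {
    'url': 'http://example.onion',      # placeholder target
    'depth': 1,
    'har': True,
    'screenshot': True,
    'proxy': 'force_tor',               # what the blueprint sets for the onion queue
}
# user_id is whatever current_user.get_id() returns in the blueprint; placeholder here
answer, status_code = crawlers.api_add_crawler_task(data, user_id='admin@admin.test')
print(status_code, answer)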
domains.append(meta) + crawler_stats = crawlers.get_crawlers_stats(domain_type=domain_type) + + now = datetime.now() + date = now.strftime("%Y%m%d") + date_string = '{}-{}-{}'.format(date[0:4], date[4:6], date[6:8]) + return render_template("last_crawled.html", domains=domains, type=domain_type, + is_manager_connected=crawlers.get_lacus_connection_metadata(), + date_from=date_string, date_to=date_string, + crawler_stats=crawler_stats) + +@crawler_splash.route('/crawlers/last/domains/json') +@login_required +@login_read_only +def crawlers_last_domains_json(): + domain_type = request.args.get('type') + if domain_type not in crawlers.get_crawler_all_types(): + return jsonify({'error': 'Invalid domain type'}), 400 + stats = [] + for date in Date.get_date_range(7): + stats.append(crawlers.get_crawlers_stats_by_day(date, domain_type)) + return jsonify(stats) #### Domains #### @@ -143,36 +181,69 @@ def showDomain(): if request.method == 'POST': domain_name = request.form.get('in_show_domain') epoch = None - port = None else: domain_name = request.args.get('domain') epoch = request.args.get('epoch') - port = request.args.get('port') - - res = api_validator(Domain.api_verify_if_domain_exist(domain_name)) - if res: - return res + try: + epoch = int(epoch) + except (ValueError, TypeError): + epoch = None domain = Domains.Domain(domain_name) - dom = Domain.Domain(domain_name, port=port) + if not domain.exists(): + abort(404) - dict_domain = dom.get_domain_metadata() - dict_domain['domain'] = domain_name - if dom.domain_was_up(): + dict_domain = domain.get_meta(options=['last_origin', 'languages']) + dict_domain['domain'] = domain.id + if domain.was_up(): dict_domain = {**dict_domain, **domain.get_correlations()} - print(dict_domain) dict_domain['correlation_nb'] = len(dict_domain['decoded']) + len(dict_domain['username']) + len(dict_domain['pgp']) + len(dict_domain['cryptocurrency']) + len(dict_domain['screenshot']) - dict_domain['father'] = dom.get_domain_father() - dict_domain['languages'] = Language.get_languages_from_iso(dom.get_domain_languages(), sort=True) - dict_domain['tags'] = dom.get_domain_tags() dict_domain['tags_safe'] = Tag.is_tags_safe(dict_domain['tags']) - dict_domain['history'] = dom.get_domain_history_with_status() - dict_domain['crawler_history'] = dom.get_domain_items_crawled(items_link=True, epoch=epoch, item_screenshot=True, item_tag=True) # # TODO: handle multiple port - if dict_domain['crawler_history'].get('items', []): - dict_domain['crawler_history']['random_item'] = random.choice(dict_domain['crawler_history']['items']) + dict_domain['history'] = domain.get_history(status=True) + curr_epoch = None + # Select valid epoch + if epoch: + for row in dict_domain['history']: + if row['epoch'] == epoch: + curr_epoch = row['epoch'] + break + else: + curr_epoch = -1 + for row in dict_domain['history']: + if row['epoch'] > curr_epoch: + curr_epoch = row['epoch'] + dict_domain['epoch'] = curr_epoch + dict_domain["date"] = time.strftime('%Y/%m/%d - %H:%M.%S', time.gmtime(curr_epoch)) - return render_template("showDomain.html", dict_domain=dict_domain, bootstrap_label=bootstrap_label, - modal_add_tags=Tag.get_modal_add_tags(dict_domain['domain'], object_type="domain")) + print(dict_domain['epoch']) + + dict_domain['crawler_history_items'] = [] + for item_id in domain.get_crawled_items_by_epoch(epoch): + dict_domain['crawler_history_items'].append(Item(item_id).get_meta(options=['crawler'])) + if dict_domain['crawler_history_items']: + dict_domain['random_item'] = 
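showDomain() now sanitises the epoch argument itself and then picks which crawl to display: the requested epoch if it appears in the domain history, otherwise the most recent one. A compact equivalent of that selection, with fabricated history entries:

def select_epoch(requested, history):
    """Return the epoch to display: the requested one if known, else the latest."""
    try:
        requested = int(requested)
    except (ValueError, TypeError):
        requested = None
    if requested is not None and any(row['epoch'] == requested for row in history):
        return requested
    return max((row['epoch'] for row in history), default=-1)

history = [{'epoch': 1640995200, 'status': True}, {'epoch': 1643673600, 'status': False}]
print(select_epoch('1640995200', history))   # 1640995200
print(select_epoch(None, history))           # 1643673600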
random.choice(dict_domain['crawler_history_items']) + + return render_template("showDomain.html", + dict_domain=dict_domain, bootstrap_label=bootstrap_label, + modal_add_tags=Tag.get_modal_add_tags(dict_domain['domain'], object_type="domain")) + +@crawler_splash.route('/crawlers/domain/download', methods=['GET']) +@login_required +@login_read_only +def crawlers_domain_download(): + domain = request.args.get('domain') + epoch = request.args.get('epoch') + try: + epoch = int(epoch) + except (ValueError, TypeError): + epoch = None + dom = Domains.Domain(domain) + if not dom.exists(): + abort(404) + zip_file = dom.get_download_zip(epoch=epoch) + if not zip_file: + abort(404) + return send_file(zip_file, download_name=f'{dom.get_id()}.zip', as_attachment=True) @crawler_splash.route('/domains/explorer/domain_type_post', methods=['POST']) @login_required @@ -304,13 +375,36 @@ def domains_search_name(): l_dict_domains=l_dict_domains, bootstrap_label=bootstrap_label, domains_types=domains_types) -@crawler_splash.route('/domains/TODO', methods=['GET']) +@crawler_splash.route('/domains/date', methods=['GET']) @login_required @login_analyst -def domains_todo(): +def domains_search_date(): + # TODO sanitize type + date domain_type = request.args.get('type') - last_domains = Domain.get_last_crawled_domains(domain_type) + date_from = request.args.get('date_from') + date_to = request.args.get('date_to') + # page = request.args.get('page') + date = Date.sanitise_date_range(date_from, date_to) + domains_date = Domains.get_domains_by_daterange(date['date_from'], date['date_to'], domain_type) + dict_domains = {} + for d in domains_date: + dict_domains[d] = Domains.get_domains_meta(domains_date[d]) + date_from = f"{date['date_from'][0:4]}-{date['date_from'][4:6]}-{date['date_from'][6:8]}" + date_to = f"{date['date_to'][0:4]}-{date['date_to'][4:6]}-{date['date_to'][6:8]}" + + return render_template("domains_daterange.html", date_from=date_from, date_to=date_to, + bootstrap_label=bootstrap_label, + dict_domains=dict_domains, type=domain_type) + +@crawler_splash.route('/domains/date/post', methods=['POST']) +@login_required +@login_analyst +def domains_search_date_post(): + domain_type = request.form.get('type') + date_from = request.form.get('date_from') + date_to = request.form.get('date_to') + return redirect(url_for('crawler_splash.domains_search_date', date_from=date_from, date_to=date_to, type=domain_type)) ##-- --## @@ -521,49 +615,8 @@ def crawler_cookiejar_cookie_json_add_post(): return redirect(url_for('crawler_splash.crawler_cookiejar_cookie_add', cookiejar_uuid=cookiejar_uuid)) -@crawler_splash.route('/crawler/settings', methods=['GET']) -@login_required -@login_analyst -def crawler_splash_setings(): - all_proxies = crawlers.get_all_proxies_metadata() - all_splash = crawlers.get_all_splash_crawler_metadata() - splash_manager_url = crawlers.get_splash_manager_url() - api_key = crawlers.get_hidden_splash_api_key() - is_manager_connected = crawlers.get_splash_manager_connection_metadata(force_ping=True) +#--- Cookiejar ---# - nb_crawlers_to_launch = crawlers.get_nb_crawlers_to_launch() - #crawler_full_config = Config_DB.get_full_config_by_section('crawler') - is_crawler_working = crawlers.is_test_ail_crawlers_successful() - crawler_error_mess = crawlers.get_test_ail_crawlers_message() - - return render_template("settings_splash_crawler.html", - is_manager_connected=is_manager_connected, - splash_manager_url=splash_manager_url, api_key=api_key, - all_splash=all_splash, all_proxies=all_proxies, - 
nb_crawlers_to_launch=nb_crawlers_to_launch, - is_crawler_working=is_crawler_working, - crawler_error_mess=crawler_error_mess, - #crawler_full_config=crawler_full_config - ) - -@crawler_splash.route('/crawler/settings/crawler_manager', methods=['GET', 'POST']) -@login_required -@login_admin -def crawler_splash_setings_crawler_manager(): - if request.method == 'POST': - splash_manager_url = request.form.get('splash_manager_url') - api_key = request.form.get('api_key') - - res = crawlers.api_save_splash_manager_url_api({'url':splash_manager_url, 'api_key':api_key}) - if res[1] != 200: - return Response(json.dumps(res[0], indent=2, sort_keys=True), mimetype='application/json'), res[1] - else: - return redirect(url_for('crawler_splash.crawler_splash_setings')) - else: - splash_manager_url = crawlers.get_splash_manager_url() - api_key = crawlers.get_splash_api_key() - return render_template("settings_edit_splash_crawler_manager.html", - splash_manager_url=splash_manager_url, api_key=api_key) @crawler_splash.route('/crawler/settings/crawlers_to_lauch', methods=['GET', 'POST']) @login_required @@ -583,13 +636,6 @@ def crawler_splash_setings_crawlers_to_lauch(): return render_template("settings_edit_crawlers_to_launch.html", nb_crawlers_to_launch=nb_crawlers_to_launch) -@crawler_splash.route('/crawler/settings/test_crawler', methods=['GET']) -@login_required -@login_admin -def crawler_splash_setings_test_crawler(): - crawlers.test_ail_crawlers() - return redirect(url_for('crawler_splash.crawler_splash_setings')) - @crawler_splash.route('/crawler/settings/relaunch_crawler', methods=['GET']) @login_required @login_admin @@ -598,3 +644,59 @@ def crawler_splash_setings_relaunch_crawler(): return redirect(url_for('crawler_splash.crawler_splash_setings')) ## - - ## + +#### LACUS #### + +@crawler_splash.route('/crawler/settings', methods=['GET']) +@login_required +@login_analyst +def crawler_settings(): + lacus_url = crawlers.get_lacus_url() + api_key = crawlers.get_hidden_lacus_api_key() + + is_manager_connected = crawlers.get_lacus_connection_metadata(force_ping=True) + is_crawler_working = crawlers.is_test_ail_crawlers_successful() + crawler_error_mess = crawlers.get_test_ail_crawlers_message() + + # TODO REGISTER PROXY + # all_proxies = crawlers.get_all_proxies_metadata() + + # nb_crawlers_to_launch = crawlers.get_nb_crawlers_to_launch() + # crawler_full_config = Config_DB.get_full_config_by_section('crawler') + + return render_template("settings_crawler.html", + is_manager_connected=is_manager_connected, + lacus_url=lacus_url, api_key=api_key, + #all_proxies=all_proxies, + #nb_crawlers_to_launch=nb_crawlers_to_launch, + is_crawler_working=is_crawler_working, + crawler_error_mess=crawler_error_mess, + ) + +@crawler_splash.route('/crawler/settings/crawler/manager', methods=['GET', 'POST']) +@login_required +@login_admin +def crawler_lacus_settings_crawler_manager(): + if request.method == 'POST': + lacus_url = request.form.get('lacus_url') + api_key = request.form.get('api_key') + + res = crawlers.api_save_lacus_url_key({'url': lacus_url, 'api_key': api_key}) + print(res) + if res[1] != 200: + return Response(json.dumps(res[0], indent=2, sort_keys=True), mimetype='application/json'), res[1] + else: + return redirect(url_for('crawler_splash.crawler_settings')) + else: + lacus_url = crawlers.get_lacus_url() + api_key = crawlers.get_lacus_api_key() + return render_template("settings_edit_lacus_crawler.html", lacus_url=lacus_url, api_key=api_key) + +@crawler_splash.route('/crawler/settings/crawler/test', 
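The settings page is rebuilt around Lacus: crawler_lacus_settings_crawler_manager() reads lacus_url and api_key from the form and passes them to crawlers.api_save_lacus_url_key(), again getting back an (answer, status_code) pair. An illustrative call, assuming the AIL environment is loaded; the Lacus URL and empty API key are assumptions:

import os
import sys

sys.path.append(os.environ['AIL_BIN'])
from lib import crawlers

payload = {'url': 'http://127.0.0.1:7100', 'api_key': ''}   # assumed local Lacus instance
answer, status_code = crawlers.api_save_lacus_url_key(payload)
print(status_code, answer)
print(crawlers.get_lacus_connection_metadata(force_ping=True))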
methods=['GET']) +@login_required +@login_admin +def crawler_settings_crawler_test(): + crawlers.test_ail_crawlers() + return redirect(url_for('crawler_splash.crawler_settings')) + +#--- LACUS ---# \ No newline at end of file diff --git a/var/www/blueprints/investigations_b.py b/var/www/blueprints/investigations_b.py index 0f9e7723..003a3e84 100644 --- a/var/www/blueprints/investigations_b.py +++ b/var/www/blueprints/investigations_b.py @@ -53,7 +53,7 @@ def show_investigation(): investigation_uuid = request.args.get("uuid") investigation = Investigations.Investigation(investigation_uuid) metadata = investigation.get_metadata(r_str=True) - objs = ail_objects.get_objects_meta(investigation.get_objects(), icon=True, url=True, flask_context=True) + objs = ail_objects.get_objects_meta(investigation.get_objects(), flask_context=True) return render_template("view_investigation.html", bootstrap_label=bootstrap_label, metadata=metadata, investigation_objs=objs) diff --git a/var/www/blueprints/objects_cve.py b/var/www/blueprints/objects_cve.py new file mode 100644 index 00000000..4dc098ec --- /dev/null +++ b/var/www/blueprints/objects_cve.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +''' + Blueprint Flask: crawler splash endpoints: dashboard, onion crawler ... +''' + +import os +import sys +import json + +from flask import Flask, render_template, jsonify, request, Blueprint, redirect, url_for, Response, abort, send_file +from flask_login import login_required, current_user + +# Import Role_Manager +from Role_Manager import login_admin, login_analyst, login_read_only + +sys.path.append(os.environ['AIL_BIN']) +################################## +# Import Project packages +################################## +from lib.objects import Cves +from packages import Date + +# ============ BLUEPRINT ============ +objects_cve = Blueprint('objects_cve', __name__, template_folder=os.path.join(os.environ['AIL_FLASK'], 'templates/objects/cve')) + +# ============ VARIABLES ============ +bootstrap_label = ['primary', 'success', 'danger', 'warning', 'info'] + + +# ============ FUNCTIONS ============ +@objects_cve.route("/objects/cve", methods=['GET']) +@login_required +@login_read_only +def objects_cves(): + date_from = request.args.get('date_from') + date_to = request.args.get('date_to') + show_objects = request.args.get('show_objects') + date = Date.sanitise_date_range(date_from, date_to) + date_from = date['date_from'] + date_to = date['date_to'] + + # barchart_type + # correlation_type_search_endpoint + + dict_objects = Cves.api_get_cves_meta_by_daterange(date_from, date_to) + print(date_from, date_to, dict_objects) + return render_template("CveDaterange.html", date_from=date_from, date_to=date_to, + dict_objects=dict_objects, show_objects=show_objects) + +@objects_cve.route("/objects/cve/post", methods=['POST']) +@login_required +@login_read_only +def objects_cves_post(): + date_from = request.form.get('date_from') + date_to = request.form.get('date_to') + show_objects = request.form.get('show_objects') + return redirect(url_for('objects_cve.objects_cves', date_from=date_from, date_to=date_to, show_objects=show_objects)) + +@objects_cve.route("/objects/cve/range/json", methods=['GET']) +@login_required +@login_read_only +def objects_cve_range_json(): + return None + +@objects_cve.route("/objects/cve/search", methods=['POST']) +@login_required +@login_read_only +def objects_cve_search(): + to_search = request.form.get('object_id') + + # TODO SANITIZE ID + # TODO Search all + cve = 
Cves.Cve(to_search) + if not cve.exists(): + abort(404) + else: + return redirect(cve.get_link(flask_context=True)) + +# ============= ROUTES ============== + diff --git a/var/www/blueprints/objects_item.py b/var/www/blueprints/objects_item.py index 2c6b6ea5..27903c54 100644 --- a/var/www/blueprints/objects_item.py +++ b/var/www/blueprints/objects_item.py @@ -36,16 +36,16 @@ bootstrap_label = ['primary', 'success', 'danger', 'warning', 'info'] # ============= ROUTES ============== -@objects_item.route("/object/item") #completely shows the paste in a new tab +@objects_item.route("/object/item") @login_required @login_read_only -def showItem(): # # TODO: support post +def showItem(): # # TODO: support post item_id = request.args.get('id') if not item_id or not item_basic.exist_item(item_id): abort(404) item = Item(item_id) - meta = item.get_meta(options=set(['content', 'crawler', 'duplicates', 'lines', 'size'])) + meta = item.get_meta(options=['content', 'crawler', 'duplicates', 'lines', 'size']) meta['name'] = meta['id'].replace('/', ' / ') meta['father'] = item_basic.get_item_parent(item_id) @@ -94,4 +94,4 @@ def item_download(): # # TODO: support post if not item_id or not item_basic.exist_item(item_id): abort(404) item = Item(item_id) - return send_file(item.get_raw_content(), attachment_filename=item_id, as_attachment=True) + return send_file(item.get_raw_content(), download_name=item_id, as_attachment=True) diff --git a/var/www/modules/PasteSubmit/Flask_PasteSubmit.py b/var/www/modules/PasteSubmit/Flask_PasteSubmit.py index b5d1a9af..5af36456 100644 --- a/var/www/modules/PasteSubmit/Flask_PasteSubmit.py +++ b/var/www/modules/PasteSubmit/Flask_PasteSubmit.py @@ -17,7 +17,6 @@ import redis import unicodedata import uuid from io import BytesIO -from Date import Date from functools import wraps @@ -31,9 +30,9 @@ from flask_login import login_required # Import Project packages ################################## from lib import Tag +from lib.objects.Items import Item -import Paste -import Import_helper +from packages import Import_helper from pytaxonomies import Taxonomies from pymispgalaxies import Galaxies, Clusters @@ -98,8 +97,6 @@ def limit_content_length(): # ============ FUNCTIONS ============ -def one(): - return 1 def allowed_file(filename): if not '.' 
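item_download() switches send_file() from attachment_filename to download_name, the parameter name introduced in Flask 2.0 (the old one was deprecated there and dropped in later releases). Minimal pattern with a throwaway in-memory file:

from io import BytesIO

from flask import Flask, send_file

app = Flask(__name__)

@app.route('/object/item/download')
def item_download():
    data = BytesIO(b'item content goes here')   # AIL returns the item's raw content instead
    return send_file(data, download_name='example.gz',
                     mimetype='application/octet-stream', as_attachment=True)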
in filename: @@ -126,15 +123,14 @@ def date_to_str(date): def misp_create_event(distribution, threat_level_id, analysis, info, l_tags, publish, path): - paste = Paste.Paste(path) - source = path.split('/')[-6:] - source = '/'.join(source)[:-3] + item = Item(path) + source = item.get_source() ail_uuid = r_serv_db.get('ail:uuid') - pseudofile = BytesIO(paste.get_p_content().encode()) + pseudofile = BytesIO(item.get_content(binary=True)) - temp = paste._get_p_duplicate() + temp = item.get_duplicates() - #beautifier + # beautifier if not temp: temp = '' @@ -181,7 +177,7 @@ def misp_create_event(distribution, threat_level_id, analysis, info, l_tags, pub leak_obj = MISPObject(obj_name) leak_obj.add_attribute('sensor', value=ail_uuid, type="text") leak_obj.add_attribute('origin', value=source, type='text') - leak_obj.add_attribute('last-seen', value=date_to_str(paste.p_date), type='datetime') + leak_obj.add_attribute('last-seen', value=date_to_str(item.get_date()), type='datetime') leak_obj.add_attribute('raw-data', value=source, data=pseudofile, type="attachment") if p_duplicate_number > 0: @@ -192,7 +188,8 @@ def misp_create_event(distribution, threat_level_id, analysis, info, l_tags, pub templateID = [x['ObjectTemplate']['id'] for x in pymisp.get_object_templates_list()['response'] if x['ObjectTemplate']['name'] == obj_name][0] except IndexError: valid_types = ", ".join([x['ObjectTemplate']['name'] for x in pymisp.get_object_templates_list()]) - print ("Template for type {} not found! Valid types are: {%s}".format(obj_name, valid_types)) + print (f"Template for type {obj_name} not found! Valid types are: {valid_types}") + return False r = pymisp.add_object(eventid, templateID, leak_obj) if 'errors' in r: print(r) @@ -206,7 +203,7 @@ def hive_create_case(hive_tlp, threat_level, hive_description, hive_case_title, ail_uuid = r_serv_db.get('ail:uuid') source = path.split('/')[-6:] source = '/'.join(source)[:-3] - # get paste date + # get item date var = path.split('/') last_seen = "{0}-{1}-{2}".format(var[-4], var[-3], var[-2]) diff --git a/var/www/modules/hiddenServices/Flask_hiddenServices.py b/var/www/modules/hiddenServices/Flask_hiddenServices.py index 5d85e6c7..960f3ae0 100644 --- a/var/www/modules/hiddenServices/Flask_hiddenServices.py +++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py @@ -41,45 +41,6 @@ dic_type_name={'onion':'Onion', 'regular':'Website'} # ============ FUNCTIONS ============ -def one(): - return 1 - -def get_date_range(num_day): - curr_date = datetime.date.today() - date = Date( '{}{}{}'.format(str(curr_date.year), str(curr_date.month).zfill(2), str(curr_date.day).zfill(2)) ) - date_list = [] - - for i in range(0, num_day): - date_list.append(date.substract_day(i)) - - return list(reversed(date_list)) - -def substract_date(date_from, date_to): - date_from = datetime.date(int(date_from[0:4]), int(date_from[4:6]), int(date_from[6:8])) - date_to = datetime.date(int(date_to[0:4]), int(date_to[4:6]), int(date_to[6:8])) - delta = date_to - date_from # timedelta - l_date = [] - for i in range(delta.days + 1): - date = date_from + datetime.timedelta(i) - l_date.append( date.strftime('%Y%m%d') ) - return l_date - -def unpack_paste_tags(p_tags): - l_tags = [] - for tag in p_tags: - complete_tag = tag - tag = tag.split('=') - if len(tag) > 1: - if tag[1] != '': - tag = tag[1][1:-1] - # no value - else: - tag = tag[0][1:-1] - # use for custom tags - else: - tag = tag[0] - l_tags.append( (tag, complete_tag) ) - return l_tags def is_valid_domain(domain): faup.decode(domain) @@ 
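misp_create_event() now reads everything from the Item object (source, date, binary content, duplicates) before filling the ail-leak MISP object. Building such an object offline with pymisp, without contacting a MISP instance (values are placeholders):

from io import BytesIO

from pymisp import MISPObject

source = 'submitted/2022/01/01/example.gz'     # item.get_source()
pseudofile = BytesIO(b'leaked data')           # BytesIO(item.get_content(binary=True))

leak_obj = MISPObject('ail-leak')
leak_obj.add_attribute('origin', value=source, type='text')
leak_obj.add_attribute('last-seen', value='2022-01-01', type='datetime')
leak_obj.add_attribute('raw-data', value=source, data=pseudofile, type='attachment')
print(leak_obj.to_json())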
-89,26 +50,6 @@ def is_valid_domain(domain): else: return False -def is_valid_service_type(service_type): - accepted_service = ['onion', 'regular'] - if service_type in accepted_service: - return True - else: - return False - -def get_onion_status(domain, date): - if r_serv_onion.sismember('onion_up:'+date , domain): - return True - else: - return False - -def get_domain_type(domain): - type_id = domain.split(':')[-1] - if type_id == 'onion': - return 'onion' - else: - return 'regular' - def get_type_domain(domain): if domain is None: type = 'regular' @@ -133,18 +74,6 @@ def get_domain_from_url(url): def get_last_domains_crawled(type): # DONE return r_serv_onion.lrange('last_{}'.format(type), 0 ,-1) -def get_nb_domains_inqueue(type): - nb = r_serv_onion.scard('{}_crawler_queue'.format(type)) - nb += r_serv_onion.scard('{}_crawler_priority_queue'.format(type)) - return nb - -def get_stats_last_crawled_domains(type, date): - statDomains = {} - statDomains['domains_up'] = r_serv_onion.scard('{}_up:{}'.format(type, date)) - statDomains['domains_down'] = r_serv_onion.scard('{}_down:{}'.format(type, date)) - statDomains['total'] = statDomains['domains_up'] + statDomains['domains_down'] - statDomains['domains_queue'] = get_nb_domains_inqueue(type) - return statDomains def get_last_crawled_domains_metadata(list_domains_crawled, date, type=None, auto_mode=False): list_crawled_metadata = [] @@ -201,22 +130,6 @@ def get_last_crawled_domains_metadata(list_domains_crawled, date, type=None, aut list_crawled_metadata.append(metadata_domain) return list_crawled_metadata -def get_crawler_splash_status(type): - crawler_metadata = [] - all_crawlers = r_cache.smembers('{}_crawlers'.format(type)) - for crawler in all_crawlers: - crawling_domain = r_cache.hget('metadata_crawler:{}'.format(crawler), 'crawling_domain') - started_time = r_cache.hget('metadata_crawler:{}'.format(crawler), 'started_time') - status_info = r_cache.hget('metadata_crawler:{}'.format(crawler), 'status') - crawler_info = '{} - {}'.format(crawler, started_time) - if status_info=='Waiting' or status_info=='Crawling': - status=True - else: - status=False - crawler_metadata.append({'crawler_info': crawler_info, 'crawling_domain': crawling_domain, 'status_info': status_info, 'status': status}) - - return crawler_metadata - def delete_auto_crawler(url): domain = get_domain_from_url(url) type = get_type_domain(domain) @@ -231,67 +144,6 @@ def delete_auto_crawler(url): # ============= ROUTES ============== -# @hiddenServices.route("/crawlers/", methods=['GET']) -# @login_required -# @login_read_only -# def dashboard(): -# crawler_metadata_onion = get_crawler_splash_status('onion') -# crawler_metadata_regular = get_crawler_splash_status('regular') -# -# now = datetime.datetime.now() -# date = now.strftime("%Y%m%d") -# statDomains_onion = get_stats_last_crawled_domains('onion', date) -# statDomains_regular = get_stats_last_crawled_domains('regular', date) -# -# return render_template("Crawler_dashboard.html", crawler_metadata_onion = crawler_metadata_onion, -# date=date, -# crawler_metadata_regular=crawler_metadata_regular, -# statDomains_onion=statDomains_onion, statDomains_regular=statDomains_regular) - -@hiddenServices.route("/crawlers/crawler_splash_onion", methods=['GET']) -@login_required -@login_read_only -def crawler_splash_onion(): - type = 'onion' - last_onions = get_last_domains_crawled(type) - list_onion = [] - - now = datetime.datetime.now() - date = now.strftime("%Y%m%d") - statDomains = get_stats_last_crawled_domains(type, date) - - 
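The per-blueprint date helpers removed here (get_date_range, substract_date) are superseded by the shared packages.Date module; their behaviour is plain datetime arithmetic, e.g.:

from datetime import date, timedelta

def substract_date(date_from, date_to):
    """Return every YYYYMMDD day between the two bounds, inclusive."""
    d_from = date(int(date_from[0:4]), int(date_from[4:6]), int(date_from[6:8]))
    d_to = date(int(date_to[0:4]), int(date_to[4:6]), int(date_to[6:8]))
    return [(d_from + timedelta(days=i)).strftime('%Y%m%d')
            for i in range((d_to - d_from).days + 1)]

print(substract_date('20220101', '20220104'))
# ['20220101', '20220102', '20220103', '20220104']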
list_onion = get_last_crawled_domains_metadata(last_onions, date, type=type) - crawler_metadata = get_crawler_splash_status(type) - - date_string = '{}-{}-{}'.format(date[0:4], date[4:6], date[6:8]) - return render_template("Crawler_Splash_onion.html", last_onions=list_onion, statDomains=statDomains, - crawler_metadata=crawler_metadata, date_from=date_string, date_to=date_string) - -@hiddenServices.route("/crawlers/Crawler_Splash_last_by_type", methods=['GET']) -@login_required -@login_read_only -def Crawler_Splash_last_by_type(): - type = request.args.get('type') - # verify user input - if type not in list_types: - type = 'onion' - type_name = dic_type_name[type] - list_domains = [] - - now = datetime.datetime.now() - date = now.strftime("%Y%m%d") - date_string = '{}-{}-{}'.format(date[0:4], date[4:6], date[6:8]) - - statDomains = get_stats_last_crawled_domains(type, date) - - list_domains = get_last_crawled_domains_metadata(get_last_domains_crawled(type), date, type=type) - crawler_metadata = get_crawler_splash_status(type) - - return render_template("Crawler_Splash_last_by_type.html", type=type, type_name=type_name, - is_manager_connected=crawlers.get_splash_manager_connection_metadata(), - last_domains=list_domains, statDomains=statDomains, - crawler_metadata=crawler_metadata, date_from=date_string, date_to=date_string) - @hiddenServices.route("/crawlers/blacklisted_domains", methods=['GET']) @login_required @login_read_only @@ -424,7 +276,7 @@ def auto_crawler(): return render_template("Crawler_auto.html", page=page, nb_page_max=nb_page_max, last_domains=last_domains, - is_manager_connected=crawlers.get_splash_manager_connection_metadata(), + is_manager_connected=crawlers.get_lacus_connection_metadata(), auto_crawler_domain_onions_metadata=auto_crawler_domain_onions_metadata, auto_crawler_domain_regular_metadata=auto_crawler_domain_regular_metadata) @@ -439,285 +291,6 @@ def remove_auto_crawler(): delete_auto_crawler(url) return redirect(url_for('hiddenServices.auto_crawler', page=page)) -# # TODO: refractor -@hiddenServices.route("/hiddenServices/last_crawled_domains_with_stats_json", methods=['GET']) -@login_required -@login_read_only -def last_crawled_domains_with_stats_json(): - last_onions = r_serv_onion.lrange('last_onion', 0 ,-1) - list_onion = [] - - now = datetime.datetime.now() - date = '{}{}{}'.format(now.strftime("%Y"), now.strftime("%m"), now.strftime("%d")) - statDomains = {} - statDomains['domains_up'] = r_serv_onion.scard('onion_up:{}'.format(date)) - statDomains['domains_down'] = r_serv_onion.scard('onion_down:{}'.format(date)) - statDomains['total'] = statDomains['domains_up'] + statDomains['domains_down'] - statDomains['domains_queue'] = r_serv_onion.scard('onion_domain_crawler_queue') - - for onion in last_onions: - metadata_onion = {} - metadata_onion['domain'] = onion - metadata_onion['last_check'] = r_serv_onion.hget('onion_metadata:{}'.format(onion), 'last_check') - if metadata_onion['last_check'] is None: - metadata_onion['last_check'] = '********' - metadata_onion['first_seen'] = r_serv_onion.hget('onion_metadata:{}'.format(onion), 'first_seen') - if metadata_onion['first_seen'] is None: - metadata_onion['first_seen'] = '********' - if get_onion_status(onion, metadata_onion['last_check']): - metadata_onion['status_text'] = 'UP' - metadata_onion['status_color'] = 'Green' - metadata_onion['status_icon'] = 'fa-check-circle' - else: - metadata_onion['status_text'] = 'DOWN' - metadata_onion['status_color'] = 'Red' - metadata_onion['status_icon'] = 
'fa-times-circle' - list_onion.append(metadata_onion) - - crawler_metadata=[] - all_onion_crawler = r_cache.smembers('all_crawler:onion') - for crawler in all_onion_crawler: - crawling_domain = r_cache.hget('metadata_crawler:{}'.format(crawler), 'crawling_domain') - started_time = r_cache.hget('metadata_crawler:{}'.format(crawler), 'started_time') - status_info = r_cache.hget('metadata_crawler:{}'.format(crawler), 'status') - crawler_info = '{} - {}'.format(crawler, started_time) - if status_info=='Waiting' or status_info=='Crawling': - status=True - else: - status=False - crawler_metadata.append({'crawler_info': crawler_info, 'crawling_domain': crawling_domain, 'status_info': status_info, 'status': status}) - - date_string = '{}-{}-{}'.format(date[0:4], date[4:6], date[6:8]) - - return jsonify({'last_onions': list_onion, 'statDomains': statDomains, 'crawler_metadata':crawler_metadata}) - -@hiddenServices.route("/hiddenServices/get_onions_by_daterange", methods=['POST']) -@login_required -@login_read_only -def get_onions_by_daterange(): - date_from = request.form.get('date_from') - date_to = request.form.get('date_to') - service_type = request.form.get('service_type') - domains_up = request.form.get('domains_up') - domains_down = request.form.get('domains_down') - domains_tags = request.form.get('domains_tags') - - return redirect(url_for('hiddenServices.show_domains_by_daterange', date_from=date_from, date_to=date_to, service_type=service_type, domains_up=domains_up, domains_down=domains_down, domains_tags=domains_tags)) - -@hiddenServices.route("/hiddenServices/show_domains_by_daterange", methods=['GET']) -@login_required -@login_read_only -def show_domains_by_daterange(): - date_from = request.args.get('date_from') - date_to = request.args.get('date_to') - service_type = request.args.get('service_type') - domains_up = request.args.get('domains_up') - domains_down = request.args.get('domains_down') - domains_tags = request.args.get('domains_tags') - - # incorrect service type - if not is_valid_service_type(service_type): - service_type = 'onion' - - type_name = dic_type_name[service_type] - - date_range = [] - if date_from is not None and date_to is not None: - #change format - try: - if len(date_from) != 8: - date_from = date_from[0:4] + date_from[5:7] + date_from[8:10] - date_to = date_to[0:4] + date_to[5:7] + date_to[8:10] - date_range = substract_date(date_from, date_to) - except: - pass - - if not date_range: - date_range.append(datetime.date.today().strftime("%Y%m%d")) - date_from = date_range[0][0:4] + '-' + date_range[0][4:6] + '-' + date_range[0][6:8] - date_to = date_from - - else: - date_from = date_from[0:4] + '-' + date_from[4:6] + '-' + date_from[6:8] - date_to = date_to[0:4] + '-' + date_to[4:6] + '-' + date_to[6:8] - - statDomains = {} - statDomains['domains_up'] = 0 - statDomains['domains_down'] = 0 - statDomains['total'] = 0 - statDomains['domains_queue'] = get_nb_domains_inqueue(service_type) - - domains_by_day = {} - domain_metadata = {} - stats_by_date = {} - for date in date_range: - stats_by_date[date] = {} - stats_by_date[date]['domain_up'] = 0 - stats_by_date[date]['domain_down'] = 0 - if domains_up: - domains_up = True - domains_by_day[date] = list(r_serv_onion.smembers('{}_up:{}'.format(service_type, date))) - for domain in domains_by_day[date]: - h = HiddenServices(domain, 'onion') - domain_metadata[domain] = {} - if domains_tags: - domains_tags = True - domain_metadata[domain]['tags'] = h.get_domain_tags(update=True) - - domain_metadata[domain]['last_check'] 
= r_serv_onion.hget('{}_metadata:{}'.format(service_type, domain), 'last_check') - if domain_metadata[domain]['last_check'] is None: - domain_metadata[domain]['last_check'] = '********' - domain_metadata[domain]['first_seen'] = r_serv_onion.hget('{}_metadata:{}'.format(service_type, domain), 'first_seen') - if domain_metadata[domain]['first_seen'] is None: - domain_metadata[domain]['first_seen'] = '********' - domain_metadata[domain]['status_text'] = 'UP' - domain_metadata[domain]['status_color'] = 'Green' - domain_metadata[domain]['status_icon'] = 'fa-check-circle' - statDomains['domains_up'] += 1 - stats_by_date[date]['domain_up'] += 1 - - if domains_down: - domains_down = True - domains_by_day_down = list(r_serv_onion.smembers('{}_down:{}'.format(service_type, date))) - if domains_up: - domains_by_day[date].extend(domains_by_day_down) - else: - domains_by_day[date] = domains_by_day_down - for domain in domains_by_day_down: - #h = HiddenServices(onion_domain, 'onion') - domain_metadata[domain] = {} - #domain_metadata[domain]['tags'] = h.get_domain_tags() - - domain_metadata[domain]['last_check'] = r_serv_onion.hget('{}_metadata:{}'.format(service_type, domain), 'last_check') - if domain_metadata[domain]['last_check'] is None: - domain_metadata[domain]['last_check'] = '********' - domain_metadata[domain]['first_seen'] = r_serv_onion.hget('{}_metadata:{}'.format(service_type, domain), 'first_seen') - if domain_metadata[domain]['first_seen'] is None: - domain_metadata[domain]['first_seen'] = '********' - - domain_metadata[domain]['status_text'] = 'DOWN' - domain_metadata[domain]['status_color'] = 'Red' - domain_metadata[domain]['status_icon'] = 'fa-times-circle' - statDomains['domains_down'] += 1 - stats_by_date[date]['domain_down'] += 1 - - statDomains['total'] = statDomains['domains_up'] + statDomains['domains_down'] - - return render_template("domains.html", date_range=date_range, domains_by_day=domains_by_day, - statDomains=statDomains, type_name=type_name, - domain_metadata=domain_metadata, - stats_by_date=stats_by_date, - date_from=date_from, date_to=date_to, domains_up=domains_up, domains_down=domains_down, - domains_tags=domains_tags, type=service_type, bootstrap_label=bootstrap_label) - -@hiddenServices.route("/crawlers/download_domain", methods=['GET']) -@login_required -@login_read_only -@no_cache -def download_domain(): - domain = request.args.get('domain') - epoch = request.args.get('epoch') - try: - epoch = int(epoch) - except: - epoch = None - port = request.args.get('port') - faup.decode(domain) - unpack_url = faup.get() - - ## TODO: # FIXME: remove me - try: - domain = unpack_url['domain'].decode() - except: - domain = unpack_url['domain'] - - if not port: - if unpack_url['port']: - try: - port = unpack_url['port'].decode() - except: - port = unpack_url['port'] - else: - port = 80 - try: - port = int(port) - except: - port = 80 - type = get_type_domain(domain) - if domain is None or not r_serv_onion.exists('{}_metadata:{}'.format(type, domain)): - return '404' - # # TODO: FIXME return 404 - - origin_paste = r_serv_onion.hget('{}_metadata:{}'.format(type, domain), 'paste_parent') - - h = HiddenServices(domain, type, port=port) - item_core = h.get_domain_crawled_core_item(epoch=epoch) - if item_core: - l_pastes = h.get_last_crawled_pastes(item_root=item_core['root_item']) - else: - l_pastes = [] - #dict_links = h.get_all_links(l_pastes) - - zip_file = h.create_domain_basic_archive(l_pastes) - filename = domain + '.zip' - - return send_file(zip_file, 
attachment_filename=filename, as_attachment=True) - - -@hiddenServices.route("/hiddenServices/onion_son", methods=['GET']) -@login_required -@login_analyst -def onion_son(): - onion_domain = request.args.get('onion_domain') - - h = HiddenServices(onion_domain, 'onion') - l_pastes = h.get_last_crawled_pastes() - l_son = h.get_domain_son(l_pastes) - return 'l_son' - -# ============= JSON ============== -@hiddenServices.route("/hiddenServices/domain_crawled_7days_json", methods=['GET']) -@login_required -@login_read_only -def domain_crawled_7days_json(): - type = 'onion' - ## TODO: # FIXME: 404 error - - date_range = get_date_range(7) - json_domain_stats = [] - #try: - for date in date_range: - nb_domain_up = r_serv_onion.scard('{}_up:{}'.format(type, date)) - nb_domain_down = r_serv_onion.scard('{}_up:{}'.format(type, date)) - date = date[0:4] + '-' + date[4:6] + '-' + date[6:8] - json_domain_stats.append({ 'date': date, 'value': int( nb_domain_up ), 'nb_domain_down': int( nb_domain_down )}) - #except: - #return jsonify() - - return jsonify(json_domain_stats) - -@hiddenServices.route('/hiddenServices/domain_crawled_by_type_json') -@login_required -@login_read_only -def domain_crawled_by_type_json(): - current_date = request.args.get('date') - type = request.args.get('type') - if type in list_types: - - num_day_type = 7 - date_range = get_date_range(num_day_type) - range_decoder = [] - for date in date_range: - day_crawled = {} - day_crawled['date']= date[0:4] + '-' + date[4:6] + '-' + date[6:8] - day_crawled['UP']= nb_domain_up = r_serv_onion.scard('{}_up:{}'.format(type, date)) - day_crawled['DOWN']= nb_domain_up = r_serv_onion.scard('{}_down:{}'.format(type, date)) - range_decoder.append(day_crawled) - - return jsonify(range_decoder) - - else: - return jsonify('Incorrect Type') # ========= REGISTRATION ========= app.register_blueprint(hiddenServices, url_prefix=baseUrl) diff --git a/var/www/modules/hiddenServices/templates/Crawler_Splash_onion.html b/var/www/modules/hiddenServices/templates/Crawler_Splash_onion.html deleted file mode 100644 index 1e1a1b7a..00000000 --- a/var/www/modules/hiddenServices/templates/Crawler_Splash_onion.html +++ /dev/null @@ -1,476 +0,0 @@ - - - - - AIL-Framework - - - - - - - - - - - - - - - - - - - - - {% include 'nav_bar.html' %} - -
-
- - {% include 'crawler/menu_sidebar.html' %} - -
- -
-
- -
- - - - - - - - - - - {% for metadata_onion in last_onions %} - - - - - - - {% endfor %} - -
DomainFirst SeenLast CheckStatus
{{ metadata_onion['domain'] }}{{'{}/{}/{}'.format(metadata_onion['first_seen'][0:4], metadata_onion['first_seen'][4:6], metadata_onion['first_seen'][6:8])}}{{'{}/{}/{}'.format(metadata_onion['last_check'][0:4], metadata_onion['last_check'][4:6], metadata_onion['last_check'][6:8])}}
- - {{metadata_onion['status_text']}} -
-
-
- - - - - -
-
- -
-
-
-
- {{ statDomains['domains_up'] }} UP - {{ statDomains['domains_down'] }} DOWN -
-
- {{ statDomains['total'] }} Crawled - {{ statDomains['domains_queue'] }} Queue -
-
-
-
-
Select domains by date range :
-

Some quick example text to build on the card title and make up the bulk of the card's content.

-
-
-
-
-
- -
-
-
- -
-
-
-
- - -
-
- - -
-
- - -
-
-
- - -
-
- -
-
- -
-
- Crawlers Status -
-
- - - {% for crawler in crawler_metadata %} - - - - - - {% endfor %} - -
- {{crawler['crawler_info']}} - - {{crawler['crawling_domain']}} - - {{crawler['status_info']}} -
-
-
-
-
- -
- -
-
- - - - - - - - - - - diff --git a/var/www/modules/search/Flask_search.py b/var/www/modules/search/Flask_search.py index cbc1f633..361da03f 100644 --- a/var/www/modules/search/Flask_search.py +++ b/var/www/modules/search/Flask_search.py @@ -4,7 +4,6 @@ ''' Flask functions and routes for the trending modules page ''' -import redis import json import os import datetime @@ -14,11 +13,12 @@ from flask import Flask, render_template, jsonify, request, Blueprint from Role_Manager import login_admin, login_analyst from flask_login import login_required -import Paste from whoosh import index from whoosh.fields import Schema, TEXT, ID from whoosh.qparser import QueryParser +from lib.objects.Items import Item + import time # ============ VARIABLES ============ @@ -27,7 +27,6 @@ import Flask_config app = Flask_config.app config_loader = Flask_config.config_loader baseUrl = Flask_config.baseUrl -r_serv_metadata = Flask_config.r_serv_metadata max_preview_char = Flask_config.max_preview_char max_preview_modal = Flask_config.max_preview_modal bootstrap_label = Flask_config.bootstrap_label @@ -128,15 +127,14 @@ def search(): for x in results: r.append(x.items()[0][1].replace(PASTES_FOLDER, '', 1)) path = x.items()[0][1].replace(PASTES_FOLDER, '', 1) - paste = Paste.Paste(path) - content = paste.get_p_content() + item = Item(path) + content = item.get_content() content_range = max_preview_char if len(content)>max_preview_char else len(content)-1 c.append(content[0:content_range]) - curr_date = str(paste._get_p_date()) - curr_date = curr_date[0:4]+'/'+curr_date[4:6]+'/'+curr_date[6:] + curr_date = item.get_date(separator=True) paste_date.append(curr_date) - paste_size.append(paste._get_p_size()) - p_tags = r_serv_metadata.smembers('tag:'+path) + paste_size.append(item.get_size()) + p_tags = item.get_tags() l_tags = [] for tag in p_tags: complete_tag = tag @@ -205,15 +203,14 @@ def get_more_search_result(): path = x.items()[0][1] path = path.replace(PASTES_FOLDER, '', 1) path_array.append(path) - paste = Paste.Paste(path) - content = paste.get_p_content() + item = Item(path) + content = item.get_content() content_range = max_preview_char if len(content)>max_preview_char else len(content)-1 preview_array.append(content[0:content_range]) - curr_date = str(paste._get_p_date()) - curr_date = curr_date[0:4]+'/'+curr_date[4:6]+'/'+curr_date[6:] + curr_date = item.get_date(separator=True) date_array.append(curr_date) - size_array.append(paste._get_p_size()) - p_tags = r_serv_metadata.smembers('tag:'+path) + size_array.append(item.get_size()) + p_tags = item.get_tags() l_tags = [] for tag in p_tags: complete_tag = tag diff --git a/var/www/modules/showpaste/Flask_showpaste.py b/var/www/modules/showpaste/Flask_showpaste.py index 8bdf0984..6589896e 100644 --- a/var/www/modules/showpaste/Flask_showpaste.py +++ b/var/www/modules/showpaste/Flask_showpaste.py @@ -4,7 +4,6 @@ ''' Flask functions and routes for the trending modules page ''' -import redis import json import os import sys @@ -15,17 +14,14 @@ from Role_Manager import login_admin, login_analyst, login_read_only, no_cache from flask_login import login_required import difflib -import ssdeep import Paste import requests sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/')) -import Tag import Item sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/')) -import Domain # ============ VARIABLES ============ import Flask_config @@ -52,214 +48,11 @@ showsavedpastes = Blueprint('showsavedpastes', __name__, template_folder='templa def 
get_item_screenshot_path(item): screenshot = r_serv_metadata.hget('paste_metadata:{}'.format(item), 'screenshot') if screenshot: - screenshot = os.path.join(screenshot[0:2], screenshot[2:4], screenshot[4:6], screenshot[6:8], screenshot[8:10], screenshot[10:12], screenshot[12:]) + screenshot = os.path.join(screenshot[0:2], screenshot[2:4], screenshot[4:6], screenshot[6:8], screenshot[8:10], screenshot[10:12], screenshot[12:]) else: screenshot = '' return screenshot -def showpaste(content_range, requested_path): - if PASTES_FOLDER not in requested_path: - # remove full path - requested_path_full = os.path.join(requested_path, PASTES_FOLDER) - else: - requested_path_full = requested_path - requested_path = requested_path.replace(PASTES_FOLDER, '', 1) - - # escape directory transversal - if os.path.commonprefix((requested_path_full,PASTES_FOLDER)) != PASTES_FOLDER: - return 'path transversal detected' - - vt_enabled = Flask_config.vt_enabled - - try: - paste = Paste.Paste(requested_path) - except FileNotFoundError: - abort(404) - - p_date = str(paste._get_p_date()) - p_date = p_date[6:]+'/'+p_date[4:6]+'/'+p_date[0:4] - p_source = paste.p_source - p_encoding = paste._get_p_encoding() - p_language = 'None' - p_size = paste.p_size - p_mime = paste.p_mime - p_lineinfo = paste.get_lines_info() - p_content = paste.get_p_content() - p_duplicate_str_full_list = paste._get_p_duplicate() - - p_duplicate_full_list = [] - p_duplicate_list = [] - p_simil_list = [] - p_date_list = [] - p_hashtype_list = [] - - for dup_list in p_duplicate_str_full_list: - dup_list = dup_list[1:-1].replace('\'', '').replace(' ', '').split(',') - if dup_list[0] == "tlsh": - dup_list[2] = 100 - int(dup_list[2]) - else: - dup_list[2] = int(dup_list[2]) - p_duplicate_full_list.append(dup_list) - - #p_duplicate_full_list.sort(lambda x,y: cmp(x[2], y[2]), reverse=True) - - # Combine multiple duplicate paste name and format for display - new_dup_list = [] - dup_list_removed = [] - for dup_list_index in range(0, len(p_duplicate_full_list)): - if dup_list_index in dup_list_removed: - continue - indices = [i for i, x in enumerate(p_duplicate_full_list) if x[1] == p_duplicate_full_list[dup_list_index][1]] - hash_types = [] - comp_vals = [] - for i in indices: - hash_types.append(p_duplicate_full_list[i][0]) - comp_vals.append(p_duplicate_full_list[i][2]) - dup_list_removed.append(i) - - #hash_types = str(hash_types).replace("[","").replace("]","") if len(hash_types)==1 else str(hash_types) - #comp_vals = str(comp_vals).replace("[","").replace("]","") if len(comp_vals)==1 else str(comp_vals) - - if len(p_duplicate_full_list[dup_list_index]) > 3: - try: - date_paste = str(int(p_duplicate_full_list[dup_list_index][3])) - date_paste = date_paste[0:4]+"-"+date_paste[4:6]+"-"+date_paste[6:8] - except ValueError: - date_paste = str(p_duplicate_full_list[dup_list_index][3]) - else: - date_paste = "No date available" - new_dup_list.append([hash_types, p_duplicate_full_list[dup_list_index][1], comp_vals, date_paste]) - - # Create the list to pass to the webpage - for dup_list in new_dup_list: - hash_type, path, simil_percent, date_paste = dup_list - p_duplicate_list.append(path) - p_simil_list.append(simil_percent) - p_hashtype_list.append(hash_type) - p_date_list.append(date_paste) - - if content_range != 0: - p_content = p_content[0:content_range] - - #active taxonomies - active_taxonomies = r_serv_tags.smembers('active_taxonomies') - - l_tags = r_serv_metadata.smembers('tag:'+requested_path) - tags_safe = Tag.is_tags_safe(l_tags) - - #active 
galaxies - active_galaxies = r_serv_tags.smembers('active_galaxies') - - list_tags = [] - - for tag in l_tags: - if(tag[9:28] == 'automatic-detection'): - automatic = True - else: - automatic = False - - if r_serv_statistics.sismember('tp:'+tag, requested_path): - tag_status_tp = True - else: - tag_status_tp = False - if r_serv_statistics.sismember('fp:'+tag, requested_path): - tag_status_fp = True - else: - tag_status_fp = False - - list_tags.append( (tag, automatic, tag_status_tp, tag_status_fp) ) - - l_64 = [] - # load hash files - if r_serv_metadata.scard('hash_paste:'+requested_path) > 0: - set_b64 = r_serv_metadata.smembers('hash_paste:'+requested_path) - for hash in set_b64: - nb_in_file = r_serv_metadata.zscore('nb_seen_hash:'+hash, requested_path) - # item list not updated - if nb_in_file is None: - l_pastes = r_serv_metadata.zrange('nb_seen_hash:'+hash, 0, -1) - for paste_name in l_pastes: - # dynamic update - if PASTES_FOLDER in paste_name: - score = r_serv_metadata.zscore('nb_seen_hash:{}'.format(hash), paste_name) - r_serv_metadata.zrem('nb_seen_hash:{}'.format(hash), paste_name) - paste_name = paste_name.replace(PASTES_FOLDER, '', 1) - r_serv_metadata.zadd('nb_seen_hash:{}'.format(hash), score, paste_name) - nb_in_file = r_serv_metadata.zscore('nb_seen_hash:'+hash, requested_path) - nb_in_file = int(nb_in_file) - estimated_type = r_serv_metadata.hget('metadata_hash:'+hash, 'estimated_type') - file_type = estimated_type.split('/')[0] - # set file icon - if file_type == 'application': - file_icon = 'fa-file-o ' - elif file_type == 'audio': - file_icon = 'fa-file-video-o ' - elif file_type == 'image': - file_icon = 'fa-file-image-o' - elif file_type == 'text': - file_icon = 'fa-file-text-o' - else: - file_icon = 'fa-file' - saved_path = r_serv_metadata.hget('metadata_hash:'+hash, 'saved_path') - if r_serv_metadata.hexists('metadata_hash:'+hash, 'vt_link'): - b64_vt = True - b64_vt_link = r_serv_metadata.hget('metadata_hash:'+hash, 'vt_link') - b64_vt_report = r_serv_metadata.hget('metadata_hash:'+hash, 'vt_report') - else: - b64_vt = False - b64_vt_link = '' - b64_vt_report = r_serv_metadata.hget('metadata_hash:'+hash, 'vt_report') - # hash never refreshed - if b64_vt_report is None: - b64_vt_report = '' - - l_64.append( (file_icon, estimated_type, hash, saved_path, nb_in_file, b64_vt, b64_vt_link, b64_vt_report) ) - - crawler_metadata = {} - if 'infoleak:submission="crawler"' in l_tags: - crawler_metadata['get_metadata'] = True - crawler_metadata['domain'] = r_serv_metadata.hget('paste_metadata:'+requested_path, 'domain') - crawler_metadata['domain'] = crawler_metadata['domain'].rsplit(':', 1)[0] - if tags_safe: - tags_safe = Tag.is_tags_safe(Domain.get_domain_tags(crawler_metadata['domain'])) - crawler_metadata['paste_father'] = r_serv_metadata.hget('paste_metadata:'+requested_path, 'father') - crawler_metadata['real_link'] = r_serv_metadata.hget('paste_metadata:'+requested_path,'real_link') - crawler_metadata['screenshot'] = get_item_screenshot_path(requested_path) - else: - crawler_metadata['get_metadata'] = False - - item_parent = Item.get_item_parent(requested_path) - - if Flask_config.pymisp is False: - misp = False - else: - misp = True - - if Flask_config.HiveApi is False: - hive = False - else: - hive = True - - misp_event = r_serv_metadata.get('misp_events:' + requested_path) - if misp_event is None: - misp_eventid = False - misp_url = '' - else: - misp_eventid = True - misp_url = misp_event_url + misp_event - - hive_case = r_serv_metadata.get('hive_cases:' + 
requested_path) - if hive_case is None: - hive_caseid = False - hive_url = '' - else: - hive_caseid = True - hive_url = hive_case_url.replace('id_here', hive_case) - - return render_template("show_saved_paste.html", date=p_date, bootstrap_label=bootstrap_label, active_taxonomies=active_taxonomies, active_galaxies=active_galaxies, list_tags=list_tags, source=p_source, encoding=p_encoding, language=p_language, size=p_size, mime=p_mime, lineinfo=p_lineinfo, content=p_content, initsize=len(p_content), duplicate_list = p_duplicate_list, simil_list = p_simil_list, hashtype_list = p_hashtype_list, date_list=p_date_list, - crawler_metadata=crawler_metadata, tags_safe=tags_safe, item_parent=item_parent, - l_64=l_64, vt_enabled=vt_enabled, misp=misp, hive=hive, misp_eventid=misp_eventid, misp_url=misp_url, hive_caseid=hive_caseid, hive_url=hive_url) - def get_item_basic_info(item): item_basic_info = {} item_basic_info['date'] = str(item.get_p_date()) @@ -286,7 +79,7 @@ def show_item_min(requested_path , content_range=0): else: relative_path = requested_path.replace(PASTES_FOLDER, '', 1) # remove old full path - #requested_path = requested_path.replace(PASTES_FOLDER, '') + # requested_path = requested_path.replace(PASTES_FOLDER, '') # escape directory transversal if os.path.commonprefix((os.path.realpath(requested_path),PASTES_FOLDER)) != PASTES_FOLDER: return 'path transversal detected' @@ -370,7 +163,7 @@ def show_item_min(requested_path , content_range=0): crawler_metadata['paste_father'] = r_serv_metadata.hget('paste_metadata:'+relative_path, 'father') crawler_metadata['real_link'] = r_serv_metadata.hget('paste_metadata:'+relative_path,'real_link') crawler_metadata['screenshot'] = get_item_screenshot_path(relative_path) - #crawler_metadata['har_file'] = Item.get_item_har(relative_path) + # crawler_metadata['har_file'] = Item.get_item_har(relative_path) else: crawler_metadata['get_metadata'] = False @@ -462,13 +255,6 @@ def showDiff(): def screenshot(filename): return send_from_directory(SCREENSHOT_FOLDER, filename+'.png', as_attachment=True) -# @showsavedpastes.route('/har/paste/') -# @login_required -# @login_read_only -# def har(filename): -# har_file = Item.get_item_har(filename) -# return jsonify(har_file) - @showsavedpastes.route('/send_file_to_vt/', methods=['POST']) @login_required @login_analyst diff --git a/var/www/modules/terms/Flask_terms.py b/var/www/modules/terms/Flask_terms.py index 223f991b..c92d22c2 100644 --- a/var/www/modules/terms/Flask_terms.py +++ b/var/www/modules/terms/Flask_terms.py @@ -15,14 +15,11 @@ from flask import Flask, render_template, jsonify, request, Blueprint, url_for, from Role_Manager import login_admin, login_analyst, login_user_no_api, login_read_only from flask_login import login_required, current_user - -import re -from pprint import pprint import Levenshtein # --------------------------------------------------------------- -import Paste +from lib.objects.Items import Item import Term # ============ VARIABLES ============ @@ -262,21 +259,21 @@ def credentials_tracker(): @login_required @login_user_no_api def credentials_management_query_paste(): - cred = request.args.get('cred') + cred = request.args.get('cred') allPath = request.json['allPath'] paste_info = [] for pathNum in allPath: path = r_serv_cred.hget(REDIS_KEY_ALL_PATH_SET_REV, pathNum) - paste = Paste.Paste(path) - p_date = str(paste._get_p_date()) - p_date = p_date[0:4]+'/'+p_date[4:6]+'/'+p_date[6:8] - p_source = paste.p_source - p_encoding = paste._get_p_encoding() - p_size = paste.p_size - 
p_mime = paste.p_mime - p_lineinfo = paste.get_lines_info() - p_content = paste.get_p_content() + item = Item(path) + p_date = item.get_date(separator=True) + p_source = item.get_source() + p_content = item.get_content() + p_encoding = item.get_mimetype() + p_size = item.get_size() + p_mime = p_encoding + lineinfo = item.get_meta_lines(content=p_content) + p_lineinfo = lineinfo['nb'], lineinfo['max_length'] if p_content != 0: p_content = p_content[0:400] paste_info.append({"path": path, "date": p_date, "source": p_source, "encoding": p_encoding, "size": p_size, "mime": p_mime, "lineinfo": p_lineinfo, "content": p_content}) diff --git a/var/www/templates/correlation/metadata_card_cryptocurrency.html b/var/www/templates/correlation/metadata_card_cryptocurrency.html index 967efa75..b0c39df6 100644 --- a/var/www/templates/correlation/metadata_card_cryptocurrency.html +++ b/var/www/templates/correlation/metadata_card_cryptocurrency.html @@ -51,7 +51,7 @@ Total sent Balance Inputs address seen in AIL - Ouputs address seen in AIL + Outputs address seen in AIL @@ -62,12 +62,12 @@ {{ dict_object["metadata_card"]["related_btc"]["final_balance"] }} {% for btc_addr in dict_object["metadata_card"]["related_btc"]["btc_in"] %} - {{ btc_addr }} + {{ btc_addr }} {% endfor %} {% for btc_addr in dict_object["metadata_card"]["related_btc"]["btc_out"] %} - {{ btc_addr }} + {{ btc_addr }} {% endfor %} @@ -75,7 +75,7 @@ {% else %} - Expand Bitcoin address + Expand Bitcoin address {% endif %} {% endif %} diff --git a/var/www/templates/correlation/metadata_card_cve.html b/var/www/templates/correlation/metadata_card_cve.html new file mode 100644 index 00000000..49e6a38f --- /dev/null +++ b/var/www/templates/correlation/metadata_card_cve.html @@ -0,0 +1,172 @@ + + + +{#{% with modal_add_tags=dict_object['metadata_card']['add_tags_modal']%}#} +{# {% include 'modals/add_tags.html' %}#} +{#{% endwith %}#} + +{% include 'modals/edit_tag.html' %} + +
+
+

{{ dict_object["correlation_id"] }} :

+
    +
  • +
    +
    + + + + + + + + + + + + + + + + + + +
    Object typeFirst seenLast seenNb seen
    {{ dict_object["object_type"] }} + + + + {{ dict_object["metadata_card"]["icon"]["icon_text"] }} + + + {{ dict_object["metadata"]['first_seen'] }}{{ dict_object["metadata"]['last_seen'] }}{{ dict_object["metadata"]['nb_seen'] }}
    +
    +
    +
    +
    +
    +
  • +{#
  • #} +{#
    #} +{#
    #} +{# Tags:#} +{# {% for tag in dict_object["metadata"]['tags'] %}#} +{# #} +{# {% endfor %}#} +{# #} +{#
    #} +{#
  • #} +
+ + + {% with obj_type='decoded', obj_id=dict_object['correlation_id'], obj_subtype='' %} + {% include 'modals/investigations_register_obj.html' %} + {% endwith %} + + +
+
+ + + + + + diff --git a/var/www/templates/correlation/metadata_card_paste.html b/var/www/templates/correlation/metadata_card_item.html similarity index 99% rename from var/www/templates/correlation/metadata_card_paste.html rename to var/www/templates/correlation/metadata_card_item.html index f929ca26..194ec200 100644 --- a/var/www/templates/correlation/metadata_card_paste.html +++ b/var/www/templates/correlation/metadata_card_item.html @@ -39,7 +39,7 @@ diff --git a/var/www/templates/correlation/show_correlation.html b/var/www/templates/correlation/show_correlation.html index b2e32cfe..c270c78d 100644 --- a/var/www/templates/correlation/show_correlation.html +++ b/var/www/templates/correlation/show_correlation.html @@ -99,12 +99,14 @@ {% include 'correlation/metadata_card_username.html' %} {% elif dict_object["object_type"] == "decoded" %} {% include 'correlation/metadata_card_decoded.html' %} + {% elif dict_object["object_type"] == "cve" %} + {% include 'correlation/metadata_card_cve.html' %} {% elif dict_object["object_type"] == "domain" %} {% include 'correlation/metadata_card_domain.html' %} {% elif dict_object["object_type"] == "screenshot" %} {% include 'correlation/metadata_card_screenshot.html' %} - {% elif dict_object["object_type"] == "paste" %} - {% include 'correlation/metadata_card_paste.html' %} + {% elif dict_object["object_type"] == "item" %} + {% include 'correlation/metadata_card_item.html' %} {% endif %}
@@ -146,82 +148,87 @@
    -
  • Select Correlation
  • -
  • +
  • Select Correlation
  • + +
  • - - - - + + + -
    - - -
    -
    - - -
    -
    - - -
    -
    - - -
    -
    - - -
    -
    - - -
    -
    - - -
    -
  • -
  • -
    - Union   -
    - - -
    -
    -
  • -
  • +
    + + +
    +
    + + +
    +
    + + +
    +
    + + +
    +
    + + +
    +
    + + +
    +
    + + +
    +
    + + +
    -
    - - -
    +
  • +
  • +
    + Union   +
    + + +
    +
    +
  • +
  • -
    - -
    +
    + + +
    - +
    + +
    -
  • + + +
    -
  • -
  • -

    Double click on a node to open this object

    - - - +

  • +
  • +

    Double click on a node to open this object

    + + + - Current Correlation
    -

    -
  • -
+ Current Correlation
+

+ +
@@ -236,7 +243,7 @@ - {% if dict_object["object_type"] in ["decoded", "pgp", "cryptocurrency"] %} + {% if dict_object["object_type"] in ["cve", "decoded", "pgp", "cryptocurrency"] %}
Graph @@ -257,14 +264,16 @@ var all_graph = {}; $(document).ready(function(){ $("#page-Decoded").addClass("active"); - all_graph.node_graph = create_graph("{{ url_for('correlation.graph_node_json') }}?correlation_id={{ dict_object["correlation_id"] }}&object_type={{ dict_object["object_type"] }}&mode={{ dict_object["mode"] }}&correlation_names={{ dict_object["correlation_names_str"] }}&correlation_objects={{ dict_object["correlation_objects_str"] }}&max_nodes={{dict_object["max_nodes"]}}{% if 'type_id' in dict_object["metadata"] %}&type_id={{ dict_object["metadata"]["type_id"] }}{% endif %}"); + all_graph.node_graph = create_graph("{{ url_for('correlation.graph_node_json') }}?id={{ dict_object["correlation_id"] }}&type={{ dict_object["object_type"] }}&mode={{ dict_object["mode"] }}&filter={{ dict_object["filter_str"] }}&max_nodes={{dict_object["max_nodes"]}}{% if 'type_id' in dict_object["metadata"] %}&subtype={{ dict_object["metadata"]["type_id"] }}{% endif %}"); {% if dict_object["object_type"] == "pgp" %} all_graph.line_chart = create_line_chart('graph_line', "{{ url_for('hashDecoded.pgpdump_graph_line_json') }}?type_id={{dict_object["metadata"]["type_id"]}}&key_id={{dict_object["correlation_id"]}}"); {% elif dict_object["object_type"] == "cryptocurrency" %} all_graph.line_chart = create_line_chart('graph_line', "{{ url_for('hashDecoded.cryptocurrency_graph_line_json') }}?type_id={{dict_object["metadata"]["type_id"]}}&key_id={{dict_object["correlation_id"]}}"); {% elif dict_object["object_type"] == "decoded" %} all_graph.line_chart = create_line_chart('graph_line', "{{ url_for('hashDecoded.hash_graph_line_json') }}?hash={{dict_object["correlation_id"]}}"); - {% endif %} + {% elif dict_object["object_type"] == "cve" %} + all_graph.line_chart = create_line_chart('graph_line', "{{ url_for('hashDecoded.hash_graph_line_json') }}?hash={{dict_object["correlation_id"]}}"); + {% endif %} all_graph.onResize(); }); diff --git a/var/www/templates/crawler/crawler_splash/crawler_manual.html b/var/www/templates/crawler/crawler_splash/crawler_manual.html index 510099a4..e0556f15 100644 --- a/var/www/templates/crawler/crawler_splash/crawler_manual.html +++ b/var/www/templates/crawler/crawler_splash/crawler_manual.html @@ -37,7 +37,7 @@
Crawl a Domain
-

Enter a domain and choose what kind of data you want.

+

Enter a URL or a domain and choose which options you want.

@@ -49,22 +49,22 @@
-
- + + {%for proxy in proxies%} + {%endfor%}
@@ -122,15 +122,16 @@ Depth Limit
-
-
-   -
- -
- Max Pages -
-
+{# TEMPORARY DISABLED #} +{#
#} +{#
#} +{#  #} +{#
#} +{# #} +{#
#} +{# Max Pages#} +{#
#} +{#
#}
@@ -204,10 +205,10 @@ function manual_crawler_input_controler() { function queue_type_selector_input_controler() { if($('#queue_type_selector').is(':checked')){ $("#div_crawler_queue_type").hide(); - $("#div_splash_name").show(); + $("#div_proxy_name").show(); }else{ $("#div_crawler_queue_type").show(); - $("#div_splash_name").hide(); + $("#div_proxy_name").hide(); } } diff --git a/var/www/templates/crawler/crawler_splash/dashboard_splash_crawler.html b/var/www/templates/crawler/crawler_splash/dashboard_crawler.html similarity index 50% rename from var/www/templates/crawler/crawler_splash/dashboard_splash_crawler.html rename to var/www/templates/crawler/crawler_splash/dashboard_crawler.html index 0a80d08c..19a23e92 100644 --- a/var/www/templates/crawler/crawler_splash/dashboard_splash_crawler.html +++ b/var/www/templates/crawler/crawler_splash/dashboard_crawler.html @@ -7,10 +7,13 @@ + + + @@ -33,15 +36,15 @@
@@ -51,15 +54,15 @@
@@ -69,23 +72,23 @@ - {% for splash_crawler in all_splash_crawler_status %} + {% for crawler in crawlers_status %} - {% endfor %} @@ -93,6 +96,9 @@
- {{splash_crawler['crawler_info']}} + {{crawler['start_time']}} - {%if splash_crawler['type']=='onion'%} + {%if crawler['type']=='onion'%} {%else%} {%endif%} - {{splash_crawler['crawling_domain']}} + {{crawler['domain']}} - {{splash_crawler['status_info']}} + + {{crawler['status']}}
{% include 'domains/block_domains_name_search.html' %} +
+ {% include 'crawler/show_domains_by_daterange.html' %} +

@@ -134,6 +140,32 @@ var to_refresh = false $(document).ready(function(){ $("#page-Crawler").addClass("active"); $("#nav_dashboard").addClass("active"); + $('#date-range-from').dateRangePicker({ + separator : ' to ', + getValue: function(){ + if ($('#date-range-from-input').val() && $('#date-range-to-input').val() ) + return $('#date-range-from-input').val() + ' to ' + $('#date-range-to-input').val(); + else + return ''; + }, + setValue: function(s,s1,s2){ + $('#date-range-from-input').val(s1); + $('#date-range-to-input').val(s2); + } + }); + $('#date-range-to').dateRangePicker({ + separator : ' to ', + getValue: function(){ + if ($('#date-range-from-input').val() && $('#date-range-to-input').val() ) + return $('#date-range-from-input').val() + ' to ' + $('#date-range-to-input').val(); + else + return ''; + }, + setValue: function(s,s1,s2){ + $('#date-range-from-input').val(s1); + $('#date-range-to-input').val(s2); + } + }); $( window ).on("focus", function() { to_refresh = true refresh_crawler_status(); @@ -144,6 +176,7 @@ $(document).ready(function(){ to_refresh = true refresh_crawler_status(); + }); function toggle_sidebar(){ @@ -165,21 +198,21 @@ function refresh_crawler_status(){ $.getJSON("{{ url_for('crawler_splash.crawler_dashboard_json') }}", function(data) { - $('#stat_onion_domain_up').text(data.splash_crawlers_latest_stats['onion']['domains_up']); - $('#stat_onion_domain_down').text(data.splash_crawlers_latest_stats['onion']['domains_down']); - $('#stat_onion_total').text(data.splash_crawlers_latest_stats['onion']['total']); - $('#stat_onion_queue').text(data.splash_crawlers_latest_stats['onion']['domains_queue']); + $('#stat_onion_domain_up').text(data.stats['onion']['up']); + $('#stat_onion_domain_down').text(data.stats['onion']['down']); + $('#stat_onion_total').text(data.stats['onion']['crawled']); + $('#stat_onion_queue').text(data.stats['onion']['queue']); - $('#stat_regular_domain_up').text(data.splash_crawlers_latest_stats['regular']['domains_up']); - $('#stat_regular_domain_down').text(data.splash_crawlers_latest_stats['regular']['domains_down']); - $('#stat_regular_total').text(data.splash_crawlers_latest_stats['regular']['total']); - $('#stat_regular_queue').text(data.splash_crawlers_latest_stats['regular']['domains_queue']); + $('#stat_web_domain_up').text(data.stats['web']['up']); + $('#stat_web_domain_down').text(data.stats['web']['down']); + $('#stat_web_total').text(data.stats['web']['crawled']); + $('#stat_web_queue').text(data.stats['web']['queue']); - if(data.all_splash_crawler_status.length!=0){ + if(data.crawlers_status.length!=0){ $("#tbody_crawler_onion_info").empty(); var tableRef = document.getElementById('tbody_crawler_onion_info'); - for (var i = 0; i < data.all_splash_crawler_status.length; i++) { - var crawler = data.all_splash_crawler_status[i]; + for (var i = 0; i < data.crawlers_status.length; i++) { + var crawler = data.crawlers_status[i]; var newRow = tableRef.insertRow(tableRef.rows.length); var text_color; var icon; @@ -198,16 +231,16 @@ function refresh_crawler_status(){ } var newCell = newRow.insertCell(0); - newCell.innerHTML = " "+crawler['crawler_info']+""; + newCell.innerHTML = " "+crawler['start_time']+""; var newCell = newRow.insertCell(1); newCell.innerHTML = ""; newCell = newRow.insertCell(2); - newCell.innerHTML = ""+crawler['crawling_domain']+""; + newCell.innerHTML = ""+crawler['domain']+""; newCell = newRow.insertCell(3); - newCell.innerHTML = "
"+crawler['status_info']+"
"; + newCell.innerHTML = "
"+crawler['status']+"
"; //$("#panel_crawler").show(); } diff --git a/var/www/templates/crawler/crawler_splash/domains_daterange.html b/var/www/templates/crawler/crawler_splash/domains_daterange.html new file mode 100644 index 00000000..2160609e --- /dev/null +++ b/var/www/templates/crawler/crawler_splash/domains_daterange.html @@ -0,0 +1,154 @@ + + + + + + AIL-Framework + + + + + + + + + + + + + + + + + + + + {% include 'nav_bar.html' %} + +
+
+ + {% include 'crawler/menu_sidebar.html' %} + +
+ + {% include 'crawler/show_domains_by_daterange.html' %} + + + {% for date in dict_domains %} +
+
+

{{'{}/{}/{}'.format(date[0:4], date[4:6], date[6:8])}}

+
+
+ + + + + + + + + + + + {% for dict_domain in dict_domains[date] %} + + + + + + + {% endfor %} + +
DomainFirst SeenLast CheckStatus
+ {{ dict_domain['domain'] }} +
+ {% for tag in dict_domain['tags'] %} + + {{ tag }} + + {% endfor %} +
+
{{dict_domain['first_seen']}}{{dict_domain['last_check']}} + {% if dict_domain['status'] %} +
+ UP +
+ {% else %} +
+ DOWN +
+ {% endif %} +
+ +
+
+ + {% endfor %} + +
+
+ +
+ + + + + + diff --git a/var/www/templates/crawler/crawler_splash/last_crawled.html b/var/www/templates/crawler/crawler_splash/last_crawled.html new file mode 100644 index 00000000..73599530 --- /dev/null +++ b/var/www/templates/crawler/crawler_splash/last_crawled.html @@ -0,0 +1,338 @@ + + + + + AIL-Framework + + + + + + + + + + + + + + + + + + + + + {% include 'nav_bar.html' %} + +
+
+ + {% include 'crawler/menu_sidebar.html' %} + +
+ + {% include 'crawler/crawler_disabled.html' %} + +
+
+
+ + + + + + + + + + + {% for domain in domains %} + + + + + + + {% endfor %} + +
DomainFirst SeenLast CheckStatus
{{ domain['domain'] }}{{domain['first_seen']}}{{domain['last_check']}} + {% if domain['status_epoch'] %} +
+ UP +
+ {% else %} +
+ DOWN +
+ {% endif %} + +
+
+ + + + + +
+
+ {% include 'crawler/show_domains_by_daterange.html' %} + +
+ +
+
+ +
+ +
+
+ + + + + + + diff --git a/var/www/templates/crawler/crawler_splash/settings_splash_crawler.html b/var/www/templates/crawler/crawler_splash/settings_crawler.html similarity index 58% rename from var/www/templates/crawler/crawler_splash/settings_splash_crawler.html rename to var/www/templates/crawler/crawler_splash/settings_crawler.html index 739350e4..ef70c522 100644 --- a/var/www/templates/crawler/crawler_splash/settings_splash_crawler.html +++ b/var/www/templates/crawler/crawler_splash/settings_crawler.html @@ -68,17 +68,17 @@ - - + + - +{# #} +{# #} - +
Splash Manager URL{{splash_manager_url}}Lacus URL{{lacus_url}}
API KeyAPI Key#} +{# {{api_key}}#} +{# #} +{# - {{api_key}} - - - + @@ -92,126 +92,52 @@ -
- -
-
-
All Splash Crawlers:
- - - - - - - - - - {% for splash_name in all_splash %} - - - - - - - - {% endfor %} - -
- Splash name - - Proxy - - Crawler type - - Description -
- {{splash_name}} - - {{all_splash[splash_name]['proxy']}} - - {%if all_splash[splash_name]['type']=='tor'%} - - {%else%} - - {%endif%} - {{all_splash[splash_name]['type']}} - - {{all_splash[splash_name]['description']}} - -
- -
-
-
-
- -
-
-
All Proxies:
- - - - - - - - - - - - {% for proxy_name in all_proxies %} - - - - - - - - - - {% endfor %} - -
- Proxy name - - Host - - Port - - Type - - Crawler Type - - Description -
- {{proxy_name}} - - {{all_proxies[proxy_name]['host']}} - - {{all_proxies[proxy_name]['port']}} - - {{all_proxies[proxy_name]['type']}} - - {%if all_proxies[proxy_name]['crawler_type']=='tor'%} - - {%else%} - - {%endif%} - {{all_proxies[proxy_name]['crawler_type']}} - - {{all_proxies[proxy_name]['description']}} - -
- -
-
-
-
-
+
+
+
All Proxies:
+ + + + + + + + + + + + {% for proxy_name in all_proxies %} + + + + + + + + {% endfor %} + +
Proxy nameURLCrawler TypeDescription
+ {{proxy_name}} + + {{all_proxies[proxy_name]['url']}} + + {%if all_proxies[proxy_name]['crawler_type']=='tor'%} + + {%else%} + + {%endif%} + {{all_proxies[proxy_name]['crawler_type']}} + + {{all_proxies[proxy_name]['description']}} + +
+ +
+
+
+
@@ -242,7 +168,7 @@ {{crawler_error_mess}} -
+ diff --git a/var/www/templates/crawler/crawler_splash/settings_edit_lacus_crawler.html b/var/www/templates/crawler/crawler_splash/settings_edit_lacus_crawler.html new file mode 100644 index 00000000..09c3667b --- /dev/null +++ b/var/www/templates/crawler/crawler_splash/settings_edit_lacus_crawler.html @@ -0,0 +1,61 @@ + + + + + AIL-Framework + + + + + + + + + + + + + + + {% include 'nav_bar.html' %} + +
+
+ + {% include 'crawler/menu_sidebar.html' %} + +
+ +
+
+

Lacus Config:

+ +
+ + +
+{#
#} +{# #} +{# #} +{#
#} + + +
+
+ + +
+
+
+ + + + + diff --git a/var/www/templates/crawler/crawler_splash/settings_edit_splash_crawler_manager.html b/var/www/templates/crawler/crawler_splash/settings_edit_splash_crawler_manager.html deleted file mode 100644 index 2eca4ba8..00000000 --- a/var/www/templates/crawler/crawler_splash/settings_edit_splash_crawler_manager.html +++ /dev/null @@ -1,55 +0,0 @@ - - - - - AIL-Framework - - - - - - - - - - - - - - - {% include 'nav_bar.html' %} - -
-
- - {% include 'crawler/menu_sidebar.html' %} - -
- -
-
- - -
-
- - -
- -
- -
-
-
- - - - - diff --git a/var/www/templates/crawler/crawler_splash/showDomain.html b/var/www/templates/crawler/crawler_splash/showDomain.html index 5cd9ddad..5fc65caa 100644 --- a/var/www/templates/crawler/crawler_splash/showDomain.html +++ b/var/www/templates/crawler/crawler_splash/showDomain.html @@ -38,8 +38,6 @@
- -
@@ -58,7 +56,7 @@
{% endif %} -

{{ dict_domain['domain'] }} :

+

{{ dict_domain['domain'] }}

@@ -66,67 +64,67 @@ - - + + - + - - + +
First Seen Last CheckPortsLanguagesPortsLanguages
{%if "first_seen" in dict_domain%}{{ dict_domain['first_seen'] }}{%endif%} {%if "last_check" in dict_domain%}{{ dict_domain['last_check'] }}{%endif%}{%if dict_domain["ports"]%}{{ dict_domain["ports"] }}{%endif%} - {% for languages in dict_domain['languages'] %} - {{languages}} - {% endfor %} - {%if dict_domain["ports"]%}{{ dict_domain["ports"] }}{%endif%} + {% for languages in dict_domain['languages'] %} + {{languages}} + {% endfor %} +
- {% include 'modals/edit_tag.html' %} + {% include 'modals/edit_tag.html' %} {% for tag in dict_domain['tags'] %} - - + {% endfor %}
- {% include 'modals/add_tags.html' %} - + {% include 'modals/add_tags.html' %} +
- - + + + + - {% if 'father' in dict_domain %} - {% if dict_domain['father']=='manual' or dict_domain['father']=='auto' %} + {% if 'last_origin' in dict_domain %} + {% if dict_domain['last_origin']=='manual' or dict_domain['last_origin']=='auto' %} - + - {%else%} - - - - - {% if dict_domain['father']['domain_father'] %} + {%else%} + + + + + {% if dict_domain['last_origin']['domain'] %} {%endif%}

- {%endif%} + {%endif%} {%endif%}

Last Origin:
Last Origin:
{{ dict_domain['father'] }}{{ dict_domain['last_origin']['item'] }}
- {{ dict_domain['father']['item_father'] }} -
+ {{ dict_domain['last_origin']['item'] }} +
- {{ dict_domain['father']['domain_father'] }} + {{ dict_domain['last_origin']['domain']}}
@@ -136,15 +134,14 @@ {% if dict_domain["correlation_nb"] > 0 %}
+ + + +
{%endif%} {%endif%} - {% with obj_type='domain', obj_id=dict_domain['domain'], obj_subtype=''%} {% include 'modals/investigations_register_obj.html' %} {% endwith %} @@ -157,12 +154,11 @@
- {% with obj_type='domain', obj_id=dict_domain['domain'], obj_lvl=0%} - {% include 'import_export/block_add_user_object_to_export.html' %} - {% endwith %} + {% with obj_type='domain', obj_id=dict_domain['domain'], obj_lvl=0%} + {% include 'import_export/block_add_user_object_to_export.html' %} + {% endwith %} +
- - @@ -329,7 +325,7 @@
- Screenshot   + Screenshot  
{{dict_domain['screenshot']|length}}
@@ -362,94 +358,84 @@
{% endif %} + {% if dict_domain["history"] %} +
+
+
+
Date: + + {{dict_domain["date"]}} + +
+
Capture {% if not dict_domain["crawler_history_items"] %}: Domain DOWN{% endif %}
+
+
- {% if dict_domain["crawler_history"] %} -
+ {% if dict_domain["crawler_history_items"] %} + + + + + + + + + {% for item in dict_domain["crawler_history_items"] %} + + + + + {% endfor %} -
-
-
- Date: - - {{dict_domain["crawler_history"]["date"]}} - -   PORT: - - {{dict_domain["crawler_history"]["port"]}} - -
-
Crawled Items {% if not dict_domain["crawler_history"]["items"] %}: DOWN{% endif %}
-
- -
- - {% if dict_domain["crawler_history"]["items"] %} -
Urls
+ +
{{ item['crawler']["url"] }}
+
+
+ {% for tag in item["tags"] %} + + {{ tag["min_tag"] }} + + {% endfor %} +
+
+ {%if item['crawler']["screenshot"]%} + + + {%endif%} +
- - - - - - - - {% for item in dict_domain["crawler_history"]["items"] %} - - - - - {% endfor %} - - -
Crawled Pastes
- -
{{ item["link"] }}
-
-
- {% for tag in item["tags"] %} - - {{ tag["min_tag"] }} - - {% endfor %} -
-
- {%if item["screenshot"]%} - - - - {%endif%} -
+
{%endif%} {%endif%} {% if dict_domain["history"] %} - - - - - - - - {% for dom_history in dict_domain["history"] %} - - - + {% endfor %} @@ -287,19 +287,22 @@
Domain History
- -
-
{{dict_domain["domain"]}}
- {% if dom_history["status"] %} -
UP
- {% else %} -
DOWN
- {% endif %} -
{{ dom_history["date"] }}
-
-
+ + + + + + + + {% for dom_history in dict_domain["history"] %} + + {% endfor %} @@ -551,16 +537,16 @@ img.onload = pixelate; img.addEventListener("error", img_error); var draw_img = false; -{%if "crawler_history" in dict_domain%} - {%if "random_item" in dict_domain['crawler_history']%} - {%if dict_domain['crawler_history']['random_item']['screenshot']%} - var screenshot = "{{dict_domain['crawler_history']['random_item']['screenshot']}}"; +{%if "crawler_history_items" in dict_domain%} + {%if "random_item" in dict_domain%} + {%if dict_domain['random_item']['crawler']['screenshot']%} + var screenshot = "{{dict_domain['random_item']['crawler']['screenshot']}}"; var selected_icon = $("#"+screenshot.replace(/\//g, "")); selected_icon.addClass("icon_selected"); selected_icon.removeClass("icon_img"); - $("#screenshot_link").attr("href", screenshot_href + "{{dict_domain['crawler_history']['random_item']['id']}}"); - $("#screenshot_link").text("{{dict_domain['crawler_history']['random_item']['link']}}"); + $("#screenshot_link").attr("href", screenshot_href + "{{dict_domain['random_item']['id']}}"); + $("#screenshot_link").text("{{dict_domain['random_item']['crawler']['url']}}"); {%else%} var screenshot = ""; {%endif%} diff --git a/var/www/templates/crawler/menu_sidebar.html b/var/www/templates/crawler/menu_sidebar.html index d3ed9170..56b3019f 100644 --- a/var/www/templates/crawler/menu_sidebar.html +++ b/var/www/templates/crawler/menu_sidebar.html @@ -20,33 +20,28 @@ diff --git a/var/www/templates/crawler/show_domains_by_daterange.html b/var/www/templates/crawler/show_domains_by_daterange.html index d56644c2..313e91a6 100644 --- a/var/www/templates/crawler/show_domains_by_daterange.html +++ b/var/www/templates/crawler/show_domains_by_daterange.html @@ -1,23 +1,24 @@
-
-
- -
- {{ statDomains['total'] }} Crawled - {{ statDomains['domains_queue'] }} Queue -
-
-
+ {% if crawler_stats %} +
+
+ +
+ {{ crawler_stats[type]['crawled'] }} Crawled + {{ crawler_stats[type]['queue'] }} Queue +
+
+
+ {% endif %}
-
Select domains by date range :
-

Some quick example text to build on the card title and make up the bulk of the card's content.

-
+
Search Domains by Date:
+
- +
@@ -40,16 +41,10 @@ Domains DOWN
-
- - -
diff --git a/var/www/templates/decoded/menu_sidebar.html b/var/www/templates/decoded/menu_sidebar.html index 08eef6fd..6130dd9a 100644 --- a/var/www/templates/decoded/menu_sidebar.html +++ b/var/www/templates/decoded/menu_sidebar.html @@ -1,21 +1,21 @@
+ - +
diff --git a/var/www/templates/objects/cve/CveDaterange.html b/var/www/templates/objects/cve/CveDaterange.html new file mode 100644 index 00000000..861c70e7 --- /dev/null +++ b/var/www/templates/objects/cve/CveDaterange.html @@ -0,0 +1,618 @@ + + + + + Decoded - AIL + + + + + + + + + + + + + + + + + + + + + + + + {% include 'nav_bar.html' %} + +
+
+ + {% include 'sidebars/sidebar_objects.html' %} + +
+ +
+
+
+ +
+
+
Search CVE by name:
+ +
+ + +
+ +
+
+
+ + +
+ +
+
+
Select a date range:
+
+
+
+ +
+
+
+ +
+
+ + +
+ + +
+
+ +
+
+
+
+
+
+ + {% if dict_objects %} + {% if date_from|string == date_to|string %} +

{{ date_from }} CVE:

+ {% else %} +

{{ date_from }} to {{ date_to }} CVE:

+ {% endif %} +
Domain History
+ +
+
{{dict_domain["domain"]}}
+ {% if dom_history["status"] %} +
UP
+ {% else %} +
DOWN
+ {% endif %} +
{{ dom_history["date"] }}
+
+
+ + + + + + + + + + + {% for cve_id in dict_objects %} + + + + + + + + {% endfor %} + +
CVE-IDFirst SeenLast SeenTotalLast days
{{ cve_id }}{{ dict_objects[cve_id]['first_seen'] }}{{ dict_objects[cve_id]['last_seen'] }}{{ dict_objects[cve_id]['nb_seen'] }}
+ + + {% else %} + {% if show_objects %} + {% if date_from|string == date_to|string %} +

{{ date_from }}, No CVE

+ {% else %} +

{{ date_from }} to {{ date_to }}, No CVE

+ {% endif %} + {% endif %} + {% endif %} + + + + + + + + + + + + + + + + + + + + + diff --git a/var/www/templates/objects/item/show_item.html b/var/www/templates/objects/item/show_item.html index 198ea02d..6d3afd8b 100644 --- a/var/www/templates/objects/item/show_item.html +++ b/var/www/templates/objects/item/show_item.html @@ -92,7 +92,7 @@
- + @@ -248,7 +248,7 @@ {% for b64 in l_64 %}
  {{ b64[1] }} {{b64[2]}} ({{ b64[4] }}) {{b64[2]}} ({{ b64[4] }})
- - + + + + + diff --git a/var/www/templates/objects/obj_svg_block.html b/var/www/templates/objects/obj_svg_block.html index 8bcacaa6..49a08798 100644 --- a/var/www/templates/objects/obj_svg_block.html +++ b/var/www/templates/objects/obj_svg_block.html @@ -3,4 +3,4 @@ {{ icon }} - + \ No newline at end of file diff --git a/var/www/templates/sidebars/sidebar_objects.html b/var/www/templates/sidebars/sidebar_objects.html index 3f7e2025..9a1650bf 100644 --- a/var/www/templates/sidebars/sidebar_objects.html +++ b/var/www/templates/sidebars/sidebar_objects.html @@ -28,12 +28,18 @@ Objects
Last Origin:
Last Origin:
- {{ meta['father'] }} + {{ meta['father'] }}
- {{ meta['crawler']['domain'] }} + {{ meta['crawler']['domain'] }}