mirror of https://github.com/CIRCL/AIL-framework

commit aa6ba61050 (parent d27d47dc70)
chg: [statistics] ARDB migration
				|  | @ -16,6 +16,13 @@ import re | |||
| import string | ||||
| from itertools import chain | ||||
| 
 | ||||
| sys.path.append(os.environ['AIL_BIN']) | ||||
| ################################## | ||||
| # Import Project packages        # | ||||
| ################################## | ||||
| from lib import Statistics | ||||
| 
 | ||||
| 
 | ||||
| from packages import Item | ||||
| from pubsublogger import publisher | ||||
| 
 | ||||
|  | @ -48,6 +55,7 @@ def is_valid_iban(iban): | |||
|         return True | ||||
|     return False | ||||
| 
 | ||||
| # # TODO: SET | ||||
| def check_all_iban(l_iban, obj_id): | ||||
|     nb_valid_iban = 0 | ||||
|     for iban in l_iban: | ||||
|  | @ -61,7 +69,8 @@ def check_all_iban(l_iban, obj_id): | |||
|             if is_valid_iban(iban): | ||||
|                 print('------') | ||||
|                 nb_valid_iban = nb_valid_iban + 1 | ||||
|                 server_statistics.hincrby('iban_by_country:'+date, iban[0:2], 1) | ||||
|                 Statistics.add_iban_country_stats_by_date(date, iban[0:2], 1) | ||||
| 
 | ||||
| 
 | ||||
|     if(nb_valid_iban > 0): | ||||
|         to_print = 'Iban;{};{};{};'.format(Item.get_source(obj_id), Item.get_item_date(obj_id), Item.get_basename(obj_id)) | ||||
|  | @ -70,9 +79,6 @@ def check_all_iban(l_iban, obj_id): | |||
|         msg = 'infoleak:automatic-detection="iban";{}'.format(obj_id) | ||||
|         p.populate_set_out(msg, 'Tags') | ||||
| 
 | ||||
|         #Send to duplicate | ||||
|         p.populate_set_out(obj_id, 'Duplicate') | ||||
| 
 | ||||
| if __name__ == "__main__": | ||||
|     publisher.port = 6380 | ||||
|     publisher.channel = "Script" | ||||
|  | @ -82,13 +88,6 @@ if __name__ == "__main__": | |||
|     p = Process(config_section) | ||||
|     max_execution_time = p.config.getint("BankAccount", "max_execution_time") | ||||
| 
 | ||||
|     # ARDB # | ||||
|     server_statistics = redis.StrictRedis( | ||||
|         host=p.config.get("ARDB_Statistics", "host"), | ||||
|         port=p.config.getint("ARDB_Statistics", "port"), | ||||
|         db=p.config.getint("ARDB_Statistics", "db"), | ||||
|         decode_responses=True) | ||||
| 
 | ||||
|     publisher.info("BankAccount started") | ||||
| 
 | ||||
|     #iban_regex = re.compile(r'\b[A-Za-z]{2}[0-9]{2}(?:[ ]?[0-9]{4}){4}(?:[ ]?[0-9]{1,2})?\b') | ||||
|  |  | |||
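The BankAccount hunk above replaces a raw `hincrby` on the statistics database with the new `Statistics.add_iban_country_stats_by_date()` helper added later in this diff. A minimal sketch of the helper pair and the call it replaces, assuming `r_statistics` is the ARDB_Statistics connection (example date and country code are illustrative):

```python
# lib/Statistics.py (added by this commit): per-day IBAN country counters
def add_iban_country_stats_by_date(date, tld, nb):
    r_statistics.hincrby(f'iban_by_country:{date}', tld, int(nb))

def get_iban_country_stats_by_date(date):
    return r_statistics.hgetall(f'iban_by_country:{date}')

# BankAccount.py now counts one valid IBAN under its two-letter country prefix:
# Statistics.add_iban_country_stats_by_date('20220101', 'DE', 1)
```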
|  | @ -314,14 +314,6 @@ if __name__ == '__main__': | |||
| 
 | ||||
|     print('splash url: {}'.format(splash_url)) | ||||
| 
 | ||||
|     PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes")) | ||||
| 
 | ||||
|     r_serv_metadata = redis.StrictRedis( | ||||
|         host=p.config.get("ARDB_Metadata", "host"), | ||||
|         port=p.config.getint("ARDB_Metadata", "port"), | ||||
|         db=p.config.getint("ARDB_Metadata", "db"), | ||||
|         decode_responses=True) | ||||
| 
 | ||||
|     r_cache = redis.StrictRedis( | ||||
|         host=p.config.get("Redis_Cache", "host"), | ||||
|         port=p.config.getint("Redis_Cache", "port"), | ||||
|  |  | |||
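The hunk above removes another hand-built `redis.StrictRedis` connection. The pattern applied throughout this commit moves connection construction into `ConfigLoader`; a before/after sketch, assuming the `ARDB_Metadata` section name is unchanged:

```python
import redis

# Before: every script assembled its own ARDB connection from the raw config
r_serv_metadata = redis.StrictRedis(
    host=p.config.get("ARDB_Metadata", "host"),
    port=p.config.getint("ARDB_Metadata", "port"),
    db=p.config.getint("ARDB_Metadata", "db"),
    decode_responses=True)

# After: the shared loader owns connection construction
from lib.ConfigLoader import ConfigLoader
config_loader = ConfigLoader()
r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
config_loader = None  # released after loading, as done elsewhere in this diff
```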
|  | @ -15,6 +15,7 @@ sys.path.append(os.environ['AIL_BIN']) | |||
| # Import Project packages | ||||
| ################################## | ||||
| from lib.ConfigLoader import ConfigLoader | ||||
| from lib import Statistics | ||||
| from lib import Tag | ||||
| from lib import Users | ||||
| from lib.objects import Decodeds | ||||
|  | @ -35,6 +36,8 @@ r_serv_tracker = config_loader.get_redis_conn("ARDB_Tracker") | |||
| r_serv_tags = config_loader.get_redis_conn("ARDB_Tags") | ||||
| r_crawler = config_loader.get_redis_conn("ARDB_Onion") | ||||
| r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata") | ||||
| r_serv_trend = config_loader.get_redis_conn("ARDB_Trending") | ||||
| r_statistics = config_loader.get_redis_conn("ARDB_Statistics") | ||||
| config_loader = None | ||||
| # # - - CONFIGS - - # # | ||||
| 
 | ||||
|  | @ -358,9 +361,16 @@ def items_migration(): | |||
| def get_last_crawled_domains(domain_type): | ||||
|     return r_crawler.lrange(f'last_{domain_type}', 0 ,-1) | ||||
| 
 | ||||
| def get_domains_blacklist(domain_type): | ||||
|     return r_crawler.smembers(f'blacklist_{domain_type}') | ||||
| 
 | ||||
| def crawler_migration(): | ||||
|     print('CRAWLER MIGRATION...') | ||||
| 
 | ||||
|     for domain_type in ['onion', 'regular']: | ||||
|         for domain in get_domains_blacklist(domain_type): | ||||
|             crawlers.add_domain_blacklist(domain_type, domain) | ||||
| 
 | ||||
|     # for domain_type in ['onion', 'regular']: | ||||
|     #     for row in get_last_crawled_domains(domain_type): | ||||
|     #         dom_row, epoch = row.rsplit(';', 1) | ||||
|  | @ -368,19 +378,18 @@ def crawler_migration(): | |||
|     #         print(domain, port, epoch) | ||||
|     #         #crawlers.add_last_crawled_domain(domain_type, domain, port, epoch) | ||||
| 
 | ||||
|     for cookiejar_uuid in old_crawlers.get_all_cookiejar(): | ||||
|         meta = old_crawlers.get_cookiejar_metadata(cookiejar_uuid, level=True) | ||||
|         #print(meta) | ||||
|         #crawlers.create_cookiejar(meta['user_id'], level=meta['level'], description=meta['description'], cookiejar_uuid=cookiejar_uuid) | ||||
|         #crawlers._set_cookiejar_date(meta['date']) | ||||
| 
 | ||||
|         for meta_cookie, cookie_uuid in old_crawlers.get_cookiejar_cookies_list(cookiejar_uuid, add_cookie_uuid=True): | ||||
|             print(cookie_uuid) | ||||
|             #crawlers.add_cookie_to_cookiejar(cookiejar_uuid, meta_cookie, cookie_uuid=cookie_uuid) | ||||
|     # for cookiejar_uuid in old_crawlers.get_all_cookiejar(): | ||||
|     #     meta = old_crawlers.get_cookiejar_metadata(cookiejar_uuid, level=True) | ||||
|     #     #print(meta) | ||||
|     #     crawlers.create_cookiejar(meta['user_id'], level=meta['level'], description=meta['description'], cookiejar_uuid=cookiejar_uuid) | ||||
|     #     crawlers._set_cookiejar_date(meta['date']) | ||||
|     # | ||||
|     #     for meta_cookie, cookie_uuid in old_crawlers.get_cookiejar_cookies_list(cookiejar_uuid, add_cookie_uuid=True): | ||||
|     #         print(cookie_uuid) | ||||
|     #         crawlers.add_cookie_to_cookiejar(cookiejar_uuid, meta_cookie, cookie_uuid=cookie_uuid) | ||||
| 
 | ||||
|     # TODO: auto crawler -> to Fix / change | ||||
| 
 | ||||
| 
 | ||||
|     # TODO: crawlers queues | ||||
| 
 | ||||
| ############################### | ||||
|  | @ -689,12 +698,89 @@ def subtypes_obj_migration(): | |||
| # | ||||
| # Credential: | ||||
| # HSET 'credential_by_tld:'+date, tld, 1 | ||||
| 
 | ||||
| def get_all_provider(): | ||||
|     return r_serv_trend.smembers('all_provider_set') | ||||
| 
 | ||||
| def get_item_source_stats_by_date(date, source): | ||||
|     stats = {} | ||||
|     stats['num'] = r_serv_trend.hget(f'{source}_num', date) | ||||
|     stats['size'] = r_serv_trend.hget(f'{source}_size', date) | ||||
|     stats['avg'] = r_serv_trend.hget(f'{source}_avg', date) | ||||
|     return stats | ||||
| 
 | ||||
| def get_item_stats_size_avg_by_date(date): | ||||
|     return r_serv_trend.zrange(f'top_avg_size_set_{date}', 0, -1, withscores=True) | ||||
| 
 | ||||
| def get_item_stats_nb_by_date(date): | ||||
|     return r_serv_trend.zrange(f'providers_set_{date}', 0, -1, withscores=True) | ||||
| 
 | ||||
| def get_top_stats_module(module_name, date): | ||||
|     return r_serv_trend.zrange(f'top_{module_name}_set_{date}', 0, -1, withscores=True) | ||||
| 
 | ||||
| def get_module_tld_stats_by_date(module, date): | ||||
|     return r_statistics.hgetall(f'{module}_by_tld:{date}') | ||||
| 
 | ||||
| def statistics_migration(): | ||||
|     # paste_by_modules_timeout | ||||
| 
 | ||||
|     # Date full history => lot of keys | ||||
| 
 | ||||
| 
 | ||||
|     # top_size_set_{date} | ||||
|     # top_avg_size_set_{date} | ||||
| 
 | ||||
|     # 'providers_set_{date} | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|     sources = get_all_provider() | ||||
|     for date in Date.get_date_range_today('20180101'): | ||||
| 
 | ||||
|         size_avg = get_item_stats_size_avg_by_date(date) | ||||
| 
 | ||||
|         nb_items = get_item_stats_nb_by_date(date) | ||||
| 
 | ||||
|         # top_size_set_{date} | ||||
|         # top_avg_size_set_{date} | ||||
| 
 | ||||
|         # 'providers_set_{date} | ||||
| 
 | ||||
|         # ITEM STATS | ||||
|         for source in sources: | ||||
|             source_stat = get_item_source_stats_by_date(date, source) | ||||
|             Statistics._create_item_stats_size_nb(date, source, source_stat['num'], source_stat['size'], source_stat['avg']) | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|         # # MODULE STATS | ||||
|         # for module in ['credential', 'mail', 'SQLInjection']: | ||||
|         #     stats = get_module_tld_stats_by_date(module, date) | ||||
|         #     for tld in stats: | ||||
|         #         if tld: | ||||
|         #             print(module, date, tld, stats[tld]) | ||||
|         #             Statistics.add_module_tld_stats_by_date(module, date, tld, stats[tld]) | ||||
|         # for module in ['credential']: | ||||
|         #     # TOP STATS | ||||
|         #     top_module = get_top_stats_module(module, date) | ||||
|         #     for keyword, total_sum in top_module: | ||||
|         #         print(date, module, keyword, total_sum) | ||||
|         #         #Statistics._add_module_stats(module, total_sum, keyword, date) | ||||
| 
 | ||||
| 
| 
 | ||||
| 
 | ||||
| if __name__ == '__main__': | ||||
| 
 | ||||
|     core_migration() | ||||
|     #core_migration() | ||||
|     # user_migration() | ||||
|     # tags_migration() | ||||
|     #items_migration() | ||||
|  | @ -706,6 +792,7 @@ if __name__ == '__main__': | |||
|     # ail_2_ail_migration() | ||||
|     # trackers_migration() | ||||
|     # investigations_migration() | ||||
|     statistics_migration() | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
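The `statistics_migration()` function above copies the per-source item counters from the old trending database into the new statistics store. A sketch of the copy loop, with a guard (an assumption, not in the commit) for sources that have no entry on a given date, since `hget` returns `None` there:

```python
sources = get_all_provider()                     # SMEMBERS all_provider_set (old DB)
for date in Date.get_date_range_today('20180101'):
    for source in sources:
        stats = get_item_source_stats_by_date(date, source)
        # hget returns None for missing (date, source) pairs; skipping them
        # is an assumption added here, not part of the commit
        if stats['num'] is not None:
            Statistics._create_item_stats_size_nb(date, source, stats['num'],
                                                  stats['size'], stats['avg'])
```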
|  | @ -217,8 +217,12 @@ function launching_scripts { | |||
|     sleep 0.1 | ||||
|     screen -S "Script_AIL" -X screen -t "Onion" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Onion.py; read x" | ||||
|     sleep 0.1 | ||||
|     screen -S "Script_AIL" -X screen -t "Mail" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Mail.py; read x" | ||||
|     sleep 0.1 | ||||
|     # screen -S "Script_AIL" -X screen -t "SentimentAnalysis" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./SentimentAnalysis.py; read x" | ||||
|     # sleep 0.1 | ||||
|     screen -S "Script_AIL" -X screen -t "ModuleStats" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./ModuleStats.py; read x" | ||||
|     sleep 0.1 | ||||
|     screen -S "Script_AIL" -X screen -t "Telegram" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Telegram.py; read x" | ||||
|     sleep 0.1 | ||||
| 
 | ||||
|  | @ -267,8 +271,6 @@ function launching_scripts { | |||
|     sleep 0.1 | ||||
|     screen -S "Script_AIL" -X screen -t "BankAccount" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./BankAccount.py; read x" | ||||
|     sleep 0.1 | ||||
|     screen -S "Script_AIL" -X screen -t "Mail" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Mail.py; read x" | ||||
|     sleep 0.1 | ||||
|     screen -S "Script_AIL" -X screen -t "PgpDump" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./PgpDump.py; read x" | ||||
|     sleep 0.1 | ||||
|     screen -S "Script_AIL" -X screen -t "Cryptocurrency" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Cryptocurrencies.py; read x" | ||||
|  | @ -277,8 +279,6 @@ function launching_scripts { | |||
|     sleep 0.1 | ||||
|     screen -S "Script_AIL" -X screen -t "Cve" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Cve.py; read x" | ||||
|     sleep 0.1 | ||||
|     screen -S "Script_AIL" -X screen -t "ModuleStats" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./ModuleStats.py; read x" | ||||
|     sleep 0.1 | ||||
|     screen -S "Script_AIL" -X screen -t "MISPtheHIVEfeeder" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./MISP_The_Hive_feeder.py; read x" | ||||
|     sleep 0.1 | ||||
|     screen -S "Script_AIL" -X screen -t "Languages" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Languages.py; read x" | ||||
|  |  | |||
							
								
								
									
bin/Mail.py (220 lines deleted)
							|  | @ -1,220 +0,0 @@ | |||
| #!/usr/bin/env python3 | ||||
| # -*-coding:UTF-8 -* | ||||
| 
 | ||||
| """ | ||||
| The Mails Module | ||||
| ====================== | ||||
| 
 | ||||
| This module is consuming the Redis-list created by the Categ module. | ||||
| 
 | ||||
| It apply mail regexes on item content and warn if above a threshold. | ||||
| 
 | ||||
| """ | ||||
| 
 | ||||
| import os | ||||
| import re | ||||
| import sys | ||||
| import uuid | ||||
| import redis | ||||
| import time | ||||
| import datetime | ||||
| 
 | ||||
| import dns.resolver | ||||
| import dns.exception | ||||
| 
 | ||||
| from multiprocessing import Process as Proc | ||||
| 
 | ||||
| from pubsublogger import publisher | ||||
| from Helper import Process | ||||
| 
 | ||||
| from pyfaup.faup import Faup | ||||
| 
 | ||||
| sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages')) | ||||
| import Item | ||||
| 
 | ||||
| sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/')) | ||||
| import ConfigLoader | ||||
| 
 | ||||
| ## LOAD CONFIG ## | ||||
| config_loader = ConfigLoader.ConfigLoader() | ||||
| server_statistics = config_loader.get_redis_conn("ARDB_Statistics") | ||||
| r_serv_cache = config_loader.get_redis_conn("Redis_Cache") | ||||
| 
 | ||||
| dns_server = config_loader.get_config_str('Mail', 'dns') | ||||
| 
 | ||||
| config_loader = None | ||||
| ## -- ## | ||||
| 
 | ||||
| def is_mxdomain_in_cache(mxdomain): | ||||
|     return r_serv_cache.exists('mxdomain:{}'.format(mxdomain)) | ||||
| 
 | ||||
| def save_mxdomain_in_cache(mxdomain): | ||||
|     r_serv_cache.setex('mxdomain:{}'.format(mxdomain), 1, datetime.timedelta(days=1)) | ||||
| 
 | ||||
| def check_mx_record(set_mxdomains, dns_server): | ||||
|     """Check if emails MX domains are responding. | ||||
| 
 | ||||
|     :param adress_set: -- (set) This is a set of emails domains | ||||
|     :return: (int) Number of adress with a responding and valid MX domains | ||||
| 
 | ||||
|     """ | ||||
|     resolver = dns.resolver.Resolver() | ||||
|     resolver.nameservers = [dns_server] | ||||
|     resolver.timeout = 5.0 | ||||
|     resolver.lifetime = 2.0 | ||||
| 
 | ||||
|     valid_mxdomain = [] | ||||
|     for mxdomain in set_mxdomains: | ||||
| 
 | ||||
|         # check if is in cache | ||||
|         # # TODO: | ||||
|         if is_mxdomain_in_cache(mxdomain): | ||||
|             valid_mxdomain.append(mxdomain) | ||||
|         else: | ||||
| 
 | ||||
|             # DNS resolution | ||||
|             try: | ||||
|                 answers = resolver.query(mxdomain, rdtype=dns.rdatatype.MX) | ||||
|                 if answers: | ||||
|                     save_mxdomain_in_cache(mxdomain) | ||||
|                     valid_mxdomain.append(mxdomain) | ||||
|                     # DEBUG | ||||
|                     # print('---') | ||||
|                     # print(answers.response) | ||||
|                     # print(answers.qname) | ||||
|                     # print(answers.rdtype) | ||||
|                     # print(answers.rdclass) | ||||
|                     # print(answers.nameserver) | ||||
|                     # print() | ||||
| 
 | ||||
|             except dns.resolver.NoNameservers: | ||||
|                 publisher.debug('NoNameserver, No non-broken nameservers are available to answer the query.') | ||||
|                 print('NoNameserver, No non-broken nameservers are available to answer the query.') | ||||
|             except dns.resolver.NoAnswer: | ||||
|                 publisher.debug('NoAnswer, The response did not contain an answer to the question.') | ||||
|                 print('NoAnswer, The response did not contain an answer to the question.') | ||||
|             except dns.name.EmptyLabel: | ||||
|                 publisher.debug('SyntaxError: EmptyLabel') | ||||
|                 print('SyntaxError: EmptyLabel') | ||||
|             except dns.resolver.NXDOMAIN: | ||||
|                 #save_mxdomain_in_cache(mxdomain) | ||||
|                 publisher.debug('The query name does not exist.') | ||||
|                 print('The query name does not exist.') | ||||
|             except dns.name.LabelTooLong: | ||||
|                 publisher.debug('The Label is too long') | ||||
|                 print('The Label is too long') | ||||
|             except dns.exception.Timeout: | ||||
|                 print('dns timeout') | ||||
|                 #save_mxdomain_in_cache(mxdomain) | ||||
|             except Exception as e: | ||||
|                 print(e) | ||||
|     return valid_mxdomain | ||||
| 
 | ||||
| def extract_all_emails(redis_key, item_content): | ||||
|     all_emails = re.findall(email_regex, item_content) | ||||
|     if len(all_emails) > 1: | ||||
|         r_serv_cache.sadd(redis_key, *all_emails) | ||||
|         r_serv_cache.expire(redis_key, 360) | ||||
|     elif all_emails: | ||||
|         r_serv_cache.sadd(redis_key, all_emails[0]) | ||||
|         r_serv_cache.expire(redis_key, 360) | ||||
| 
 | ||||
| if __name__ == "__main__": | ||||
|     publisher.port = 6380 | ||||
|     publisher.channel = "Script" | ||||
| 
 | ||||
|     config_section = 'Mail' | ||||
| 
 | ||||
|     faup = Faup() | ||||
| 
 | ||||
|     p = Process(config_section) | ||||
| 
 | ||||
|     publisher.info("Mails module started") | ||||
| 
 | ||||
|     # Numbers of Mails needed to Tags | ||||
|     mail_threshold = 10 | ||||
| 
 | ||||
|     max_execution_time = 30 | ||||
| 
 | ||||
|     email_regex = "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}" | ||||
| 
 | ||||
|     redis_key = 'mail_extracted:{}'.format(str(uuid.uuid4())) | ||||
| 
 | ||||
|     while True: | ||||
|         message = p.get_from_set() | ||||
| 
 | ||||
|         if message is not None: | ||||
|             item_id, score = message.split() | ||||
| 
 | ||||
|             print(item_id) | ||||
| 
 | ||||
|             item_content = Item.get_item_content(item_id) | ||||
| 
 | ||||
|             proc = Proc(target=extract_all_emails, args=(redis_key, item_content, )) | ||||
|             try: | ||||
|                 proc.start() | ||||
|                 proc.join(max_execution_time) | ||||
|                 if proc.is_alive(): | ||||
|                     proc.terminate() | ||||
|                     p.incr_module_timeout_statistic() | ||||
|                     err_mess = "Mails: processing timeout: {}".format(item_id) | ||||
|                     print(err_mess) | ||||
|                     publisher.info(err_mess) | ||||
|                     continue | ||||
|                 else: | ||||
|                     all_emails = r_serv_cache.smembers(redis_key) | ||||
|                     r_serv_cache.delete(redis_key) | ||||
|                     proc.terminate() | ||||
|             except KeyboardInterrupt: | ||||
|                 print("Caught KeyboardInterrupt, terminating workers") | ||||
|                 proc.terminate() | ||||
|                 sys.exit(0) | ||||
| 
 | ||||
|             # get MXdomains | ||||
|             set_mxdomains = set() | ||||
|             dict_mxdomains_email = {} | ||||
|             for email in all_emails: | ||||
|                 mxdomain = email.split('@')[1].lower() | ||||
|                 if not mxdomain in dict_mxdomains_email: | ||||
|                     dict_mxdomains_email[mxdomain] = [] | ||||
|                     set_mxdomains.add(mxdomain) | ||||
|                 dict_mxdomains_email[mxdomain].append(email) | ||||
| 
 | ||||
|                 ## TODO: add MAIL trackers | ||||
| 
 | ||||
|             valid_mx = check_mx_record(set_mxdomains, dns_server) | ||||
| 
 | ||||
|             item_date = Item.get_item_date(item_id) | ||||
| 
 | ||||
|             num_valid_email = 0 | ||||
|             for domain_mx in valid_mx: | ||||
|                 num_valid_email += len(dict_mxdomains_email[domain_mx]) | ||||
| 
 | ||||
|                 for email in dict_mxdomains_email[domain_mx]: | ||||
|                     msg = 'mail;{};{};{}'.format(1, email, item_date) | ||||
|                     p.populate_set_out(msg, 'ModuleStats') | ||||
| 
 | ||||
|                     # Create country stats | ||||
|                     faup.decode(email) | ||||
|                     tld = faup.get()['tld'] | ||||
|                     try: | ||||
|                         tld = tld.decode() | ||||
|                     except: | ||||
|                         pass | ||||
|                     server_statistics.hincrby('mail_by_tld:{}'.format(item_date), tld, 1) | ||||
| 
 | ||||
|             msg = 'Mails;{};{};{};Checked {} e-mail(s);{}'.format(Item.get_source(item_id), item_date, Item.get_item_basename(item_id), num_valid_email, item_id) | ||||
| 
 | ||||
|             if num_valid_email > mail_threshold: | ||||
|                 print('{}    Checked {} e-mail(s)'.format(item_id, num_valid_email)) | ||||
|                 publisher.warning(msg) | ||||
|                 #Send to duplicate | ||||
|                 p.populate_set_out(item_id, 'Duplicate') | ||||
|                 #tags | ||||
|                 msg = 'infoleak:automatic-detection="mail";{}'.format(item_id) | ||||
|                 p.populate_set_out(msg, 'Tags') | ||||
|             else: | ||||
|                 publisher.info(msg) | ||||
| 
 | ||||
|         else: | ||||
|             time.sleep(10) | ||||
|  | @ -1,156 +0,0 @@ | |||
| #!/usr/bin/env python3 | ||||
| # -*-coding:UTF-8 -* | ||||
| """ | ||||
|     This module makes statistics for some modules and providers | ||||
| 
 | ||||
| """ | ||||
| 
 | ||||
| ################################## | ||||
| # Import External packages       # | ||||
| ################################## | ||||
| import time | ||||
| import datetime | ||||
| import redis | ||||
| import os | ||||
| import sys | ||||
| 
 | ||||
| sys.path.append(os.environ['AIL_BIN']) | ||||
| ################################## | ||||
| # Import Project packages        # | ||||
| ################################## | ||||
| from modules.abstract_module import AbstractModule | ||||
| from packages.Date import Date | ||||
| from packages import Paste | ||||
| import ConfigLoader | ||||
| 
 | ||||
| 
 | ||||
| class ModuleStats(AbstractModule): | ||||
|     """ | ||||
|     Module Statistics module for AIL framework | ||||
|     """ | ||||
| 
 | ||||
|     # Config Var | ||||
|     MAX_SET_CARDINALITY = 8 | ||||
| 
 | ||||
| 
 | ||||
|     def __init__(self): | ||||
| 
 | ||||
|         super(ModuleStats, self).__init__() | ||||
| 
 | ||||
|         # Waiting time in secondes between to message proccessed | ||||
|         self.pending_seconds = 20 | ||||
| 
 | ||||
|         # Sent to the logging a description of the module | ||||
|         self.redis_logger.info("Makes statistics about valid URL") | ||||
| 
 | ||||
|         # REDIS # | ||||
|         self.r_serv_trend = ConfigLoader.ConfigLoader().get_redis_conn("ARDB_Trending") | ||||
| 
 | ||||
|     def compute(self, message): | ||||
| 
 | ||||
|         if len(message.split(';')) > 1: | ||||
|             self.compute_most_posted(message) | ||||
|         else: | ||||
|             self.compute_provider_info(message) | ||||
| 
 | ||||
| 
 | ||||
|     def get_date_range(self, num_day): | ||||
|         curr_date = datetime.date.today() | ||||
|         date = Date(str(curr_date.year)+str(curr_date.month).zfill(2)+str(curr_date.day).zfill(2)) | ||||
|         date_list = [] | ||||
| 
 | ||||
|         for i in range(0, num_day+1): | ||||
|             date_list.append(date.substract_day(i)) | ||||
|         return date_list | ||||
| 
 | ||||
| 
 | ||||
|     def compute_most_posted(self, message): | ||||
|         module, num, keyword, paste_date = message.split(';') | ||||
| 
 | ||||
|         redis_progression_name_set = 'top_'+ module +'_set_' + paste_date | ||||
| 
 | ||||
|         # Add/Update in Redis | ||||
|         self.r_serv_trend.hincrby(paste_date, module+'-'+keyword, int(num)) | ||||
| 
 | ||||
|         # Compute Most Posted | ||||
|         date = self.get_date_range(0)[0] | ||||
|         # check if this keyword is eligible for progression | ||||
|         keyword_total_sum = 0 | ||||
| 
 | ||||
|         curr_value = self.r_serv_trend.hget(date, module+'-'+keyword) | ||||
|         keyword_total_sum += int(curr_value) if curr_value is not None else 0 | ||||
| 
 | ||||
|         if self.r_serv_trend.zcard(redis_progression_name_set) < self.MAX_SET_CARDINALITY: | ||||
|             self.r_serv_trend.zadd(redis_progression_name_set, float(keyword_total_sum), keyword) | ||||
| 
 | ||||
|         else: # not in set | ||||
|             member_set = self.r_serv_trend.zrangebyscore(redis_progression_name_set, '-inf', '+inf', withscores=True, start=0, num=1) | ||||
|             # Member set is a list of (value, score) pairs | ||||
|             if int(member_set[0][1]) < keyword_total_sum: | ||||
|                 #remove min from set and add the new one | ||||
|                 self.redis_logger.debug(module + ': adding ' +keyword+ '(' +str(keyword_total_sum)+') in set and removing '+member_set[0][0]+'('+str(member_set[0][1])+')') | ||||
|                 self.r_serv_trend.zrem(redis_progression_name_set, member_set[0][0]) | ||||
|                 self.r_serv_trend.zadd(redis_progression_name_set, float(keyword_total_sum), keyword) | ||||
|                 self.redis_logger.debug(redis_progression_name_set) | ||||
| 
 | ||||
| 
 | ||||
|     def compute_provider_info(self, message): | ||||
|         redis_all_provider = 'all_provider_set' | ||||
| 
 | ||||
|         paste = Paste.Paste(message) | ||||
| 
 | ||||
|         paste_baseName = paste.p_name.split('.')[0] | ||||
|         paste_size = paste._get_p_size() | ||||
|         paste_provider = paste.p_source | ||||
|         paste_date = str(paste._get_p_date()) | ||||
|         redis_sum_size_set = 'top_size_set_' + paste_date | ||||
|         redis_avg_size_name_set = 'top_avg_size_set_' + paste_date | ||||
|         redis_providers_name_set = 'providers_set_' + paste_date | ||||
| 
 | ||||
|         # Add/Update in Redis | ||||
|         self.r_serv_trend.sadd(redis_all_provider, paste_provider) | ||||
| 
 | ||||
|         num_paste = int(self.r_serv_trend.hincrby(paste_provider+'_num', paste_date, 1)) | ||||
|         sum_size = float(self.r_serv_trend.hincrbyfloat(paste_provider+'_size', paste_date, paste_size)) | ||||
|         new_avg = float(sum_size) / float(num_paste) | ||||
|         self.r_serv_trend.hset(paste_provider +'_avg', paste_date, new_avg) | ||||
| 
 | ||||
| 
 | ||||
|         # | ||||
|         # Compute Most Posted | ||||
|         # | ||||
| 
 | ||||
|         # Size | ||||
|         if self.r_serv_trend.zcard(redis_sum_size_set) < self.MAX_SET_CARDINALITY or self.r_serv_trend.zscore(redis_sum_size_set, paste_provider) != "nil": | ||||
|             self.r_serv_trend.zadd(redis_sum_size_set, float(num_paste), paste_provider) | ||||
|             self.r_serv_trend.zadd(redis_avg_size_name_set, float(new_avg), paste_provider) | ||||
|         else: #set full capacity | ||||
|             member_set = self.r_serv_trend.zrangebyscore(redis_sum_size_set, '-inf', '+inf', withscores=True, start=0, num=1) | ||||
|             # Member set is a list of (value, score) pairs | ||||
|             if float(member_set[0][1]) < new_avg: | ||||
|                 #remove min from set and add the new one | ||||
|                 self.redis_logger.debug('Size - adding ' +paste_provider+ '(' +str(new_avg)+') in set and removing '+member_set[0][0]+'('+str(member_set[0][1])+')') | ||||
|                 self.r_serv_trend.zrem(redis_sum_size_set, member_set[0][0]) | ||||
|                 self.r_serv_trend.zadd(redis_sum_size_set, float(sum_size), paste_provider) | ||||
|                 self.r_serv_trend.zrem(redis_avg_size_name_set, member_set[0][0]) | ||||
|                 self.r_serv_trend.zadd(redis_avg_size_name_set, float(new_avg), paste_provider) | ||||
| 
 | ||||
| 
 | ||||
|         # Num | ||||
|         # if set not full or provider already present | ||||
|         if self.r_serv_trend.zcard(redis_providers_name_set) < self.MAX_SET_CARDINALITY or self.r_serv_trend.zscore(redis_providers_name_set, paste_provider) != "nil": | ||||
|             self.r_serv_trend.zadd(redis_providers_name_set, float(num_paste), paste_provider) | ||||
|         else: #set at full capacity | ||||
|             member_set = self.r_serv_trend.zrangebyscore(redis_providers_name_set, '-inf', '+inf', withscores=True, start=0, num=1) | ||||
|             # Member set is a list of (value, score) pairs | ||||
|             if int(member_set[0][1]) < num_paste: | ||||
|                 #remove min from set and add the new one | ||||
|                 self.redis_logger.debug('Num - adding ' +paste_provider+ '(' +str(num_paste)+') in set and removing '+member_set[0][0]+'('+str(member_set[0][1])+')') | ||||
|                 self.r_serv_trend.zrem(member_set[0][0]) | ||||
|                 self.r_serv_trend.zadd(redis_providers_name_set, float(num_paste), paste_provider) | ||||
| 
 | ||||
| 
 | ||||
| if __name__ == '__main__': | ||||
| 
 | ||||
|     module = ModuleStats() | ||||
|     module.run() | ||||
|  | @ -10,9 +10,133 @@ sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/')) | |||
| import ConfigLoader | ||||
| 
 | ||||
| config_loader = ConfigLoader.ConfigLoader() | ||||
| r_serv_statistics = config_loader.get_redis_conn("ARDB_Statistics") | ||||
| r_statistics = config_loader.get_redis_conn("ARDB_Statistics") | ||||
| #r_serv_trend = ConfigLoader().get_redis_conn("ARDB_Trending") | ||||
| config_loader = None | ||||
| 
 | ||||
| PIE_CHART_MAX_CARDINALITY = 8 | ||||
| 
 | ||||
| def incr_module_timeout_statistic(module_name): | ||||
|     curr_date = datetime.date.today() | ||||
|     r_serv_statistics.hincrby(curr_date.strftime("%Y%m%d"), 'paste_by_modules_timeout:{}'.format(module_name), 1) | ||||
|     r_statistics.hincrby(curr_date.strftime("%Y%m%d"), 'paste_by_modules_timeout:{}'.format(module_name), 1) | ||||
| 
 | ||||
| def create_item_statistics(item_id, source, size): | ||||
|     pass | ||||
| 
 | ||||
| def get_item_sources(): | ||||
|     return r_statistics.smembers('all_provider_set') | ||||
| 
 | ||||
| def get_nb_items_processed_by_day_and_source(date, source): | ||||
|     nb_items = r_statistics.hget(f'{source}_num', date) | ||||
|     if not nb_items: | ||||
|         nb_items = 0 | ||||
|     return int(nb_items) | ||||
| 
 | ||||
| def get_items_total_size_by_day_and_source(date, source): | ||||
|     total_size = r_statistics.hget(f'{source}_size', date) | ||||
|     if not total_size: | ||||
|         total_size = 0 | ||||
|     return float(total_size) | ||||
| 
 | ||||
| def get_items_av_size_by_day_and_source(date, source): | ||||
|     av_size = r_statistics.hget(f'{source}_avg', date) | ||||
|     if not av_size: | ||||
|         av_size = 0 | ||||
|     return float(av_size) | ||||
| 
 | ||||
| def _create_item_stats_size_nb(date, source, num, size, avg): | ||||
|     r_statistics.hset(f'{source}_num', date, num) | ||||
|     r_statistics.hset(f'{source}_size', date, size) | ||||
|     r_statistics.hset(f'{source}_avg', date, avg) | ||||
| 
 | ||||
| def get_item_stats_size_avg_by_date(date): | ||||
|     return r_statistics.zrange(f'top_avg_size_set_{date}', 0, -1, withscores=True) | ||||

 | ||||
| def get_item_stats_nb_by_date(date): | ||||
|     return r_statistics.zrange(f'providers_set_{date}', 0, -1, withscores=True) | ||||

 | ||||
| # # TODO: left incomplete in this commit; hypothetical completion, nb argument assumed | ||||
| def _set_item_stats_nb_by_date(date, source, nb): | ||||
|     r_statistics.zadd(f'providers_set_{date}', float(nb), source) | ||||
| 
 | ||||
| 
 | ||||
| # # TODO: load ZSET IN CACHE => FAST UPDATE | ||||
| def update_item_stats_size_nb(item_id, source, size, date): | ||||
|     # Add/Update in Redis | ||||
|     r_statistics.sadd('all_provider_set', source) | ||||
| 
 | ||||
|     nb_items = int(r_statistics.hincrby(f'{source}_num', date, 1)) | ||||
|     sum_size = float(r_statistics.hincrbyfloat(f'{source}_size', date, size)) | ||||
|     new_avg = sum_size / nb_items | ||||
|     r_statistics.hset(f'{source}_avg', date, new_avg) | ||||
| 
 | ||||
|     # TOP Items Size | ||||
|     if r_statistics.zcard(f'top_avg_size_set_{date}') < PIE_CHART_MAX_CARDINALITY:  # key matches the zadd below | ||||
|         r_statistics.zadd(f'top_avg_size_set_{date}', new_avg, source) | ||||
|     else: | ||||
|         member_set = r_statistics.zrangebyscore(f'top_avg_size_set_{date}', '-inf', '+inf', withscores=True, start=0, num=1) | ||||
|         # Member set is a list of (value, score) pairs | ||||
|         if float(member_set[0][1]) < new_avg: | ||||
|             # remove min from set and add the new one | ||||
|             r_statistics.zrem(f'top_avg_size_set_{date}', member_set[0][0]) | ||||
|             r_statistics.zadd(f'top_avg_size_set_{date}', new_avg, source) | ||||
| 
 | ||||
|     # TOP Nb Items | ||||
|     if r_statistics.zcard(f'providers_set_{date}') < PIE_CHART_MAX_CARDINALITY or r_statistics.zscore(f'providers_set_{date}', source) is not None: | ||||
|         r_statistics.zadd(f'providers_set_{date}', float(nb_items), source) | ||||
|     else: # zset at full capacity | ||||
|         member_set = r_statistics.zrangebyscore(f'providers_set_{date}', '-inf', '+inf', withscores=True, start=0, num=1) | ||||
|         # Member set is a list of (value, score) pairs | ||||
|         if int(member_set[0][1]) < nb_items: | ||||
|             # remove min from set and add the new one | ||||
|             r_statistics.zrem(f'providers_set_{date}', member_set[0][0]) | ||||
|             r_statistics.zadd(f'providers_set_{date}', float(nb_items), source) | ||||
| 
 | ||||
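`update_item_stats_size_nb()` above and `update_module_stats()` below both maintain a capped leaderboard: a zset is allowed at most `PIE_CHART_MAX_CARDINALITY` members, and a newcomer only enters by evicting the lowest-scored member. A hypothetical distillation of the pattern (not part of the commit; `zadd` uses the legacy score-first argument order, matching the rest of this file):

```python
def zadd_bounded(r_conn, key, score, member, max_card=PIE_CHART_MAX_CARDINALITY):
    # Existing members and non-full sets are always updated
    if r_conn.zcard(key) < max_card or r_conn.zscore(key, member) is not None:
        r_conn.zadd(key, float(score), member)
    else:
        # Set is full: evict the minimum only if the newcomer scores higher
        lowest = r_conn.zrangebyscore(key, '-inf', '+inf', withscores=True, start=0, num=1)
        if lowest and float(lowest[0][1]) < float(score):
            r_conn.zrem(key, lowest[0][0])
            r_conn.zadd(key, float(score), member)
```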
| # keyword  num | ||||
| 
 | ||||
| def _add_module_stats(module_name, total_sum, keyword, date): | ||||
|     r_statistics.zadd(f'top_{module_name}_set_{date}', float(total_sum), keyword) | ||||
| 
 | ||||
| # # TODO: ONE HSET BY MODULE / CUSTOM STATS | ||||
| def update_module_stats(module_name, num, keyword, date): | ||||
| 
 | ||||
|     # Add/Update in Redis | ||||
|     r_statistics.hincrby(date, f'{module_name}-{keyword}', int(num)) # # TODO: RENAME ME !!!!!!!!!!!!!!!!!!!!!!!!! | ||||
| 
 | ||||
|     # Compute Most Posted | ||||
|     # check if this keyword is eligible for progression | ||||
|     keyword_total_sum = 0 | ||||
| 
 | ||||
|     curr_value = r_statistics.hget(date, f'{module_name}-{keyword}') | ||||
|     keyword_total_sum += int(curr_value) if curr_value is not None else 0 | ||||
| 
 | ||||
|     if r_statistics.zcard(f'top_{module_name}_set_{date}') < PIE_CHART_MAX_CARDINALITY: | ||||
|         r_statistics.zadd(f'top_{module_name}_set_{date}', float(keyword_total_sum), keyword) | ||||
|     else: # zset at full capacity | ||||
|         member_set = r_statistics.zrangebyscore(f'top_{module_name}_set_{date}', '-inf', '+inf', withscores=True, start=0, num=1) | ||||
|         # Member set is a list of (value, score) pairs | ||||
|         if int(member_set[0][1]) < keyword_total_sum: | ||||
|             #remove min from set and add the new one | ||||
|             r_statistics.zrem(f'top_{module_name}_set_{date}', member_set[0][0]) | ||||
|             r_statistics.zadd(f'top_{module_name}_set_{date}', float(keyword_total_sum), keyword) | ||||
| 
 | ||||
| def get_module_tld_stats_by_tld_date(date, tld): | ||||
|     nb_tld = r_statistics.hget(f'credential_by_tld:{date}', tld) | ||||
|     if not nb_tld: | ||||
|         nb_tld = 0 | ||||
|     return int(nb_tld) | ||||
| 
 | ||||
| def get_module_tld_stats_by_date(module, date): | ||||
|     return r_statistics.hgetall(f'{module}_by_tld:{date}') | ||||
| 
 | ||||
| def add_module_tld_stats_by_date(module, date, tld, nb): | ||||
|     r_statistics.hincrby(f'{module}_by_tld:{date}', tld, int(nb)) | ||||
| 
 | ||||
| 
 | ||||
| def get_iban_country_stats_by_date(date): | ||||
|     return r_statistics.hgetall(f'iban_by_country:{date}') | ||||
| 
 | ||||
| def add_iban_country_stats_by_date(date, tld, nb): | ||||
|     r_statistics.hincrby(f'iban_by_country:{date}', tld, int(nb)) | ||||
| 
 | ||||
| # r_stats.zincrby('module:Global:incomplete_file', datetime.datetime.now().strftime('%Y%m%d'), 1) | ||||
| # r_stats.zincrby('module:Global:invalid_file', datetime.datetime.now().strftime('%Y%m%d'), 1) | ||||
|  |  | |||
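On the read side, the per-day hashes written above come back as plain dicts, e.g. to feed the statistics pie charts that `PIE_CHART_MAX_CARDINALITY` hints at. A hypothetical usage sketch (date and source values are illustrative):

```python
date = '20220101'
credential_tlds = Statistics.get_module_tld_stats_by_date('credential', date)  # {tld: count}
iban_countries = Statistics.get_iban_country_stats_by_date(date)               # {country: count}
nb_items = Statistics.get_nb_items_processed_by_day_and_source(date, 'submitted')
```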
|  | @ -616,6 +616,12 @@ def api_set_nb_crawlers_to_launch(dict_splash_name): | |||
|     else: | ||||
|         return ({'error':'invalid input'}, 400) | ||||
| 
 | ||||
| def get_domains_blacklist(domain_type): | ||||
|     return r_serv_onion.smembers(f'blacklist_{domain_type}') | ||||
| 
 | ||||
| def add_domain_blacklist(domain_type, domain): | ||||
|     r_serv_onion.sadd(f'blacklist_{domain_type}', domain) | ||||
| 
 | ||||
| ##-- CRAWLER GLOBAL --## | ||||
| 
 | ||||
| #### AUTOMATIC CRAWLER #### | ||||
|  |  | |||
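These two helpers back the blacklist copy in `crawler_migration()` shown earlier: the old `blacklist_{domain_type}` sets are read from ARDB and replayed through the new API. Condensed from the migration script above:

```python
for domain_type in ['onion', 'regular']:
    for domain in get_domains_blacklist(domain_type):       # SMEMBERS blacklist_{domain_type}
        crawlers.add_domain_blacklist(domain_type, domain)  # SADD into the new store
```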
|  | @ -42,6 +42,7 @@ from modules.abstract_module import AbstractModule | |||
| from packages.Item import Item | ||||
| from lib import ConfigLoader | ||||
| from lib import regex_helper | ||||
| from lib import Statistics | ||||
| 
 | ||||
| 
 | ||||
| class Credential(AbstractModule): | ||||
|  | @ -96,6 +97,7 @@ class Credential(AbstractModule): | |||
| 
 | ||||
|         item_content = item.get_content() | ||||
| 
 | ||||
|         # TODO: USE SETS | ||||
|         # Extract all credentials | ||||
|         all_credentials = regex_helper.regex_findall(self.module_name, self.redis_cache_key, self.regex_cred, item.get_id(), item_content, max_time=self.max_execution_time) | ||||
| 
 | ||||
|  | @ -117,9 +119,6 @@ class Credential(AbstractModule): | |||
|                 print(f"========> Found more than 10 credentials in this file : {item.get_id()}") | ||||
|                 self.redis_logger.warning(to_print) | ||||
| 
 | ||||
|                 # Send to duplicate | ||||
|                 self.send_message_to_queue(item.get_id(), 'Duplicate') | ||||
| 
 | ||||
|                 msg = f'infoleak:automatic-detection="credential";{item.get_id()}' | ||||
|                 self.send_message_to_queue(msg, 'Tags') | ||||
| 
 | ||||
|  | @ -158,6 +157,7 @@ class Credential(AbstractModule): | |||
|                     print(f"=======> Probably on : {discovered_sites}") | ||||
| 
 | ||||
|                 date = datetime.now().strftime("%Y%m") | ||||
|                 nb_tlds = {} | ||||
|                 for cred in all_credentials: | ||||
|                     maildomains = re.findall("@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,20}", cred.lower())[0] | ||||
|                     self.faup.decode(maildomains) | ||||
|  | @ -167,7 +167,9 @@ class Credential(AbstractModule): | |||
|                         tld = tld.decode() | ||||
|                     except: | ||||
|                         pass | ||||
|                     self.server_statistics.hincrby('credential_by_tld:'+date, tld, 1) | ||||
|                     nb_tlds[tld] = nb_tlds.get(tld, 0) + 1 | ||||
|                 for tld in nb_tlds: | ||||
|                     Statistics.add_module_tld_stats_by_date('credential', date, tld, nb_tlds[tld]) | ||||
|             else: | ||||
|                 self.redis_logger.info(to_print) | ||||
|                 print(f'found {nb_cred} credentials') | ||||
|  |  | |||
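Credential now batches its TLD counters locally and flushes one increment per TLD instead of one `hincrby` per credential. The same batching, sketched with `collections.Counter` (`extract_tld` is a hypothetical stand-in for the module's `faup.decode()`/`faup.get()['tld']` calls):

```python
from collections import Counter

nb_tlds = Counter()
for cred in all_credentials:          # credentials extracted by the module above
    nb_tlds[extract_tld(cred)] += 1   # extract_tld: hypothetical helper
for tld, nb in nb_tlds.items():
    Statistics.add_module_tld_stats_by_date('credential', date, tld, nb)
```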
|  | @ -75,9 +75,6 @@ class CreditCards(AbstractModule): | |||
|             if (len(creditcard_set) > 0): | ||||
|                 self.redis_logger.warning(f'{to_print}Checked {len(creditcard_set)} valid number(s);{item.get_id()}') | ||||
| 
 | ||||
|                 #Send to duplicate | ||||
|                 self.send_message_to_queue(item.get_id(), 'Duplicate') | ||||
| 
 | ||||
|                 msg = f'infoleak:automatic-detection="credit-card";{item.get_id()}' | ||||
|                 self.send_message_to_queue(msg, 'Tags') | ||||
| 
 | ||||
|  |  | |||
|  | @ -0,0 +1,177 @@ | |||
| #!/usr/bin/env python3 | ||||
| # -*-coding:UTF-8 -* | ||||
| 
 | ||||
| """ | ||||
| The Mails Module | ||||
| ====================== | ||||
| 
 | ||||
| This module consumes the Redis list created by the Categ module. | ||||

 | ||||
| It applies mail regexes to item content and warns if the number of valid addresses exceeds a threshold. | ||||
| 
 | ||||
| """ | ||||
| 
 | ||||
| import os | ||||
| import re | ||||
| import redis | ||||
| import sys | ||||
| import time | ||||
| import datetime | ||||
| 
 | ||||
| import dns.resolver | ||||
| import dns.exception | ||||
| 
 | ||||
| from pyfaup.faup import Faup | ||||
| 
 | ||||
| sys.path.append(os.environ['AIL_BIN']) | ||||
| ################################## | ||||
| # Import Project packages        # | ||||
| ################################## | ||||
| from modules.abstract_module import AbstractModule | ||||
| from lib.objects.Items import Item | ||||
| from lib.ConfigLoader import ConfigLoader | ||||
| from lib import Statistics | ||||
| 
 | ||||
| 
 | ||||
| class Mail(AbstractModule): | ||||
|     """ | ||||
|     Module Mail module for AIL framework | ||||
|     """ | ||||
| 
 | ||||
|     def __init__(self): | ||||
|         super(Mail, self).__init__() | ||||
| 
 | ||||
|         config_loader = ConfigLoader() | ||||
|         self.r_cache = config_loader.get_redis_conn("Redis_Cache") | ||||
| 
 | ||||
|         self.dns_server = config_loader.get_config_str('Mail', 'dns') | ||||
| 
 | ||||
|         self.faup = Faup() | ||||
| 
 | ||||
|         # Numbers of Mails needed to Tags | ||||
|         self.mail_threshold = 10 | ||||
| 
 | ||||
|         self.regex_timeout = 30 | ||||
|         self.email_regex = "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}" | ||||
|         re.compile(self.email_regex)  # compiled once to validate the pattern; regex_findall consumes the raw string | ||||
| 
 | ||||
|     def is_mxdomain_in_cache(self, mxdomain): | ||||
|         return self.r_cache.exists(f'mxdomain:{mxdomain}') | ||||
| 
 | ||||
|     def save_mxdomain_in_cache(self, mxdomain): | ||||
|         self.r_cache.setex(f'mxdomain:{mxdomain}', 1, datetime.timedelta(days=1)) | ||||
| 
 | ||||
|     def check_mx_record(self, set_mxdomains): | ||||
|         """Check if emails MX domains are responding. | ||||
| 
 | ||||
|         :param adress_set: -- (set) This is a set of emails domains | ||||
|         :return: (int) Number of adress with a responding and valid MX domains | ||||
| 
 | ||||
|         """ | ||||
|         resolver = dns.resolver.Resolver() | ||||
|         resolver.nameservers = [self.dns_server] | ||||
|         resolver.timeout = 5.0 | ||||
|         resolver.lifetime = 2.0 | ||||
| 
 | ||||
|         valid_mxdomain = [] | ||||
|         for mxdomain in set_mxdomains: | ||||
| 
 | ||||
|             # check if is in cache | ||||
|             # # TODO: | ||||
|             if self.is_mxdomain_in_cache(mxdomain): | ||||
|                 valid_mxdomain.append(mxdomain) | ||||
|             else: | ||||
| 
 | ||||
|                 # DNS resolution | ||||
|                 try: | ||||
|                     answers = resolver.query(mxdomain, rdtype=dns.rdatatype.MX) | ||||
|                     if answers: | ||||
|                         self.save_mxdomain_in_cache(mxdomain) | ||||
|                         valid_mxdomain.append(mxdomain) | ||||
|                         # DEBUG | ||||
|                         # print('---') | ||||
|                         # print(answers.response) | ||||
|                         # print(answers.qname) | ||||
|                         # print(answers.rdtype) | ||||
|                         # print(answers.rdclass) | ||||
|                         # print(answers.nameserver) | ||||
|                         # print() | ||||
| 
 | ||||
|                 except dns.resolver.NoNameservers: | ||||
|                     self.redis_logger.debug('NoNameserver, No non-broken nameservers are available to answer the query.') | ||||
|                     print('NoNameserver, No non-broken nameservers are available to answer the query.') | ||||
|                 except dns.resolver.NoAnswer: | ||||
|                     self.redis_logger.debug('NoAnswer, The response did not contain an answer to the question.') | ||||
|                     print('NoAnswer, The response did not contain an answer to the question.') | ||||
|                 except dns.name.EmptyLabel: | ||||
|                     self.redis_logger.debug('SyntaxError: EmptyLabel') | ||||
|                     print('SyntaxError: EmptyLabel') | ||||
|                 except dns.resolver.NXDOMAIN: | ||||
|                     #save_mxdomain_in_cache(mxdomain) | ||||
|                     self.redis_logger.debug('The query name does not exist.') | ||||
|                     print('The query name does not exist.') | ||||
|                 except dns.name.LabelTooLong: | ||||
|                     self.redis_logger.debug('The Label is too long') | ||||
|                     print('The Label is too long') | ||||
|                 except dns.exception.Timeout: | ||||
|                     print('dns timeout') | ||||
|                     #save_mxdomain_in_cache(mxdomain) | ||||
|                 except Exception as e: | ||||
|                     print(e) | ||||
|         return valid_mxdomain | ||||
| 
 | ||||
|     # # TODO: sanityze mails | ||||
|     def compute(self, message): | ||||
|         item_id, score = message.split() | ||||
|         item = Item(item_id) | ||||
|         item_date = item.get_date() | ||||
| 
 | ||||
|         mails = self.regex_findall(self.email_regex, item_id, item.get_content()) | ||||
|         mxdomains_email = {} | ||||
|         for mail in mails: | ||||
|             mxdomain = mail.rsplit('@', 1)[1].lower() | ||||
|             if mxdomain not in mxdomains_email: | ||||
|                 mxdomains_email[mxdomain] = set() | ||||
|             mxdomains_email[mxdomain].add(mail) | ||||
| 
 | ||||
|             ## TODO: add MAIL trackers | ||||
| 
 | ||||
|         valid_mx = self.check_mx_record(mxdomains_email.keys()) | ||||
|         print(f'valid_mx: {valid_mx}') | ||||
|         mx_tlds = {} | ||||
|         num_valid_email = 0 | ||||
|         for domain_mx in valid_mx: | ||||
|             nb_mails = len(mxdomains_email[domain_mx]) | ||||
|             num_valid_email += nb_mails | ||||
| 
 | ||||
|             # Create domain mail stats | ||||
|             msg = f'mail;{nb_mails};{domain_mx};{item_date}' | ||||
|             self.send_message_to_queue(msg, 'ModuleStats') | ||||
| 
 | ||||
|             # Create country stats | ||||
|             self.faup.decode(domain_mx) | ||||
|             tld = self.faup.get()['tld'] | ||||
|             try: | ||||
|                 tld = tld.decode() | ||||
|             except: | ||||
|                 pass | ||||
|             mx_tlds[tld] = mx_tlds.get(tld, 0) + nb_mails | ||||
|         for tld in mx_tlds: | ||||
|             Statistics.add_module_tld_stats_by_date('mail', item_date, tld, mx_tlds[tld]) | ||||
| 
 | ||||
|         msg = f'Mails;{item.get_source()};{item_date};{item.get_basename()};Checked {num_valid_email} e-mail(s);{item_id}' | ||||
|         if num_valid_email > self.mail_threshold: | ||||
|             print(f'{item_id}    Checked {num_valid_email} e-mail(s)') | ||||
|             self.redis_logger.warning(msg) | ||||
|             # Tags | ||||
|             tag_msg = f'infoleak:automatic-detection="mail";{item_id}' | ||||
|             self.send_message_to_queue(tag_msg, 'Tags') | ||||
|         else: | ||||
|             self.redis_logger.info(msg) | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| if __name__ == '__main__': | ||||
|     module = Mail() | ||||
|     #module.compute('tests/2021/01/01/mails.gz 50') | ||||
|     module.run() | ||||
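The rewritten Mail module groups addresses by MX domain before resolving, so each domain is queried (or served from cache) once no matter how many addresses share it. The grouping step in isolation, with made-up addresses:

```python
mails = ['alice@example.com', 'bob@example.com', 'carol@example.org']  # illustrative
mxdomains_email = {}
for mail in mails:
    mxdomain = mail.rsplit('@', 1)[1].lower()
    mxdomains_email.setdefault(mxdomain, set()).add(mail)
# {'example.com': {'alice@example.com', 'bob@example.com'},
#  'example.org': {'carol@example.org'}}
# check_mx_record() then resolves each key once and returns the responding domains
```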
|  | @ -0,0 +1,54 @@ | |||
| #!/usr/bin/env python3 | ||||
| # -*-coding:UTF-8 -* | ||||
| """ | ||||
|     This module makes statistics for some modules and providers | ||||
| 
 | ||||
| """ | ||||
| 
 | ||||
| ################################## | ||||
| # Import External packages       # | ||||
| ################################## | ||||
| import os | ||||
| import sys | ||||
| 
 | ||||
| sys.path.append(os.environ['AIL_BIN']) | ||||
| ################################## | ||||
| # Import Project packages        # | ||||
| ################################## | ||||
| from modules.abstract_module import AbstractModule | ||||
| from lib.objects.Items import Item | ||||
| from lib import Statistics | ||||
| 
 | ||||
| 
 | ||||
| class ModuleStats(AbstractModule): | ||||
|     """ | ||||
|     Module Statistics module for AIL framework | ||||
|     """ | ||||
| 
 | ||||
| 
 | ||||
|     def __init__(self): | ||||
| 
 | ||||
|         super(ModuleStats, self).__init__() | ||||
| 
 | ||||
|         # Waiting time in seconds between two processed messages | ||||
|         self.pending_seconds = 20 | ||||
| 
 | ||||
|     def compute(self, message): | ||||
| 
 | ||||
|         # MODULE STATS | ||||
|         if len(message.split(';')) > 1: | ||||
|             module_name, num, keyword, date = message.split(';') | ||||
|             Statistics.update_module_stats(module_name, num, keyword, date) | ||||
|         # ITEM STATS | ||||
|         else: | ||||
|             item_id = message | ||||
|             item = Item(item_id) | ||||
|             source = item.get_source() | ||||
|             date = item.get_date() | ||||
|             size = item.get_size() | ||||
|             Statistics.update_item_stats_size_nb(item_id, source, size, date) | ||||
| 
 | ||||
| 
 | ||||
| if __name__ == '__main__': | ||||
| 
 | ||||
|     module = ModuleStats() | ||||
|     module.run() | ||||
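ModuleStats is reduced to parsing its queue messages and delegating to `lib/Statistics`. The two accepted message shapes, with illustrative values (assumes a running AIL queue environment):

```python
stats = ModuleStats()

# Keyword stats: '<module>;<num>;<keyword>;<date>'
stats.compute('mail;3;example.com;20220101')

# Item stats: a bare item id (path illustrative)
stats.compute('submitted/2022/01/01/foo.gz')
```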
|  | @ -66,7 +66,7 @@ publish = Redis_CreditCards,Redis_Mail,Redis_Onion,Redis_Urls,Redis_Credential,R | |||
| 
 | ||||
| [CreditCards] | ||||
| subscribe = Redis_CreditCards | ||||
| publish = Redis_ModuleStats,Redis_Tags | ||||
| publish = Redis_Tags | ||||
| 
 | ||||
| [BankAccount] | ||||
| subscribe = Redis_Global | ||||
|  |  | |||
Terrtia