From 727bc5596691da467d9585e205a2a7d183c64506 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Thu, 30 Jan 2020 11:31:33 +0100
Subject: [PATCH 01/15] fix: [Tag core] check if item_date type is an integer

---
 bin/packages/Tag.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/bin/packages/Tag.py b/bin/packages/Tag.py
index b8394b34..03b9c990 100755
--- a/bin/packages/Tag.py
+++ b/bin/packages/Tag.py
@@ -392,6 +392,11 @@ def add_tag(object_type, tag, object_id, obj_date=None):
     # new tag
     if not is_obj_tagged(object_id, tag):
         # # TODO: # FIXME: sanityze object_type
+        if obj_date:
+            try:
+                obj_date = int(obj_date)
+            except:
+                obj_date = None
         if not obj_date:
             obj_date = get_obj_date(object_type, object_id)
         add_global_tag(tag, object_type=object_type)

From e19a3b3e630ed8cacd492e5c36ffa59c3cdfac78 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Mon, 3 Feb 2020 09:37:08 +0100
Subject: [PATCH 02/15] fix: [Domain explorer UI] fix daterange pagination
 links

---
 .../crawler_splash/domain_explorer.html |   3 +
 .../templates/domains/domain_explorer.html | 239 ------------------
 2 files changed, 3 insertions(+), 239 deletions(-)
 delete mode 100644 var/www/templates/domains/domain_explorer.html

diff --git a/var/www/templates/crawler/crawler_splash/domain_explorer.html b/var/www/templates/crawler/crawler_splash/domain_explorer.html
index 007da8f0..16595014 100644
--- a/var/www/templates/crawler/crawler_splash/domain_explorer.html
+++ b/var/www/templates/crawler/crawler_splash/domain_explorer.html
@@ -83,6 +83,9 @@
       {%else%}
         {% set target_url=url_for('crawler_splash.domains_explorer_web') + "?domain_type=regular" %}
       {%endif%}
+      {%if 'date_from' in dict_data %}
+        {% set target_url = target_url + '&date_from=' + dict_data['date_from'] + '&date_to=' + dict_data['date_to'] %}
+      {%endif%}
       {% include 'pagination.html' %}
     {% endwith %}
   {%endif%}

diff --git a/var/www/templates/domains/domain_explorer.html b/var/www/templates/domains/domain_explorer.html
deleted file mode 100644
index ec6211c7..00000000
--- a/var/www/templates/domains/domain_explorer.html
+++ /dev/null
@@ -1,239 +0,0 @@
-
-
-
-    Show Domain - AIL
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    {% include 'nav_bar.html' %}
-
-
-
-      {% include 'crawler/menu_sidebar.html' %}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        {% for dict_domain in dict_data['list_elem'] %}
-
-          {% if loop.index0 % 4 == 0 %}
-
-          {% endif %}
-
-
-
-
-
-
-
-                {{dict_domain["id"]}}
-
-
-
-
-                First seen: {{dict_domain["first_seen"]}}
-                Last_seen: {{dict_domain["first_seen"]}}
-                Ports: {{dict_domain["ports"]}}
-
-
-              Status:
-              {% if dict_domain["status"] %}
-
-                  UP
-
-              {% else %}
-
-                  DOWN
-
-              {% endif %}
-
-              {% for tag in dict_domain['tags'] %}
-
-                  {{ tag }}
-
-              {% endfor %}
-
-
-
-
-          {% if loop.index0 % 4 == 3 %}
-
-          {% endif %}
-
-        {% endfor %}
-
-        {% if dict_data['list_elem']|length % 4 != 0 %}
-
-        {% endif %}
-
-
-
-    {%if 'list_elem' in dict_data%}
-      {% with page=dict_data['page'], nb_page_max=dict_data['nb_pages'], nb_first_elem=dict_data['nb_first_elem'], nb_last_elem=dict_data['nb_last_elem'], nb_all_elem=dict_data['nb_all_elem'] %}
-        {% set object_name="domain" %}
-        {%if domain_type=='onion'%}
-          {% set target_url=url_for('crawler_splash.domains_explorer_onion') + "?domain_type=onion" %}
-        {%else%}
-          {% set target_url=url_for('crawler_splash.domains_explorer_web') + "?domain_type=regular" %}
-        {%endif%}
-        {% include 'pagination.html' %}
-      {% endwith %}
-    {%endif%}
-
-
-
-
-
-
-
-
-
-
-
-
-

From e808840f957c810b8e3944cba808716dc722581b Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Mon, 3 Feb 2020 10:32:20 +0100
Subject: [PATCH 03/15] fix: [Global: filename provided by all feeders] avoid
 path traversal

---
 bin/Global.py | 54 +++++++++++++++++++++++++++++----------------------
 1 file changed, 31 insertions(+), 23 deletions(-)

diff --git a/bin/Global.py b/bin/Global.py
index 00207c63..b0419bea 100755
--- a/bin/Global.py
+++ b/bin/Global.py
@@ -45,8 +45,10 @@ if __name__ == '__main__':

     p = Process(config_section)

+    # get and sanityze PASTE DIRECTORY
     PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes"))
     PASTES_FOLDERS = PASTES_FOLDER + '/'
+    PASTES_FOLDERS = os.path.join(os.path.realpath(PASTES_FOLDERS), '')

     # LOGGING #
     publisher.info("Feed Script started to receive & publish.")
@@ -75,6 +77,10 @@ if __name__ == '__main__':
                 time.sleep(1)
                 continue

+            # remove PASTES_FOLDER from item path (crawled item + submited)
+            if PASTES_FOLDERS in paste:
+                paste = paste.replace(PASTES_FOLDERS, '', 1)
+
             file_name_paste = paste.split('/')[-1]
             if len(file_name_paste)>255:
                 new_file_name_paste = '{}{}.gz'.format(file_name_paste[:215], str(uuid.uuid4()))
@@ -82,33 +88,35 @@ if __name__ == '__main__':

             # Creating the full filepath
             filename = os.path.join(PASTES_FOLDER, paste)
+            filename = os.path.realpath(filename)

-            dirname = os.path.dirname(filename)
-            if not os.path.exists(dirname):
-                os.makedirs(dirname)
+            # incorrect filename
+            if not os.path.commonprefix([filename, PASTES_FOLDER]) == PASTES_FOLDER:
+                print('Path traversal detected {}'.format(filename))
+                publisher.warning('Global; Path traversal detected')
+            else:
+                dirname = os.path.dirname(filename)
+                if not os.path.exists(dirname):
+                    os.makedirs(dirname)

-            decoded = base64.standard_b64decode(gzip64encoded)
+                decoded = base64.standard_b64decode(gzip64encoded)

-            with open(filename, 'wb') as f:
-                f.write(decoded)
-            '''try:
-                decoded2 = gunzip_bytes_obj(decoded)
-            except:
-                decoded2 =''
+                with open(filename, 'wb') as f:
+                    f.write(decoded)
+                '''try:
+                    decoded2 = gunzip_bytes_obj(decoded)
+                except:
+                    decoded2 =''

-            type = magic.from_buffer(decoded2, mime=True)
+                type = magic.from_buffer(decoded2, mime=True)

-            if type!= 'text/x-c++' and type!= 'text/html' and type!= 'text/x-c' and type!= 'text/x-python' and type!= 'text/x-php' and type!= 'application/xml' and type!= 'text/x-shellscript' and type!= 'text/plain' and type!= 'text/x-diff' and type!= 'text/x-ruby':
+                if type!= 'text/x-c++' and type!= 'text/html' and type!= 'text/x-c' and type!= 'text/x-python' and type!= 'text/x-php' and type!= 'application/xml' and type!= 'text/x-shellscript' and type!= 'text/plain' and type!= 'text/x-diff' and type!= 'text/x-ruby':

-                print('-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------')
-                print(filename)
-                print(type)
-                print('-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------')
-            '''
+                    print('-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------')
+                    print(filename)
+                    print(type)
+                    print('-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------')
+                '''

-    # remove PASTES_FOLDER from item path (crawled item + submited)
-    if PASTES_FOLDERS in paste:
-        paste = paste.replace(PASTES_FOLDERS, '', 1)
-
-    p.populate_set_out(paste)
-    processed_paste+=1
+            p.populate_set_out(paste)
+            processed_paste+=1

From 4d8db3fcc4757cad99ed3bbea057e5080abaffa8 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Mon, 3 Feb 2020 14:51:51 +0100
Subject: [PATCH 04/15] fix: [Global: already saved filename] save updated +
 filter duplicated items

---
 bin/Global.py | 71 ++++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 53 insertions(+), 18 deletions(-)

diff --git a/bin/Global.py b/bin/Global.py
index b0419bea..7ef3c78e 100755
--- a/bin/Global.py
+++ b/bin/Global.py
@@ -21,14 +21,24 @@ Requirements

 """
 import base64
+import hashlib
+import io
+import gzip
 import os
+import sys
 import time
 import uuid

 from pubsublogger import publisher

 from Helper import Process

-import magic
+def gunzip_bytes_obj(bytes_obj):
+    in_ = io.BytesIO()
+    in_.write(bytes_obj)
+    in_.seek(0)
+    with gzip.GzipFile(fileobj=in_, mode='rb') as fo:
+        gunzipped_bytes_obj = fo.read()
+    return gunzipped_bytes_obj

 def rreplace(s, old, new, occurrence):
     li = s.rsplit(old, occurrence)
@@ -67,9 +77,9 @@ if __name__ == '__main__':
                 publisher.debug("Empty Paste: {0} not processed".format(message))
                 continue
             else:
-                print("Empty Queues: Waiting...")
+                #print("Empty Queues: Waiting...")
                 if int(time.time() - time_1) > 30:
-                    to_print = 'Global; ; ; ;glob Processed {0} paste(s)'.format(processed_paste)
+                    to_print = 'Global; ; ; ;glob Processed {0} paste(s) in {1} s'.format(processed_paste, time.time() - time_1)
                     print(to_print)
                     #publisher.info(to_print)
                     time_1 = time.time()
@@ -95,28 +105,53 @@ if __name__ == '__main__':
                 print('Path traversal detected {}'.format(filename))
                 publisher.warning('Global; Path traversal detected')
             else:
+
+                # decode compressed base64
+                decoded = base64.standard_b64decode(gzip64encoded)
+
+                # check if file exist
+                if os.path.isfile(filename):
+                    print('File already exist {}'.format(filename))
+                    publisher.warning('Global; File already exist')
+
+                    with gzip.open(filename, 'rb') as f:
+                        curr_file_content = f.read()
+                    curr_file_md5 = hashlib.md5(curr_file_content).hexdigest()
+
+                    new_file_content = gunzip_bytes_obj(decoded)
+                    new_file_md5 = hashlib.md5(new_file_content).hexdigest()
+
+                    if new_file_md5 != curr_file_md5:
+
+                        if filename.endswith('.gz'):
+                            filename = '{}_{}.gz'.format(filename[:-3], new_file_md5)
+                        else:
+                            filename = '{}_{}'.format(filename, new_file_md5)
+
+                        # continue if new file already exist
+                        if os.path.isfile(filename):
+                            print('ignore duplicated file')
+                            continue
+
+                        print('new file: {}'.format(filename))
+                    # ignore duplicate
+                    else:
+                        print('ignore duplicated file')
+                        continue
+
+                # create subdir
                 dirname = os.path.dirname(filename)
                 if not os.path.exists(dirname):
                     os.makedirs(dirname)

-                decoded = base64.standard_b64decode(gzip64encoded)
-
                 with open(filename, 'wb') as f:
                     f.write(decoded)
-                '''try:
-                    decoded2 = gunzip_bytes_obj(decoded)
-                except:
-                    decoded2 =''
-
-                type = magic.from_buffer(decoded2, mime=True)
-
-                if type!= 'text/x-c++' and type!= 'text/html' and type!= 'text/x-c' and type!= 'text/x-python' and type!= 'text/x-php' and type!= 'application/xml' and type!= 'text/x-shellscript' and type!= 'text/plain' and type!= 'text/x-diff' and type!= 'text/x-ruby':
-
-                    print('-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------')
-                    print(filename)
-                    print(type)
-                    print('-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------')
-                '''
+                paste = filename
+                # remove PASTES_FOLDER from
+                if PASTES_FOLDERS in paste:
+                    paste = paste.replace(PASTES_FOLDERS, '', 1)

                 p.populate_set_out(paste)
                 processed_paste+=1

From 8770bf05d732afe3a48cb03cd2b463c73bd8c061 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Mon, 3 Feb 2020 15:29:37 +0100
Subject: [PATCH 05/15] fix: [IPAddress] catch empty config error

---
 bin/IPAddress.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/bin/IPAddress.py b/bin/IPAddress.py
index f03ee8b3..e45a4ce6 100755
--- a/bin/IPAddress.py
+++ b/bin/IPAddress.py
@@ -6,7 +6,7 @@ The IP Module

 This module is consuming the global channel.

-It first performs a regex to find IP addresses and then matches those IPs to 
+It first performs a regex to find IP addresses and then matches those IPs to
 some configured ip ranges.

 The list of IP ranges are expected to be in CIDR format (e.g. 192.168.0.0/16)
 and should be defined in the config.cfg file, under the [IP] section
@@ -16,6 +16,7 @@

 import time
 import re
+import sys
 from pubsublogger import publisher
 from packages import Paste
 from Helper import Process
@@ -60,8 +61,12 @@ if __name__ == '__main__':
     p = Process(config_section)

     ip_networks = []
-    for network in p.config.get("IP", "networks").split(","):
-        ip_networks.append(IPv4Network(network))
+    try:
+        for network in p.config.get("IP", "networks").split(","):
+            ip_networks.append(IPv4Network(network))
+    except:
+        print('Please provide a list of valid IP addresses')
+        sys.exit(0)

     # Sent to the logging a description of the module
@@ -78,4 +83,3 @@ if __name__ == '__main__':

         # Do something with the message from the queue
         search_ip(message)
-

From f422be917af56f757b3f6df138ab08eecf103bc5 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Tue, 4 Feb 2020 09:29:53 +0100
Subject: [PATCH 06/15] fix: [domain explorer web] fix pagination by daterange

---
 var/www/blueprints/crawler_splash.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/var/www/blueprints/crawler_splash.py b/var/www/blueprints/crawler_splash.py
index ee2e68ba..696b2d6e 100644
--- a/var/www/blueprints/crawler_splash.py
+++ b/var/www/blueprints/crawler_splash.py
@@ -154,5 +154,5 @@ def domains_explorer_web():
         except:
             page = 1

-    dict_data = Domain.domains_up_by_page('regular', page=page)
+    dict_data = Domain.domains_up_by_page('regular', page=page, date_from=date_from, date_to=date_to)
     return render_template("domain_explorer.html", dict_data=dict_data, bootstrap_label=bootstrap_label, domain_type='regular')

From 6b6c6a4d9e66bbc01db76ecc1261be88bd414aa7 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Tue, 4 Feb 2020 09:34:05 +0100
Subject: [PATCH 07/15] fix: [domain explorer web] fix pagination by daterange

---
 var/www/blueprints/crawler_splash.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/var/www/blueprints/crawler_splash.py b/var/www/blueprints/crawler_splash.py
index 696b2d6e..7acecfbe 100644
--- a/var/www/blueprints/crawler_splash.py
+++ b/var/www/blueprints/crawler_splash.py
@@ -154,5 +154,5 @@ def domains_explorer_web():
         except:
             page = 1

-    dict_data = Domain.domains_up_by_page('regular', page=page, date_from=date_from, date_to=date_to)
+    dict_data = Domain.get_domains_up_by_filers('regular', page=page, date_from=date_from, date_to=date_to)
     return render_template("domain_explorer.html", dict_data=dict_data, bootstrap_label=bootstrap_label, domain_type='regular')

From 7888f5490232a937c8f03d467c640e72325307cd Mon Sep 17 00:00:00 2001
From: Mike Peters
Date: Thu, 6 Feb 2020 09:18:54 +0100
Subject: [PATCH 08/15] Added support for username as notification credentials

---
 bin/NotificationHelper.py | 6 +++++-
 configs/core.cfg.sample   | 3 +++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/bin/NotificationHelper.py b/bin/NotificationHelper.py
index 02568a1e..b9c90104 100755
--- a/bin/NotificationHelper.py
+++ b/bin/NotificationHelper.py
@@ -26,6 +26,7 @@ publisher.channel = "Script"
 def sendEmailNotification(recipient, alert_name, content):

     sender = config_loader.get_config_str("Notifications", "sender")
+    sender_user = config_loader.get_config_str("Notifications", "sender_user")
     sender_host = config_loader.get_config_str("Notifications", "sender_host")
     sender_port = config_loader.get_config_int("Notifications", "sender_port")
     sender_pw = config_loader.get_config_str("Notifications", "sender_pw")
@@ -49,7 +50,10 @@ def sendEmailNotification(recipient, alert_name, content):
             smtp_server = smtplib.SMTP_SSL(sender_host, sender_port)

             smtp_server.ehlo()
-            smtp_server.login(sender, sender_pw)
+            if sender_user is not None:
+                smtp_server.login(sender_user, sender_pw)
+            else:
+                smtp_server.login(sender, sender_pw)
         else:
             smtp_server = smtplib.SMTP(sender_host, sender_port)

diff --git a/configs/core.cfg.sample b/configs/core.cfg.sample
index 632e1d07..f5d88d4e 100644
--- a/configs/core.cfg.sample
+++ b/configs/core.cfg.sample
@@ -28,6 +28,9 @@ sender = sender@example.com
 sender_host = smtp.example.com
 sender_port = 1337
 sender_pw = None
+# Only needed when the credentials for email server needs a username instead of an email address
+#sender_user = sender
+sender_user =

 # optional for using with authenticated SMTP over SSL
 # sender_pw = securepassword

From fc58940ed67f8b8b6c55f1b7ccc44c9da2d7b561 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Thu, 6 Feb 2020 09:41:43 +0100
Subject: [PATCH 09/15] fix: [core Global] catch and log incomplete files

---
 bin/Global.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/bin/Global.py b/bin/Global.py
index 7ef3c78e..9a6b37e2 100755
--- a/bin/Global.py
+++ b/bin/Global.py
@@ -114,9 +114,14 @@ if __name__ == '__main__':
                     print('File already exist {}'.format(filename))
                     publisher.warning('Global; File already exist')

+                    try:
+                        with gzip.open(filename, 'rb') as f:
+                            curr_file_content = f.read()
+                    except EOFError:
+                        publisher.warning('Global; Incomplete file: {}'.format(filename))
+                        # discard item
+                        continue

-                    with gzip.open(filename, 'rb') as f:
-                        curr_file_content = f.read()
                     curr_file_md5 = hashlib.md5(curr_file_content).hexdigest()

                     new_file_content = gunzip_bytes_obj(decoded)

From 62ce4646e5b605893eec81df22c902e7d1269073 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Fri, 7 Feb 2020 10:12:38 +0100
Subject: [PATCH 10/15] fix: [search items tag] fix daterange

---
 bin/packages/Tag.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/packages/Tag.py b/bin/packages/Tag.py
index 03b9c990..bc5a290d 100755
--- a/bin/packages/Tag.py
+++ b/bin/packages/Tag.py
@@ -451,7 +451,7 @@ def delete_obj_tags(object_id, object_type, tags=[]):
     return res

 def sanitise_tags_date_range(l_tags, date_from=None, date_to=None):
-    if date_from or date_to is None:
+    if date_from is None or date_to is None:
         date_from = get_tags_min_last_seen(l_tags, r_int=False)
         date_to = date_from
     return Date.sanitise_date_range(date_from, date_to)

From d8fbd72863f56f33de5f70bc450e115861623866 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Fri, 7 Feb 2020 10:53:45 +0100
Subject: [PATCH 11/15] fix: [Global catch incomplete file error] fix #464

---
 bin/Global.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/bin/Global.py b/bin/Global.py
index 9a6b37e2..3bf36215 100755
--- a/bin/Global.py
+++ b/bin/Global.py
@@ -28,10 +28,21 @@ import os
 import sys
 import time
 import uuid
+
+import datetime
+import redis
+
+sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
+import ConfigLoader

 from pubsublogger import publisher

 from Helper import Process

+config_loader = ConfigLoader.ConfigLoader()
+r_stats = config_loader.get_redis_conn("ARDB_Statistics")
+config_loader = None
+
 def gunzip_bytes_obj(bytes_obj):
     in_ = io.BytesIO()
     in_.write(bytes_obj)
@@ -119,6 +130,14 @@ if __name__ == '__main__':
                             curr_file_content = f.read()
                     except EOFError:
                         publisher.warning('Global; Incomplete file: {}'.format(filename))
+                        # save daily stats
+                        r_stats.zincrby('module:Global:incomplete_file', datetime.datetime.now().strftime('%Y%m%d'), 1)
                         # discard item
                         continue
+                    except OSError:
+                        publisher.warning('Global; Not a gzipped file: {}'.format(filename))
+                        # save daily stats
+                        r_stats.zincrby('module:Global:invalid_file', datetime.datetime.now().strftime('%Y%m%d'), 1)
+                        # discard item
+                        continue

                     curr_file_md5 = hashlib.md5(curr_file_content).hexdigest()

From f9856a1589cb49fc6e407587d7609bf4a3e6af23 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Mon, 10 Feb 2020 10:31:53 +0100
Subject: [PATCH 12/15] fix: [module Webstats + BankAccount-Decoder] fix faup
 return type + remove old Paste library from BankAccount-Decoder #465

---
 bin/BankAccount.py   | 24 ++++++++++++------------
 bin/Decoder.py       | 38 ++++++++++++++++++--------------------
 bin/WebStats.py      |  5 ++++-
 bin/packages/Item.py |  3 +++
 4 files changed, 37 insertions(+), 33 deletions(-)

diff --git a/bin/BankAccount.py b/bin/BankAccount.py
index 16a8a11f..d3cb4c16 100755
--- a/bin/BankAccount.py
+++ b/bin/BankAccount.py
@@ -5,7 +5,7 @@
 The BankAccount Module
 ======================

-It apply IBAN regexes on paste content and warn if above a threshold.
+It apply IBAN regexes on item content and warn if above a threshold.
""" @@ -17,7 +17,7 @@ import re import string from itertools import chain -from packages import Paste +from packages import Item from pubsublogger import publisher from Helper import Process @@ -49,7 +49,7 @@ def is_valid_iban(iban): return True return False -def check_all_iban(l_iban, paste, filename): +def check_all_iban(l_iban, obj_id): nb_valid_iban = 0 for iban in l_iban: iban = iban[0]+iban[1]+iban[2] @@ -65,14 +65,14 @@ def check_all_iban(l_iban, paste, filename): server_statistics.hincrby('iban_by_country:'+date, iban[0:2], 1) if(nb_valid_iban > 0): - to_print = 'Iban;{};{};{};'.format(paste.p_source, paste.p_date, paste.p_name) + to_print = 'Iban;{};{};{};'.format(Item.get_source(obj_id), Item.get_item_date(obj_id), Item.get_basename(obj_id)) publisher.warning('{}Checked found {} IBAN;{}'.format( - to_print, nb_valid_iban, paste.p_rel_path)) - msg = 'infoleak:automatic-detection="iban";{}'.format(filename) + to_print, nb_valid_iban, obj_id)) + msg = 'infoleak:automatic-detection="iban";{}'.format(obj_id) p.populate_set_out(msg, 'Tags') #Send to duplicate - p.populate_set_out(filename, 'Duplicate') + p.populate_set_out(obj_id, 'Duplicate') if __name__ == "__main__": publisher.port = 6380 @@ -103,21 +103,21 @@ if __name__ == "__main__": if message is not None: - filename = message - paste = Paste.Paste(filename) - content = paste.get_p_content() + obj_id = Item.get_item_id(message) + + content = Item.get_item_content(obj_id) signal.alarm(max_execution_time) try: l_iban = iban_regex.findall(content) except TimeoutException: - print ("{0} processing timeout".format(paste.p_rel_path)) + print ("{0} processing timeout".format(obj_id)) continue else: signal.alarm(0) if(len(l_iban) > 0): - check_all_iban(l_iban, paste, filename) + check_all_iban(l_iban, obj_id) else: publisher.debug("Script BankAccount is Idling 10s") diff --git a/bin/Decoder.py b/bin/Decoder.py index 82133de7..9ea3adcc 100755 --- a/bin/Decoder.py +++ b/bin/Decoder.py @@ -17,7 +17,6 @@ import datetime from pubsublogger import publisher from Helper import Process -from packages import Paste from packages import Item import re @@ -50,11 +49,11 @@ def decode_string(content, message, date, encoded_list, decoder_name, encoded_mi save_hash(decoder_name, message, date, decode) - #remove encoded from paste content + #remove encoded from item content content = content.replace(encoded, '', 1) if(find): - set_out_paste(decoder_name, message) + set_out_item(decoder_name, message) return content @@ -72,8 +71,8 @@ def save_hash(decoder_name, message, date, decoded): data['estimated type'] = type json_data = json.dumps(data) - date_paste = '{}/{}/{}'.format(date[0:4], date[4:6], date[6:8]) - date_key = date[0:4] + date[4:6] + date[6:8] + date_item = '{}/{}/{}'.format(date[0:4], date[4:6], date[6:8]) + date_key = date serv_metadata.incrby(decoder_name+'_decoded:'+date_key, 1) serv_metadata.zincrby('hash_date:'+date_key, hash, 1) @@ -81,24 +80,24 @@ def save_hash(decoder_name, message, date, decoded): # first time we see this hash if not serv_metadata.hexists('metadata_hash:'+hash, 'estimated_type'): - serv_metadata.hset('metadata_hash:'+hash, 'first_seen', date_paste) - serv_metadata.hset('metadata_hash:'+hash, 'last_seen', date_paste) + serv_metadata.hset('metadata_hash:'+hash, 'first_seen', date_item) + serv_metadata.hset('metadata_hash:'+hash, 'last_seen', date_item) else: - serv_metadata.hset('metadata_hash:'+hash, 'last_seen', date_paste) + serv_metadata.hset('metadata_hash:'+hash, 'last_seen', date_item) - # first time we see this 
+    # first time we see this hash (all encoding) on this item
     if serv_metadata.zscore('nb_seen_hash:'+hash, message) is None:
         serv_metadata.hincrby('metadata_hash:'+hash, 'nb_seen_in_all_pastes', 1)
-        serv_metadata.sadd('hash_paste:'+message, hash) # paste - hash map
+        serv_metadata.sadd('hash_paste:'+message, hash) # item - hash map

         # create hash metadata
         serv_metadata.hset('metadata_hash:'+hash, 'estimated_type', type)
         serv_metadata.sadd('hash_all_type', type)

-    # first time we see this hash encoding on this paste
+    # first time we see this hash encoding on this item
     if serv_metadata.zscore(decoder_name+'_hash:'+hash, message) is None:
         print('first '+decoder_name)

-        serv_metadata.sadd(decoder_name+'_paste:'+message, hash) # paste - hash map
+        serv_metadata.sadd(decoder_name+'_paste:'+message, hash) # item - hash map

         # create hash metadata
         serv_metadata.sadd('hash_'+ decoder_name +'_all_type', type)
@@ -118,8 +117,8 @@ def save_hash(decoder_name, message, date, decoded):

     serv_metadata.zincrby(decoder_name+'_type:'+type, date_key, 1)

-    serv_metadata.zincrby('nb_seen_hash:'+hash, message, 1)# hash - paste map
-    serv_metadata.zincrby(decoder_name+'_hash:'+hash, message, 1) # number of b64 on this paste
+    serv_metadata.zincrby('nb_seen_hash:'+hash, message, 1)# hash - item map
+    serv_metadata.zincrby(decoder_name+'_hash:'+hash, message, 1) # number of b64 on this item

     # Domain Object
     if Item.is_crawled(message):
@@ -150,7 +149,7 @@ def save_hash_on_disk(decode, type, hash, json_data):
     with open(filename_json, 'w') as f:
         f.write(json_data)

-def set_out_paste(decoder_name, message):
+def set_out_item(decoder_name, message):
     publisher.warning(decoder_name+' decoded')
     #Send to duplicate
     p.populate_set_out(message, 'Duplicate')
@@ -217,12 +216,11 @@ if __name__ == '__main__':
             time.sleep(1)
             continue

-        filename = message
-        paste = Paste.Paste(filename)
+        obj_id = Item.get_item_id(message)

         # Do something with the message from the queue
-        content = paste.get_p_content()
-        date = str(paste._get_p_date())
+        content = Item.get_item_content(obj_id)
+        date = Item.get_item_date(obj_id)

         for decoder in decoder_order: # add threshold and size limit
@@ -233,7 +231,7 @@ if __name__ == '__main__':
             except TimeoutException:
                 encoded_list = []
                 p.incr_module_timeout_statistic() # add encoder type
-                print ("{0} processing timeout".format(paste.p_rel_path))
+                print ("{0} processing timeout".format(obj_id))
                 continue
             else:
                 signal.alarm(0)

diff --git a/bin/WebStats.py b/bin/WebStats.py
index 7eecb0d2..10aba917 100755
--- a/bin/WebStats.py
+++ b/bin/WebStats.py
@@ -29,7 +29,10 @@ num_day_to_look = 5 # the detection of the progression start num_day_to_lo
 def analyse(server, field_name, date, url_parsed):
     field = url_parsed[field_name]
     if field is not None:
-        field = field.decode('utf8')
+        try: # faup version
+            field = field.decode()
+        except:
+            pass
         server.hincrby(field, date, 1)
         if field_name == "domain": #save domain in a set for the monthly plot
             domain_set_name = "domain_set_" + date[0:6]

diff --git a/bin/packages/Item.py b/bin/packages/Item.py
index b1722209..e9fcd18b 100755
--- a/bin/packages/Item.py
+++ b/bin/packages/Item.py
@@ -32,6 +32,9 @@ def exist_item(item_id):
     else:
         return False

+def get_basename(item_id):
+    return os.path.basename(item_id)
+
 def get_item_id(full_path):
     return full_path.replace(PASTES_FOLDER, '', 1)

From cf24c59e1d390db39b305b1aec7a17d3b0551b0f Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Mon, 10 Feb 2020 10:44:06 +0100
Subject: [PATCH 13/15] fix: [IPAddress] remove leading zeros #465

---
 bin/IPAddress.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/bin/IPAddress.py b/bin/IPAddress.py
index e45a4ce6..e2ed59cf 100755
--- a/bin/IPAddress.py
+++ b/bin/IPAddress.py
@@ -32,8 +32,9 @@ def search_ip(message):
     results = reg_ip.findall(content)

     matching_ips = []
-    for res in results:
-        address = IPv4Address(res)
+    for ip in results:
+        ip = '.'.join([str(int(x)) for x in ip.split('.')])
+        address = IPv4Address(ip)
         for network in ip_networks:
             if address in network:
                 matching_ips.append(address)

From 4097d95237a81e3c506743397eea4880622a1ccf Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Mon, 10 Feb 2020 11:04:24 +0100
Subject: [PATCH 14/15] fix: [SentimentAnalysis] download nltk punkt

---
 bin/SentimentAnalysis.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/bin/SentimentAnalysis.py b/bin/SentimentAnalysis.py
index fc9a2f4c..eb27d408 100755
--- a/bin/SentimentAnalysis.py
+++ b/bin/SentimentAnalysis.py
@@ -30,7 +30,7 @@ sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
 import ConfigLoader

 from nltk.sentiment.vader import SentimentIntensityAnalyzer
-from nltk import tokenize
+from nltk import tokenize, download

 # Config Variables
 accepted_Mime_type = ['text/plain']
@@ -62,7 +62,12 @@ def Analyse(message, server):
     combined_datetime = datetime.datetime.combine(the_date, the_time)
     timestamp = calendar.timegm(combined_datetime.timetuple())

-    sentences = tokenize.sent_tokenize(p_content)
+    try:
+        sentences = tokenize.sent_tokenize(p_content)
+    except Exception as e:
+        # use the NLTK Downloader to obtain the resource
+        download('punkt')
+        sentences = tokenize.sent_tokenize(p_content)

     if len(sentences) > 0:
         avg_score = {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compoundPos': 0.0, 'compoundNeg': 0.0}

From f950e700bd26900988de32a576655d725a7ff698 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Mon, 10 Feb 2020 11:06:39 +0100
Subject: [PATCH 15/15] chg: [SentimentAnalysis] clean

---
 bin/SentimentAnalysis.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/SentimentAnalysis.py b/bin/SentimentAnalysis.py
index eb27d408..a90a3a09 100755
--- a/bin/SentimentAnalysis.py
+++ b/bin/SentimentAnalysis.py
@@ -64,7 +64,7 @@ def Analyse(message, server):

     try:
         sentences = tokenize.sent_tokenize(p_content)
-    except Exception as e:
+    except:
         # use the NLTK Downloader to obtain the resource
         download('punkt')
         sentences = tokenize.sent_tokenize(p_content)
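
A note on the Global.py changes (PATCH 03, 04, 09 and 11): the module now resolves
every candidate path with os.path.realpath() and only writes an item when the
resolved path is still prefixed by the pastes root, and a name collision is settled
by suffixing the new file with the md5 of its decompressed content. A minimal
standalone sketch of both ideas, with a hypothetical root directory chosen here for
illustration (AIL reads the real root from the [Directories] pastes entry of its
config):

    import hashlib
    import os

    # hypothetical root; realpath() also normalizes the root itself
    PASTES_FOLDER = os.path.join(os.path.realpath('PASTES'), '')

    def resolve_inside_root(item_path):
        # realpath() collapses '..' components and symlinks, so a crafted
        # path such as '../../etc/passwd' resolves outside the root
        filename = os.path.realpath(os.path.join(PASTES_FOLDER, item_path))
        if os.path.commonprefix([filename, PASTES_FOLDER]) != PASTES_FOLDER:
            return None  # path traversal detected
        return filename

    def deduplicated_name(filename, new_md5):
        # keep the '.gz' suffix when a colliding item has different content
        if filename.endswith('.gz'):
            return '{}_{}.gz'.format(filename[:-3], new_md5)
        return '{}_{}'.format(filename, new_md5)

    print(resolve_inside_root('2020/02/03/item.gz'))  # stays inside the root
    print(resolve_inside_root('../../etc/passwd'))    # traversal -> None
    print(deduplicated_name('item.gz', hashlib.md5(b'new content').hexdigest()))

This sketch keeps the trailing separator on the root before the commonprefix
comparison, which is slightly stricter than the patched module: it also rejects a
sibling directory whose name merely starts with the root's name.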
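
A note on PATCH 13: the stdlib ipaddress module refuses dotted quads whose octets
carry leading zeros (IPv4Address('010.0.0.1') raises a ValueError on the Python 3
versions current at the time of this series), which is why each octet captured by
the regex is normalized with int() before the address object is built. A small
sketch of the same normalization, using an example network in place of the ranges
normally read from the [IP] section of config.cfg:

    from ipaddress import IPv4Address, IPv4Network

    networks = [IPv4Network('192.168.0.0/16')]  # example range

    def normalize_ipv4(raw):
        # '192.168.001.010' -> '192.168.1.10'
        return '.'.join(str(int(octet)) for octet in raw.split('.'))

    for raw in ('192.168.001.010', '10.0.0.1'):
        address = IPv4Address(normalize_ipv4(raw))
        print(address, any(address in net for net in networks))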