diff --git a/bin/BankAccount.py b/bin/BankAccount.py index 16a8a11f..d3cb4c16 100755 --- a/bin/BankAccount.py +++ b/bin/BankAccount.py @@ -5,7 +5,7 @@ The BankAccount Module ====================== -It apply IBAN regexes on paste content and warn if above a threshold. +It apply IBAN regexes on item content and warn if above a threshold. """ @@ -17,7 +17,7 @@ import re import string from itertools import chain -from packages import Paste +from packages import Item from pubsublogger import publisher from Helper import Process @@ -49,7 +49,7 @@ def is_valid_iban(iban): return True return False -def check_all_iban(l_iban, paste, filename): +def check_all_iban(l_iban, obj_id): nb_valid_iban = 0 for iban in l_iban: iban = iban[0]+iban[1]+iban[2] @@ -65,14 +65,14 @@ def check_all_iban(l_iban, paste, filename): server_statistics.hincrby('iban_by_country:'+date, iban[0:2], 1) if(nb_valid_iban > 0): - to_print = 'Iban;{};{};{};'.format(paste.p_source, paste.p_date, paste.p_name) + to_print = 'Iban;{};{};{};'.format(Item.get_source(obj_id), Item.get_item_date(obj_id), Item.get_basename(obj_id)) publisher.warning('{}Checked found {} IBAN;{}'.format( - to_print, nb_valid_iban, paste.p_rel_path)) - msg = 'infoleak:automatic-detection="iban";{}'.format(filename) + to_print, nb_valid_iban, obj_id)) + msg = 'infoleak:automatic-detection="iban";{}'.format(obj_id) p.populate_set_out(msg, 'Tags') #Send to duplicate - p.populate_set_out(filename, 'Duplicate') + p.populate_set_out(obj_id, 'Duplicate') if __name__ == "__main__": publisher.port = 6380 @@ -103,21 +103,21 @@ if __name__ == "__main__": if message is not None: - filename = message - paste = Paste.Paste(filename) - content = paste.get_p_content() + obj_id = Item.get_item_id(message) + + content = Item.get_item_content(obj_id) signal.alarm(max_execution_time) try: l_iban = iban_regex.findall(content) except TimeoutException: - print ("{0} processing timeout".format(paste.p_rel_path)) + print ("{0} processing timeout".format(obj_id)) continue else: signal.alarm(0) if(len(l_iban) > 0): - check_all_iban(l_iban, paste, filename) + check_all_iban(l_iban, obj_id) else: publisher.debug("Script BankAccount is Idling 10s") diff --git a/bin/Decoder.py b/bin/Decoder.py index 82133de7..9ea3adcc 100755 --- a/bin/Decoder.py +++ b/bin/Decoder.py @@ -17,7 +17,6 @@ import datetime from pubsublogger import publisher from Helper import Process -from packages import Paste from packages import Item import re @@ -50,11 +49,11 @@ def decode_string(content, message, date, encoded_list, decoder_name, encoded_mi save_hash(decoder_name, message, date, decode) - #remove encoded from paste content + #remove encoded from item content content = content.replace(encoded, '', 1) if(find): - set_out_paste(decoder_name, message) + set_out_item(decoder_name, message) return content @@ -72,8 +71,8 @@ def save_hash(decoder_name, message, date, decoded): data['estimated type'] = type json_data = json.dumps(data) - date_paste = '{}/{}/{}'.format(date[0:4], date[4:6], date[6:8]) - date_key = date[0:4] + date[4:6] + date[6:8] + date_item = '{}/{}/{}'.format(date[0:4], date[4:6], date[6:8]) + date_key = date serv_metadata.incrby(decoder_name+'_decoded:'+date_key, 1) serv_metadata.zincrby('hash_date:'+date_key, hash, 1) @@ -81,24 +80,24 @@ def save_hash(decoder_name, message, date, decoded): # first time we see this hash if not serv_metadata.hexists('metadata_hash:'+hash, 'estimated_type'): - serv_metadata.hset('metadata_hash:'+hash, 'first_seen', date_paste) - serv_metadata.hset('metadata_hash:'+hash, 'last_seen', date_paste) + serv_metadata.hset('metadata_hash:'+hash, 'first_seen', date_item) + serv_metadata.hset('metadata_hash:'+hash, 'last_seen', date_item) else: - serv_metadata.hset('metadata_hash:'+hash, 'last_seen', date_paste) + serv_metadata.hset('metadata_hash:'+hash, 'last_seen', date_item) - # first time we see this hash (all encoding) on this paste + # first time we see this hash (all encoding) on this item if serv_metadata.zscore('nb_seen_hash:'+hash, message) is None: serv_metadata.hincrby('metadata_hash:'+hash, 'nb_seen_in_all_pastes', 1) - serv_metadata.sadd('hash_paste:'+message, hash) # paste - hash map + serv_metadata.sadd('hash_paste:'+message, hash) # item - hash map # create hash metadata serv_metadata.hset('metadata_hash:'+hash, 'estimated_type', type) serv_metadata.sadd('hash_all_type', type) - # first time we see this hash encoding on this paste + # first time we see this hash encoding on this item if serv_metadata.zscore(decoder_name+'_hash:'+hash, message) is None: print('first '+decoder_name) - serv_metadata.sadd(decoder_name+'_paste:'+message, hash) # paste - hash map + serv_metadata.sadd(decoder_name+'_paste:'+message, hash) # item - hash map # create hash metadata serv_metadata.sadd('hash_'+ decoder_name +'_all_type', type) @@ -118,8 +117,8 @@ def save_hash(decoder_name, message, date, decoded): serv_metadata.zincrby(decoder_name+'_type:'+type, date_key, 1) - serv_metadata.zincrby('nb_seen_hash:'+hash, message, 1)# hash - paste map - serv_metadata.zincrby(decoder_name+'_hash:'+hash, message, 1) # number of b64 on this paste + serv_metadata.zincrby('nb_seen_hash:'+hash, message, 1)# hash - item map + serv_metadata.zincrby(decoder_name+'_hash:'+hash, message, 1) # number of b64 on this item # Domain Object if Item.is_crawled(message): @@ -150,7 +149,7 @@ def save_hash_on_disk(decode, type, hash, json_data): with open(filename_json, 'w') as f: f.write(json_data) -def set_out_paste(decoder_name, message): +def set_out_item(decoder_name, message): publisher.warning(decoder_name+' decoded') #Send to duplicate p.populate_set_out(message, 'Duplicate') @@ -217,12 +216,11 @@ if __name__ == '__main__': time.sleep(1) continue - filename = message - paste = Paste.Paste(filename) + obj_id = Item.get_item_id(message) # Do something with the message from the queue - content = paste.get_p_content() - date = str(paste._get_p_date()) + content = Item.get_item_content(obj_id) + date = Item.get_item_date(obj_id) for decoder in decoder_order: # add threshold and size limit @@ -233,7 +231,7 @@ if __name__ == '__main__': except TimeoutException: encoded_list = [] p.incr_module_timeout_statistic() # add encoder type - print ("{0} processing timeout".format(paste.p_rel_path)) + print ("{0} processing timeout".format(obj_id)) continue else: signal.alarm(0) diff --git a/bin/WebStats.py b/bin/WebStats.py index 7eecb0d2..10aba917 100755 --- a/bin/WebStats.py +++ b/bin/WebStats.py @@ -29,7 +29,10 @@ num_day_to_look = 5 # the detection of the progression start num_day_to_lo def analyse(server, field_name, date, url_parsed): field = url_parsed[field_name] if field is not None: - field = field.decode('utf8') + try: # faup version + field = field.decode() + except: + pass server.hincrby(field, date, 1) if field_name == "domain": #save domain in a set for the monthly plot domain_set_name = "domain_set_" + date[0:6] diff --git a/bin/packages/Item.py b/bin/packages/Item.py index b1722209..e9fcd18b 100755 --- a/bin/packages/Item.py +++ b/bin/packages/Item.py @@ -32,6 +32,9 @@ def exist_item(item_id): else: return False +def get_basename(item_id): + return os.path.basename(item_id) + def get_item_id(full_path): return full_path.replace(PASTES_FOLDER, '', 1)