fix: [module Webstats + BankAccount-Decoder] fix faup return type + remove old Paste library from BankAccount-Decoder #465

2020-02-10 10:31:53 +01:00 · 2020-02-10 10:31:53 +01:00 · f9856a1589
parent d8fbd72863
commit f9856a1589
4 changed files with 37 additions and 33 deletions
--- a/bin/BankAccount.py
+++ b/bin/BankAccount.py
@ -5,7 +5,7 @@
 The BankAccount Module
 ======================

-It apply IBAN regexes on paste content and warn if above a threshold.
+It applies IBAN regexes to item content and warns if the number of matches is above a threshold.

 """

@ -17,7 +17,7 @@ import re
 import string
 from itertools import chain

-from packages import Paste
+from packages import Item
 from pubsublogger import publisher

 from Helper import Process
@ -49,7 +49,7 @@ def is_valid_iban(iban):
        return True
    return False

-def check_all_iban(l_iban, paste, filename):
+def check_all_iban(l_iban, obj_id):
    nb_valid_iban = 0
    for iban in l_iban:
        iban = iban[0]+iban[1]+iban[2]
@ -65,14 +65,14 @@ def check_all_iban(l_iban, paste, filename):
                server_statistics.hincrby('iban_by_country:'+date, iban[0:2], 1)

    if(nb_valid_iban > 0):
-        to_print = 'Iban;{};{};{};'.format(paste.p_source, paste.p_date, paste.p_name)
+        to_print = 'Iban;{};{};{};'.format(Item.get_source(obj_id), Item.get_item_date(obj_id), Item.get_basename(obj_id))
        publisher.warning('{}Checked found {} IBAN;{}'.format(
-            to_print, nb_valid_iban, paste.p_rel_path))
-        msg = 'infoleak:automatic-detection="iban";{}'.format(filename)
+            to_print, nb_valid_iban, obj_id))
+        msg = 'infoleak:automatic-detection="iban";{}'.format(obj_id)
        p.populate_set_out(msg, 'Tags')

        #Send to duplicate
-        p.populate_set_out(filename, 'Duplicate')
+        p.populate_set_out(obj_id, 'Duplicate')

 if __name__ == "__main__":
    publisher.port = 6380
@ -103,21 +103,21 @@ if __name__ == "__main__":

        if message is not None:

-            filename = message
-            paste = Paste.Paste(filename)
-            content = paste.get_p_content()
+            obj_id = Item.get_item_id(message)
+
+            content = Item.get_item_content(obj_id)

            signal.alarm(max_execution_time)
            try:
                l_iban = iban_regex.findall(content)
            except TimeoutException:
-                 print ("{0} processing timeout".format(paste.p_rel_path))
+                 print ("{0} processing timeout".format(obj_id))
                 continue
            else:
                signal.alarm(0)

            if(len(l_iban) > 0):
-                check_all_iban(l_iban, paste, filename)
+                check_all_iban(l_iban, obj_id)

        else:
            publisher.debug("Script BankAccount is Idling 10s")
--- a/bin/Decoder.py
+++ b/bin/Decoder.py
@ -17,7 +17,6 @@ import datetime
 from pubsublogger import publisher

 from Helper import Process
-from packages import Paste
 from packages import Item

 import re
@ -50,11 +49,11 @@ def decode_string(content, message, date, encoded_list, decoder_name, encoded_mi

            save_hash(decoder_name, message, date, decode)

-            #remove encoded from paste content
+            #remove encoded from item content
            content = content.replace(encoded, '', 1)

    if(find):
-        set_out_paste(decoder_name, message)
+        set_out_item(decoder_name, message)

    return content

@ -72,8 +71,8 @@ def save_hash(decoder_name, message, date, decoded):
    data['estimated type'] = type
    json_data = json.dumps(data)

-    date_paste = '{}/{}/{}'.format(date[0:4], date[4:6], date[6:8])
-    date_key = date[0:4] + date[4:6] + date[6:8]
+    date_item = '{}/{}/{}'.format(date[0:4], date[4:6], date[6:8])
+    date_key = date

    serv_metadata.incrby(decoder_name+'_decoded:'+date_key, 1)
    serv_metadata.zincrby('hash_date:'+date_key, hash, 1)
@ -81,24 +80,24 @@ def save_hash(decoder_name, message, date, decoded):

    # first time we see this hash
    if not serv_metadata.hexists('metadata_hash:'+hash, 'estimated_type'):
-        serv_metadata.hset('metadata_hash:'+hash, 'first_seen', date_paste)
-        serv_metadata.hset('metadata_hash:'+hash, 'last_seen', date_paste)
+        serv_metadata.hset('metadata_hash:'+hash, 'first_seen', date_item)
+        serv_metadata.hset('metadata_hash:'+hash, 'last_seen', date_item)
    else:
-        serv_metadata.hset('metadata_hash:'+hash, 'last_seen', date_paste)
+        serv_metadata.hset('metadata_hash:'+hash, 'last_seen', date_item)

-    # first time we see this hash (all encoding) on this paste
+    # first time we see this hash (all encoding) on this item
    if serv_metadata.zscore('nb_seen_hash:'+hash, message) is None:
        serv_metadata.hincrby('metadata_hash:'+hash, 'nb_seen_in_all_pastes', 1)
-        serv_metadata.sadd('hash_paste:'+message, hash) # paste - hash map
+        serv_metadata.sadd('hash_paste:'+message, hash) # item - hash map
        # create hash metadata
        serv_metadata.hset('metadata_hash:'+hash, 'estimated_type', type)
        serv_metadata.sadd('hash_all_type', type)

-    # first time we see this hash encoding on this paste
+    # first time we see this hash encoding on this item
    if serv_metadata.zscore(decoder_name+'_hash:'+hash, message) is None:
        print('first '+decoder_name)

-        serv_metadata.sadd(decoder_name+'_paste:'+message, hash) # paste - hash map
+        serv_metadata.sadd(decoder_name+'_paste:'+message, hash) # item - hash map

        # create hash metadata
        serv_metadata.sadd('hash_'+ decoder_name +'_all_type', type)
@ -118,8 +117,8 @@ def save_hash(decoder_name, message, date, decoded):

    serv_metadata.zincrby(decoder_name+'_type:'+type, date_key, 1)

-    serv_metadata.zincrby('nb_seen_hash:'+hash, message, 1)# hash - paste map
-    serv_metadata.zincrby(decoder_name+'_hash:'+hash, message, 1) # number of b64 on this paste
+    serv_metadata.zincrby('nb_seen_hash:'+hash, message, 1)# hash - item map
+    serv_metadata.zincrby(decoder_name+'_hash:'+hash, message, 1) # number of b64 on this item

    # Domain Object
    if Item.is_crawled(message):
@ -150,7 +149,7 @@ def save_hash_on_disk(decode, type, hash, json_data):
    with open(filename_json, 'w') as f:
        f.write(json_data)

-def set_out_paste(decoder_name, message):
+def set_out_item(decoder_name, message):
    publisher.warning(decoder_name+' decoded')
    #Send to duplicate
    p.populate_set_out(message, 'Duplicate')
@ -217,12 +216,11 @@ if __name__ == '__main__':
            time.sleep(1)
            continue

-        filename = message
-        paste = Paste.Paste(filename)
+        obj_id = Item.get_item_id(message)

        # Do something with the message from the queue
-        content = paste.get_p_content()
-        date = str(paste._get_p_date())
+        content = Item.get_item_content(obj_id)
+        date = Item.get_item_date(obj_id)

        for decoder in decoder_order: # add threshold and size limit

@ -233,7 +231,7 @@ if __name__ == '__main__':
            except TimeoutException:
                encoded_list = []
                p.incr_module_timeout_statistic() # add encoder type
-                print ("{0} processing timeout".format(paste.p_rel_path))
+                print ("{0} processing timeout".format(obj_id))
                continue
            else:
                signal.alarm(0)
--- a/bin/WebStats.py
+++ b/bin/WebStats.py
@ -29,7 +29,10 @@ num_day_to_look = 5       # the detection of the progression start num_day_to_lo
 def analyse(server, field_name, date, url_parsed):
    field = url_parsed[field_name]
    if field is not None:
-        field = field.decode('utf8')
+        try: # faup version
+            field = field.decode()
+        except:
+            pass
        server.hincrby(field, date, 1)
        if field_name == "domain": #save domain in a set for the monthly plot
            domain_set_name = "domain_set_" + date[0:6]
--- a/bin/packages/Item.py
+++ b/bin/packages/Item.py
@ -32,6 +32,9 @@ def exist_item(item_id):
    else:
        return False

+def get_basename(item_id):
+    return os.path.basename(item_id)
+
 def get_item_id(full_path):
    return full_path.replace(PASTES_FOLDER, '', 1)