Merge pull request #63 from mokaddem/uptodate-duplicate-module

Uptodate duplicate module
2016-07-25 09:44:49 +02:00 · 2016-07-25 09:44:49 +02:00 · 4aa484a91f
parent ea8703e608 c686f69ca6
commit 4aa484a91f
17 changed files with 435 additions and 8 deletions
--- a/bin/Credential.py
+++ b/bin/Credential.py
@ -50,6 +50,8 @@ if __name__ == "__main__":
        if len(creds) > critical:
            print("========> Found more than 10 credentials in this file : {}".format(filepath))
            publisher.warning(to_print)
+            #Send to duplicate
+            p.populate_set_out(filepath)
            if sites:
                print("=======> Probably on : {}".format(', '.join(sites)))
        else:
--- a/bin/CreditCard.py
+++ b/bin/CreditCard.py
@ -65,6 +65,8 @@ if __name__ == "__main__":
                if (len(creditcard_set) > 0):
                    publisher.warning('{}Checked {} valid number(s)'.format(
                        to_print, len(creditcard_set)))
+                    #Send to duplicate
+                    p.populate_set_out(filename)
                else:
                    publisher.info('{}CreditCard related'.format(to_print))
        else:
--- a/bin/Duplicate.py
+++ b/bin/Duplicate.py
@ -74,9 +74,9 @@ if __name__ == "__main__":
            # Creating the bloom filter name: bloomyyyymm
            filebloompath = os.path.join(bloompath, 'bloom' + PST.p_date.year +
                                         PST.p_date.month)
-
            if os.path.exists(filebloompath):
                bloom = BloomFilter.open(filebloompath)
+                bloop_path_set.add(filebloompath)
            else:
                bloom = BloomFilter(100000000, 0.01, filebloompath)
                bloop_path_set.add(filebloompath)
@ -94,7 +94,6 @@ if __name__ == "__main__":
            for bloo in bloop_path_set:
                # Opening blooms
                opened_bloom.append(BloomFilter.open(bloo))
-
            # For each hash of the paste
            for line_hash in PST._get_hash_lines(min=5, start=1, jump=0):
                nb_hash_current += 1
@ -105,7 +104,6 @@ if __name__ == "__main__":
                    r_serv1.sadd("HASHS", line_hash)
                # Adding the hash in the bloom of the month
                bloom.add(line_hash)
-
                # Go throught the Database of the bloom filter (of the month)
                for bloo in opened_bloom:
                    if line_hash in bloo:
@ -148,6 +146,8 @@ if __name__ == "__main__":
                    percentage = round((count/float(nb_hash_current))*100, 2)
                    if percentage >= 50:
                        dupl.append((paste, percentage))
+                    else:
+                        print 'percentage: ' + str(percentage)

                # Creating the object attribute and save it.
                to_print = 'Duplicate;{};{};{};'.format(
@ -156,6 +156,7 @@ if __name__ == "__main__":
                    PST.__setattr__("p_duplicate", dupl)
                    PST.save_attribute_redis("p_duplicate", dupl)
                    publisher.info('{}Detected {}'.format(to_print, len(dupl)))
+                    print '{}Detected {}'.format(to_print, len(dupl))

                y = time.time()

--- a/bin/Duplicate_ssdeep.py
+++ b/bin/Duplicate_ssdeep.py
@ -0,0 +1,182 @@
+#!/usr/bin/env python2
+# -*-coding:UTF-8 -*
+
+"""
+The Duplicate module
+====================
+
+In short, this huge module checks pastes for duplicates.
+
+Requirements:
+-------------
+
+
+"""
+import redis
+import os
+import time
+import datetime
+import json
+import ssdeep
+from packages import Paste
+from pubsublogger import publisher
+
+from Helper import Process
+
+if __name__ == "__main__":
+    # Entry point: endlessly pop paste paths from the module queue, compare
+    # the ssdeep hash of each paste with the hashes kept in the monthly dico
+    # files and record the pastes that are (almost) identical.
+    publisher.port = 6380
+    publisher.channel = "Script"
+
+    config_section = 'Duplicates'
+    save_dico_and_reload = 1  # minutes between two dumps of the dico to disk
+    time_1 = time.time()  # time of the last dump
+    flag_reload_from_disk = True
+    flag_write_to_disk = False
+
+    p = Process(config_section)
+
+    # REDIS #
+    # DB OBJECT & HASHS ( DISK )
+    # FIXME increase flexibility
+    # One connection per 'yyyymm' key: the year is used as port, the month as
+    # db number. Only db 0 (the 'yyyy00' index counter) and dbs 1 to 12 (the
+    # months) are ever looked up below.
+    dico_redis = {}
+    for year in xrange(2013, datetime.date.today().year+1):
+        for month in xrange(0, 13):
+            dico_redis[str(year)+str(month).zfill(2)] = redis.StrictRedis(
+                host=p.config.get("Redis_Level_DB", "host"), port=year,
+                db=month)
+
+    # FUNCTIONS #
+    publisher.info("Script duplicate started")
+
+    dicopath = os.path.join(os.environ['AIL_HOME'],
+                            p.config.get("Directories", "dicofilters"))
+
+    dico_path_set = set()
+    while True:
+        try:
+            hash_dico = {}
+            dupl = []
+
+            x = time.time()
+
+            message = p.get_from_set()
+            if message is not None:
+                path = message
+                PST = Paste.Paste(path)
+            else:
+                publisher.debug("Script Attribute is idling 10s")
+                time.sleep(10)
+                continue
+
+            PST._set_p_hash_kind("ssdeep")
+
+            # Assignate the correct redis connexion
+            r_serv1 = dico_redis[PST.p_date.year + PST.p_date.month]
+
+            # Creating the dico name: dicoyyyymm
+            filedicopath = os.path.join(dicopath, 'dico' + PST.p_date.year +
+                                        PST.p_date.month)
+            filedicopath_today = filedicopath
+
+            # Save I/O: only dump the dico once the delay has elapsed
+            if time.time() - time_1 > save_dico_and_reload*60:
+                flag_write_to_disk = True
+
+            # NOTE(review): when the file exists and no reload is pending, the
+            # in-memory today_dico is reused even if this paste belongs to
+            # another month than the previous one - confirm this is intended.
+            if os.path.exists(filedicopath):
+                if flag_reload_from_disk:
+                    flag_reload_from_disk = False
+                    print 'Reloading'
+                    with open(filedicopath, 'r') as fp:
+                        today_dico = json.load(fp)
+            else:
+                today_dico = {}
+                with open(filedicopath, 'w') as fp:
+                    json.dump(today_dico, fp)
+
+            # For now, just use monthly dico
+            dico_path_set.add(filedicopath)
+
+            # UNIQUE INDEX HASHS TABLE
+            # INCR returns the new counter value, so the index is taken
+            # atomically instead of re-reading the key afterwards.
+            yearly_index = str(datetime.date.today().year)+'00'
+            r_serv0 = dico_redis[yearly_index]
+            index = str(r_serv0.incr("current_index")) + str(PST.p_date)
+
+            # For each dico
+            opened_dico = []
+            for dico in dico_path_set:
+                # Opening dico: the dico of this paste is already in memory
+                if dico == filedicopath_today:
+                    opened_dico.append([dico, today_dico])
+                else:
+                    with open(dico, 'r') as fp:
+                        opened_dico.append([dico, json.load(fp)])
+
+            # retrieve hash from paste
+            paste_hash = PST._get_p_hash()
+
+            # Go throught the Database of the dico (of the month)
+            threshold_dup = 99
+            for dico_name, dico in opened_dico:
+                for dico_hash in dico.values():
+                    percent = ssdeep.compare(dico_hash, paste_hash)
+                    if percent > threshold_dup:
+                        # The dico file name ends with yyyymm, which is also
+                        # the key of the matching redis connection.
+                        db = dico_name[-6:]
+                        r_serv_dico = dico_redis[db]
+
+                        # hash -> index of paste -> path of paste
+                        index_current = r_serv_dico.get(dico_hash)
+                        paste_path = r_serv_dico.get(index_current)
+                        if paste_path is not None:
+                            hash_dico[dico_hash] = (paste_path, percent)
+                            # Printed only for a known path: slicing a missing
+                            # (None) path would raise and stop the script.
+                            print '   '+ PST.p_path[44:]  +', '+ paste_path[44:] + ', ' + str(percent)
+
+            # Add paste in DB to prevent its analyse twice
+            # HASHTABLES PER MONTH (because of r_serv1 changing db)
+            r_serv1.set(index, PST.p_path)
+            r_serv1.sadd("INDEX", index)
+            # Adding the hash in Redis
+            r_serv1.set(paste_hash, index)
+            r_serv1.sadd("HASHS", paste_hash)
+    ##################### Similarity found  #######################
+
+            # if there is data in this dictionnary
+            if hash_dico:
+                # every value is a (paste_path, percent) tuple
+                dupl.extend(hash_dico.values())
+
+                # Creating the object attribute and save it.
+                to_print = 'Duplicate;{};{};{};'.format(
+                    PST.p_source, PST.p_date, PST.p_name)
+                if dupl:
+                    PST.__setattr__("p_duplicate", dupl)
+                    PST.save_attribute_redis("p_duplicate", dupl)
+                    publisher.info('{}Detected {}'.format(to_print, len(dupl)))
+                    print '{}Detected {}'.format(to_print, len(dupl))
+
+                y = time.time()
+
+                publisher.debug('{}Processed in {} sec'.format(to_print, y-x))
+
+            # Adding the hash in the dico of the month
+            today_dico[index] = paste_hash
+
+            if flag_write_to_disk:
+                time_1 = time.time()
+                flag_write_to_disk = False
+                flag_reload_from_disk = True
+                print 'writing'
+                with open(filedicopath, 'w') as fp:
+                    json.dump(today_dico, fp)
+        except IOError:
+            to_print = 'Duplicate;{};{};{};'.format(
+                PST.p_source, PST.p_date, PST.p_name)
+            print "CRC Checksum Failed on :", PST.p_path
+            publisher.error('{}CRC Checksum Failed'.format(to_print))
--- a/bin/Duplicate_ssdeep_v2.py
+++ b/bin/Duplicate_ssdeep_v2.py
@ -0,0 +1,161 @@
+#!/usr/bin/env python2
+# -*-coding:UTF-8 -*
+
+"""
+The Duplicate module
+====================
+
+In short, this huge module checks pastes for duplicates.
+Its input comes from other modules, namely:
+    Credential, CreditCard, Keys, Mails and Phone
+
+This one differs from v1 by using only Redis, not JSON files stored on disk.
+
+Requirements:
+-------------
+
+
+"""
+import redis
+import os
+import time
+from datetime import datetime, timedelta
+import json
+import ssdeep
+from packages import Paste
+from pubsublogger import publisher
+
+from Helper import Process
+
+if __name__ == "__main__":
+    # Entry point: endlessly pop paste paths from the module queue, compare
+    # the ssdeep hash of each paste with the hashes stored in Redis for the
+    # last months and record the pastes that are similar enough.
+    publisher.port = 6380
+    publisher.channel = "Script"
+
+    config_section = 'Duplicates'
+
+    p = Process(config_section)
+
+    # Settings read from the [Modules_Duplicates] configuration section
+    maximum_month_range = int(p.config.get("Modules_Duplicates", "maximum_month_range"))
+    threshold_duplicate = int(p.config.get("Modules_Duplicates", "threshold_duplicate"))
+    min_paste_size = float(p.config.get("Modules_Duplicates", "min_paste_size"))
+
+    # REDIS #
+    # One connection per 'yyyymm' key: the year is used as port, the month as
+    # db number ('yyyy00' is the db holding the index counter).
+    # NOTE(review): date_today is read once at start-up only, so the index db,
+    # the month range and the known years do not follow the calendar while
+    # the script keeps running - confirm this is acceptable.
+    dico_redis = {}
+    date_today = datetime.today()
+    for year in xrange(2013, date_today.year+1):
+        for month in xrange(0, 13):
+            dico_redis[str(year)+str(month).zfill(2)] = redis.StrictRedis(
+                host=p.config.get("Redis_Level_DB", "host"), port=year,
+                db=month)
+
+    # FUNCTIONS #
+    publisher.info("Script duplicate started")
+
+    while True:
+        try:
+            hash_dico = {}
+            dupl = []
+            dico_range_list = []
+
+            x = time.time()
+
+            message = p.get_from_set()
+            if message is not None:
+                path = message
+                PST = Paste.Paste(path)
+            else:
+                publisher.debug("Script Attribute is idling 10s")
+                time.sleep(10)
+                continue
+
+            # the paste is too small
+            if (PST._get_p_size() < min_paste_size):
+                continue
+
+            PST._set_p_hash_kind("ssdeep")
+
+            # Assignate the correct redis connexion
+            r_serv1 = dico_redis[PST.p_date.year + PST.p_date.month]
+
+            # Creating the dico names (yyyymm), newest first: the month of
+            # date_today and the maximum_month_range-1 months before it.
+            # Months are counted on the calendar; stepping back by a fixed
+            # number of days can jump over a short month such as February.
+            month_count = date_today.year*12 + date_today.month - 1
+            for diff_month in xrange(0, maximum_month_range):
+                year_in_range, month_in_range = divmod(month_count - diff_month, 12)
+                to_append = str(year_in_range)+str(month_in_range + 1).zfill(2)
+                dico_range_list.append(to_append)
+
+            # UNIQUE INDEX HASHS TABLE
+            # INCR returns the new counter value, so the index is taken
+            # atomically instead of re-reading the key afterwards.
+            yearly_index = str(date_today.year)+'00'
+            r_serv0 = dico_redis[yearly_index]
+            index = str(r_serv0.incr("current_index")) + str(PST.p_date)
+
+            # retrieve hash from paste
+            paste_hash = PST._get_p_hash()
+
+            # Go throught the Database of the dico (of the month)
+            for curr_dico_name in dico_range_list:
+                curr_dico_redis = dico_redis[curr_dico_name]
+                for dico_hash in curr_dico_redis.smembers('HASHS'):
+                    # Only the comparison itself is guarded: a failure of the
+                    # Redis lookups below must not delete a valid hash.
+                    try:
+                        percent = ssdeep.compare(dico_hash, paste_hash)
+                    except Exception:
+                        # ssdeep hash not comparable
+                        print 'ssdeep hash not comparable, cleaning bad hash: '+dico_hash
+                        curr_dico_redis.srem('HASHS', dico_hash)
+                        continue
+
+                    if percent > threshold_duplicate:
+                        # hash -> index of paste -> path of paste
+                        index_current = curr_dico_redis.get(dico_hash)
+                        paste_path = curr_dico_redis.get(index_current)
+                        if paste_path is not None:
+                            hash_dico[dico_hash] = (paste_path, percent)
+
+            # Add paste in DB after checking to prevent its analysis twice
+            # hash_i -> index_i  AND  index_i -> PST.PATH
+            r_serv1.set(index, PST.p_path)
+            r_serv1.sadd("INDEX", index)
+            # Adding the hash in Redis
+            r_serv1.set(paste_hash, index)
+            r_serv1.sadd("HASHS", paste_hash)
+    ##################### Similarity found  #######################
+
+            # if there is data in this dictionnary
+            if hash_dico:
+                # every value is a (paste_path, percent) tuple
+                dupl.extend(hash_dico.values())
+
+                # Creating the object attribute and save it.
+                to_print = 'Duplicate;{};{};{};'.format(
+                    PST.p_source, PST.p_date, PST.p_name)
+                if dupl:
+                    PST.__setattr__("p_duplicate", dupl)
+                    PST.save_attribute_redis("p_duplicate", dupl)
+                    publisher.info('{}Detected {}'.format(to_print, len(dupl)))
+                    print '{}Detected {}'.format(to_print, len(dupl))
+
+                y = time.time()
+
+                publisher.debug('{}Processed in {} sec'.format(to_print, y-x))
+
+        except IOError:
+            to_print = 'Duplicate;{};{};{};'.format(
+                PST.p_source, PST.p_date, PST.p_name)
+            print "CRC Checksum Failed on :", PST.p_path
+            publisher.error('{}CRC Checksum Failed'.format(to_print))
--- a/bin/Keys.py
+++ b/bin/Keys.py
@ -16,6 +16,8 @@ def search_gpg(message):
    content = paste.get_p_content()
    if '-----BEGIN PGP MESSAGE-----' in content:
        publisher.warning('{} has a PGP enc message'.format(paste.p_name))
+        #Send to duplicate
+        p.populate_set_out(message)


 if __name__ == '__main__':
--- a/bin/LAUNCH.sh
+++ b/bin/LAUNCH.sh
@ -105,7 +105,7 @@ function launching_scripts {

    screen -S "Script" -X screen -t "Global" bash -c './Global.py; read x'
    sleep 0.1
-    screen -S "Script" -X screen -t "Duplicate" bash -c './Duplicate.py; read x'
+    screen -S "Script" -X screen -t "Duplicate" bash -c './Duplicate_ssdeep_v2.py; read x'
    sleep 0.1
    screen -S "Script" -X screen -t "Attribute" bash -c './Attribute.py; read x'
    sleep 0.1
--- a/bin/Mail.py
+++ b/bin/Mail.py
@ -60,6 +60,8 @@ if __name__ == "__main__":
                               MX_values[0])
                    if MX_values[0] > is_critical:
                        publisher.warning(to_print)
+                        #Send to duplicate
+                        p.populate_set_out(filename)
                    else:
                        publisher.info(to_print)
            prec_filename = filename
--- a/bin/Phone.py
+++ b/bin/Phone.py
@ -23,6 +23,8 @@ def search_phone(message):
    if len(results) > 4:
        print results
        publisher.warning('{} contains PID (phone numbers)'.format(paste.p_name))
+        #Send to duplicate
+        p.populate_set_out(message)

 if __name__ == '__main__':
    # If you wish to use an other port of channel, do not forget to run a subscriber accordingly (see launch_logs.sh)
--- a/bin/packages/Hash.py
+++ b/bin/packages/Hash.py
@ -1,6 +1,7 @@
 import hashlib
 import crcmod
 import mmh3
+import ssdeep


 class Hash(object):
@ -32,4 +33,7 @@ class Hash(object):
        elif self.name == "murmur":
            hash = mmh3.hash(string)

+        elif self.name == "ssdeep":
+            hash = ssdeep.hash(string)
+
        return hash
--- a/bin/packages/Paste.py
+++ b/bin/packages/Paste.py
@ -91,6 +91,7 @@ class Paste(object):
        self.p_langage = None
        self.p_nb_lines = None
        self.p_max_length_line = None
+        self.p_duplicate = None

    def get_p_content(self):
        """
@ -277,6 +278,10 @@ class Paste(object):
            return True, var
        else:
            return False, var
+    
+    def _get_p_duplicate(self):
+        """
+        Read the "p_duplicate" field stored for this paste.
+
+        The raw value returned by the store is kept in self.p_duplicate.
+
+        :return: the stored value, or an empty list when there is none.
+        """
+        stored = self.store.hget(self.p_path, "p_duplicate")
+        self.p_duplicate = stored
+        if stored is None:
+            return []
+        return stored

    def save_all_attributes_redis(self, key=None):
        """
--- a/bin/packages/config.cfg.sample
+++ b/bin/packages/config.cfg.sample
@ -25,6 +25,16 @@ max_preview_modal = 800
 #Default number of header to display in trending graphs
 default_display = 10

+#### Modules #### 
+[Modules_Duplicates]
+#Number of months to look back
+maximum_month_range = 3
+#Similarity score above which two pastes are considered duplicates.
+threshold_duplicate = 50
+#Minimum size of the paste considered
+min_paste_size = 0.3
+
+
 ##### Redis #####
 [Redis_Cache]
 host = localhost
--- a/bin/packages/modules.cfg
+++ b/bin/packages/modules.cfg
@ -3,7 +3,7 @@ subscribe = ZMQ_Global
 publish = Redis_Global

 [Duplicates]
-subscribe = Redis_Global
+subscribe = Redis_Duplicate

 [Indexer]
 subscribe = Redis_Global
@ -31,9 +31,11 @@ publish = Redis_CreditCards,Redis_Mail,Redis_Onion,Redis_Web,Redis_Credential,Re

 [CreditCards]
 subscribe = Redis_CreditCards
+publish = Redis_Duplicate 

 [Mail]
 subscribe = Redis_Mail
+publish = Redis_Duplicate 

 [Onion]
 subscribe = Redis_Onion
@ -55,15 +57,18 @@ subscribe = Redis_Global

 [Credential]
 subscribe = Redis_Credential
+publish = Redis_Duplicate 

 [Cve]
 subscribe = Redis_Cve

 [Phone]
 subscribe = Redis_Global
+publish = Redis_Duplicate 

 [SourceCode]
 subscribe = Redis_SourceCode

 [Keys]
 subscribe = Redis_Global
+publish = Redis_Duplicate 
--- a/pip_packages_requirement.txt
+++ b/pip_packages_requirement.txt
@ -17,6 +17,7 @@ nltk
 # Hashlib
 crcmod
 mmh3
+ssdeep

 #Others
 python-magic
--- a/var/www/Flask_server.py
+++ b/var/www/Flask_server.py
@ -58,6 +58,21 @@ def list_len(s):
    return len(s)
 app.jinja_env.filters['list_len'] = list_len

+def parseStringToList(the_string):
+    """
+    Turn the text form of a list of pairs into a list of lists of strings.
+
+    The characters '[', ' ' and '"' are dropped. Each ']' closes a group:
+    the text gathered so far is split on ',' and kept when it has exactly
+    two fields; with more fields the first one is dropped (it is the empty
+    field left by the ',' that separates two groups). Shorter groups and
+    anything after the last ']' are ignored.
+    """
+    ignored_chars = ('[', ' ', '"')
+    groups = []
+    kept_chars = []
+    for char in the_string:
+        if char == ']':
+            fields = ''.join(kept_chars).split(',')
+            kept_chars = []
+            if len(fields) == 2:
+                groups.append(fields)
+            elif len(fields) > 2:
+                groups.append(fields[1:])
+        elif char not in ignored_chars:
+            kept_chars.append(char)
+    return groups

 def showpaste(content_range):    
    requested_path = request.args.get('paste', '')
@ -71,10 +86,19 @@ def showpaste(content_range):
    p_mime = paste.p_mime
    p_lineinfo = paste.get_lines_info()
    p_content = paste.get_p_content().decode('utf-8', 'ignore')
+    p_duplicate_full_list = parseStringToList(paste._get_p_duplicate())
+    p_duplicate_list = []
+    p_simil_list = []
+
+    for dup_list in p_duplicate_full_list:
+        path, simil_percent = dup_list
+        p_duplicate_list.append(path)
+        p_simil_list.append(simil_percent)
+
    if content_range != 0:
       p_content = p_content[0:content_range] 

-    return render_template("show_saved_paste.html", date=p_date, source=p_source, encoding=p_encoding, language=p_language, size=p_size, mime=p_mime, lineinfo=p_lineinfo, content=p_content, initsize=len(p_content))
+    return render_template("show_saved_paste.html", date=p_date, source=p_source, encoding=p_encoding, language=p_language, size=p_size, mime=p_mime, lineinfo=p_lineinfo, content=p_content, initsize=len(p_content), duplicate_list = p_duplicate_list, simil_list = p_simil_list)


@app.route("/_logs")
--- a/var/www/static/js/indexjavascript.js
+++ b/var/www/static/js/indexjavascript.js
@ -1,3 +1,17 @@
+// Keep the two values given by the page on the window object so that the
+// other functions of this script can read them.
+function initfunc(csvay, scroot) {
+  window.scroot = scroot;
+  window.csv = csvay;
+}
+
+// Request "/_stuff" below the stored script root and keep the JSON answer
+// in window.glob_tabvar.
+function update_values() {
+  var storeTabvar = function(data) {
+    window.glob_tabvar = data;
+  };
+
+  // $SCRIPT_ROOT is deliberately left as a global, like in the original.
+  $SCRIPT_ROOT = window.scroot;
+  $.getJSON($SCRIPT_ROOT + "/_stuff", storeTabvar);
+}
+
+
 // Plot and update the number of processed pastes
 $(function() {
    var data = [];
@ -25,7 +39,7 @@ $(function() {
        return res;
    }

-    var updateInterval = 1000; //1s
+    var updateInterval = 1000;
    var options = {
        series: { shadowSize: 1 },
        lines: { fill: true, fillColor: { colors: [ { opacity: 1 }, { opacity: 0.1 } ] }},
--- a/var/www/templates/show_saved_paste.html
+++ b/var/www/templates/show_saved_paste.html
@ -42,7 +42,17 @@
    </table>
  </div>
  <div class="panel-body" id="panel-body">
-  <h4> Content: </h4>
+  {# List the duplicates of the paste with their similarity score.
+     duplicate_list and simil_list are parallel lists given by the view. #}
+  {% if  duplicate_list|length == 0 %}
+      <h4> No Duplicate </h4>
+  {% else %}
+      <h4> Duplicate list: </h4>
+      {# loop.index0 is used as position: a variable changed with set inside
+         a for loop does not keep its value for the next iteration. #}
+      {% for dup_path in duplicate_list %}
+          Similarity: {{ simil_list[loop.index0] }}% - <a target="_blank" href="{{ url_for('showsavedpaste') }}?paste={{ dup_path }}" id='dup_path'>{{ dup_path }}</a><br>
+      {% endfor %}
+  {% endif %}
+    <h4> Content: </h4>
  <p data-initsize="{{ initsize }}"> <xmp id="paste-holder">{{ content }}</xmp></p>
  </div>
 </div>