Merge branch 'master' into advanced_crawler

pull/342/head
Terrtia 2019-03-26 16:03:42 +01:00
commit 59664efe45
5 changed files with 109 additions and 14 deletions

README.md

@@ -12,7 +12,7 @@ AIL is a modular framework to analyse potential information leaks from unstructured
 <table>
 <tr>
 <td>Latest Release</td>
-<td><a href="https://badge.fury.io/gh/CIRCL%2FAIL-Framework"><img src="https://badge.fury.io/gh/CIRCL%2FAIL-Framework.svg" alt="GitHub version" height="18"></a></td>
+<td><a href="https://github.com/CIRCL/AIL-framework/releases/latest"><img src="https://img.shields.io/github/release/CIRCL/AIL-framework/all.svg"></a></td>
 </tr>
 <tr>
 <td>Contributors</td>

bin/CVE_check.py Executable file

@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+# -*-coding:UTF-8 -*
+
+from packages import Paste
+from Helper import Process
+
+import os
+import re
+import time
+import redis
+import configparser
+
+from collections import defaultdict
+
+def get_dict_cve(list_paste_cve, only_one_same_cve_by_paste=False):
+    dict_keyword = {}
+
+    for paste_cve in list_paste_cve:
+        paste_content = Paste.Paste(paste_cve).get_p_content()
+
+        cve_list = reg_cve.findall(paste_content)
+        if only_one_same_cve_by_paste:
+            cve_list = set(cve_list)
+
+        for cve in cve_list:
+            try:
+                dict_keyword[cve] += 1
+            except KeyError:
+                dict_keyword[cve] = 1
+
+    print('------------------------------------------------')
+    if dict_keyword:
+        res = [(k, dict_keyword[k]) for k in sorted(dict_keyword, key=dict_keyword.get, reverse=True)]
+        for item in res:
+            print(item)
+
+if __name__ == '__main__':
+
+    # CONFIG #
+    configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
+    if not os.path.exists(configfile):
+        raise Exception('Unable to find the configuration file. \
+                        Did you set environment variables? \
+                        Or activate the virtualenv.')
+
+    cfg = configparser.ConfigParser()
+    cfg.read(configfile)
+
+    serv_metadata = redis.StrictRedis(
+        host=cfg.get("ARDB_Metadata", "host"),
+        port=cfg.getint("ARDB_Metadata", "port"),
+        db=cfg.getint("ARDB_Metadata", "db"),
+        decode_responses=True)
+
+    serv_tags = redis.StrictRedis(
+        host=cfg.get("ARDB_Tags", "host"),
+        port=cfg.getint("ARDB_Tags", "port"),
+        db=cfg.getint("ARDB_Tags", "db"),
+        decode_responses=True)
+
+    reg_cve = re.compile(r'CVE-[1-2]\d{1,4}-\d{1,7}')
+
+    #all_past_cve = serv_tags.smembers('infoleak:automatic-detection="cve"')
+    #all_past_cve_regular = serv_tags.sdiff('infoleak:automatic-detection="cve"', 'infoleak:submission="crawler"')
+    #all_past_cve_crawler = serv_tags.sinter('infoleak:automatic-detection="cve"', 'infoleak:submission="crawler"')
+    #print('{} + {} = {}'.format(len(all_past_cve_regular), len(all_past_cve_crawler), len(all_past_cve)))
+
+    print('ALL_CVE')
+    get_dict_cve(serv_tags.smembers('infoleak:automatic-detection="cve"'), True)
+    print()
+    print()
+    print()
+    print('REGULAR_CVE')
+    get_dict_cve(serv_tags.sdiff('infoleak:automatic-detection="cve"', 'infoleak:submission="crawler"'), True)
+    print()
+    print()
+    print()
+    print('CRAWLER_CVE')
+    get_dict_cve(serv_tags.sinter('infoleak:automatic-detection="cve"', 'infoleak:submission="crawler"'), True)
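
The counting in get_dict_cve amounts to a frequency table over regex matches, printed in descending order. A minimal standalone sketch of the same idea using collections.Counter (the sample text and CVE IDs below are made up for illustration):

    import re
    from collections import Counter

    reg_cve = re.compile(r'CVE-[1-2]\d{1,4}-\d{1,7}')

    # hypothetical paste content, for illustration only
    sample = 'PoC for CVE-2019-0708, also hits CVE-2017-0144 and CVE-2019-0708'
    print(Counter(reg_cve.findall(sample)).most_common())
    # [('CVE-2019-0708', 2), ('CVE-2017-0144', 1)]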

bin/Global.py

@@ -23,23 +23,17 @@ Requirements
 import base64
 import os
 import time
+import uuid

 from pubsublogger import publisher
 from Helper import Process

 import magic
 import io
-#import gzip
-'''
-def gunzip_bytes_obj(bytes_obj):
-    in_ = io.BytesIO()
-    in_.write(bytes_obj)
-    in_.seek(0)
-
-    with gzip.GzipFile(fileobj=in_, mode='rb') as fo:
-        gunzipped_bytes_obj = fo.read()
-
-    return gunzipped_bytes_obj.decode()'''

+def rreplace(s, old, new, occurrence):
+    li = s.rsplit(old, occurrence)
+    return new.join(li)

 if __name__ == '__main__':

     publisher.port = 6380
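
rreplace() is a right-to-left replace: str.rsplit splits on the last `occurrence` matches of old, and new.join stitches the pieces back together, so only the tail of a path is renamed. For example (the path is made up):

    rreplace('archive/pastes/pastes.gz', 'pastes', 'renamed', 1)
    # -> 'archive/pastes/renamed.gz' (the directory component is untouched)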
@@ -77,6 +71,12 @@ if __name__ == '__main__':
             processed_paste = 0
             time.sleep(1)
             continue

+        file_name_paste = paste.split('/')[-1]
+        if len(file_name_paste) > 255:
+            new_file_name_paste = '{}{}.gz'.format(file_name_paste[:215], str(uuid.uuid4()))
+            paste = rreplace(paste, file_name_paste, new_file_name_paste, 1)
+
         # Creating the full filepath
         filename = os.path.join(os.environ['AIL_HOME'],
                                 p.config.get("Directories", "pastes"), paste)

bin/Onion.py

@@ -198,8 +198,12 @@ if __name__ == "__main__":
             print(len(domains_list))
             if len(domains_list) > 0:
-                publisher.warning('{}Detected {} .onion(s);{}'.format(
-                    to_print, len(domains_list), PST.p_path))
+                if not activate_crawler:
+                    publisher.warning('{}Detected {} .onion(s);{}'.format(
+                        to_print, len(domains_list), PST.p_path))
+                else:
+                    publisher.info('{}Detected {} .onion(s);{}'.format(
+                        to_print, len(domains_list), PST.p_path))

                 now = datetime.datetime.now()
                 path = os.path.join('onions', str(now.year).zfill(4),
                                     str(now.month).zfill(2),
@@ -220,6 +224,10 @@ if __name__ == "__main__":
                 else:
                     continue

+                # too many subdomains
+                if len(domain.split('.')) > 5:
+                    continue
+
                 if not r_onion.sismember('month_onion_up:{}'.format(date_month), domain) and not r_onion.sismember('onion_down:'+date, domain):
                     if not r_onion.sismember('onion_domain_crawler_queue', domain):
                         print('send to onion crawler')
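
The new guard counts dot-separated labels and drops any domain nested more than five levels deep before it reaches the crawler queue. For example (domains are made up):

    domain = 'a.b.c.d.e.onion'
    print(len(domain.split('.')))   # 6 labels -> more than 5, domain is skipped

    domain = 'example.onion'
    print(len(domain.split('.')))   # 2 labels -> kept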

bin/torcrawler/TorSplashCrawler.py

@@ -130,7 +130,11 @@ class TorSplashCrawler():
                 print('Connection to proxy refused')
             else:
-                UUID = self.domains[0]+str(uuid.uuid4())
+                # avoid too-long filenames
+                if len(self.domains[0]) > 215:
+                    UUID = self.domains[0][-215:]+str(uuid.uuid4())
+                else:
+                    UUID = self.domains[0]+str(uuid.uuid4())
                 filename_paste = os.path.join(self.crawled_paste_filemame, UUID)
                 relative_filename_paste = os.path.join(self.crawler_path, UUID)
                 filename_screenshot = os.path.join(self.crawled_screenshot, UUID + '.png')
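
The 215-character bound mirrors the one in bin/Global.py, but keeps the end of the domain rather than the start and leaves room for the screenshot's '.png' suffix: 215 + 36 (uuid4) + 4 ('.png') = 255 bytes, exactly at the usual filename cap. A quick check with a hypothetical over-long domain:

    import uuid

    domain = 'x' * 300   # hypothetical over-long onion domain
    UUID = domain[-215:] + str(uuid.uuid4())
    print(len(UUID + '.png'))   # 215 + 36 + 4 = 255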