chg: [extractor] add cache + UI extractor + word/set extractor

2023-02-23 16:25:15 +01:00 · 2023-02-23 16:25:15 +01:00 · 0fa27c6a51
parent ab24343b48
commit 0fa27c6a51
8 changed files with 208 additions and 69 deletions
--- a/bin/lib/module_extractor.py
+++ b/bin/lib/module_extractor.py
@ -1,17 +1,19 @@
 #!/usr/bin/env python3
 # -*-coding:UTF-8 -*
-
+import json
 import os
 import sys
 import time

 import yara

+from operator import itemgetter
+
 sys.path.append(os.environ['AIL_BIN'])
 ##################################
 # Import Project packages
 ##################################
-import lib.objects.ail_objects
+from lib.objects import ail_objects
 from lib.objects.Items import Item
 from lib import correlations_engine
 from lib import regex_helper
@ -25,23 +27,19 @@ from modules.Mail import Mail
 from modules.Onion import Onion
 from modules.Tools import Tools

-creditCards = CreditCards()
-ibans = Iban()
-mails = Mail()
-onions = Onion()
-tools = Tools()
-
 config_loader = ConfigLoader()
 r_cache = config_loader.get_redis_conn("Redis_Cache")
 config_loader = None

 r_key = regex_helper.generate_redis_cache_key('extractor')

+# TODO UI Link
+
 MODULES = {
-    'infoleak:automatic-detection="credit-card"': creditCards,
-    'infoleak:automatic-detection="iban"': ibans,
-    'infoleak:automatic-detection="mail"': mails,
-    'infoleak:automatic-detection="onion"': onions,
+    'infoleak:automatic-detection="credit-card"': CreditCards(),
+    'infoleak:automatic-detection="iban"': Iban(),
+    'infoleak:automatic-detection="mail"': Mail(),
+    'infoleak:automatic-detection="onion"': Onion(),
    # APIkey ???
    # Credentials
    # Zerobins
@ -49,20 +47,28 @@ MODULES = {
    # SQL Injetction / Libinjection ???

 }
+tools = Tools()
 for tool_name in tools.get_tools():
    MODULES[f'infoleak:automatic-detection="{tool_name}-tool"'] = tools

-def get_correl_match(extract_type, obj_id, content, filter_subtypes=['']):
+def get_correl_match(extract_type, obj_id, content):
+    """Locate correlated objects of type ``extract_type`` inside ``content``.
+
+    The correlation values of the item are joined into one alternation and
+    searched with ``regex_helper.regex_finditer``. Each returned row is
+    ``[obj[0], obj[1], obj[2], '<extract_type>:<subtype>:<obj[2]>']``
+    (presumably start offset, end offset and matched text -- confirm against
+    ``regex_finditer``). An empty list is returned when nothing correlates.
+    """
+    extracted = []
    correl = correlations_engine.get_correlation_by_correl_type('item', '', obj_id, extract_type)
    to_extract = []
+    map_subtype = {}
    for c in correl:
        subtype, value = c.split(':', 1)
-        # if subtype in filter_subtypes:
+        map_subtype[value] = subtype
        to_extract.append(value)
    if to_extract:
-        return regex_helper.regex_finditer(r_key, '|'.join(to_extract), obj_id, content)
-    else:
-        return []
+        objs = regex_helper.regex_finditer(r_key, '|'.join(to_extract), obj_id, content)
+        for obj in objs:
+            # The joined values are evaluated as a regex, so the matched text
+            # is not guaranteed to be a key of map_subtype; fall back to an
+            # empty subtype instead of raising KeyError.
+            subtype = map_subtype.get(obj[2], '')
+            extracted.append([obj[0], obj[1], obj[2], f'{extract_type}:{subtype}:{obj[2]}'])
+    return extracted

 def _get_yara_match(data):
    for row in data.get('strings'):
@ -73,14 +79,27 @@ def _get_yara_match(data):
        r_cache.expire(f'extractor:yara:match:{r_key}', 300)
    return yara.CALLBACK_CONTINUE

+def _get_word_regex(word):
+    """Return a regex that matches ``word`` only when it is delimited.
+
+    The word must be preceded by the ``^`` anchor or one of the delimiter
+    characters, and followed by the ``$`` anchor or one of the same
+    delimiters. ``word`` is treated as literal text.
+    """
+    import re  # function-scope import keeps this helper self-contained
+    # Raw string: the same character class as before, without relying on
+    # invalid string escapes such as '\&' that newer Pythons warn about.
+    delimiters = r'[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]'
+    # re.escape keeps words such as "c++" from being parsed as regex syntax.
+    return f'(?:^|(?<={delimiters})){re.escape(word)}(?:$|(?={delimiters}))'
+
 # TODO RETRO HUNTS
+# TODO TRACKER TYPE IN UI
 def get_tracker_match(obj_id, content):
+    """Return the tracker matches found in ``content`` for item ``obj_id``.
+
+    Handles regex, yara, word and set trackers. Every row is
+    ``[start, end, value, 'tracker:<uuid>']``. A non-empty result is stored
+    as JSON in the Redis cache under ``extractor:cache:<obj_id>`` for 300
+    seconds; a cache hit refreshes that expiry and is returned directly.
+    """
+    cached = r_cache.get(f'extractor:cache:{obj_id}')
+    if cached:
+        r_cache.expire(f'extractor:cache:{obj_id}', 300)
+        return json.loads(cached)
+
+    extracted = []
    trackers = Tracker.get_obj_all_trackers('item', '', obj_id)
    for tracker_uuid in trackers:
        tracker_type = Tracker.get_tracker_type(tracker_uuid)
        tracker = Tracker.get_tracker_by_uuid(tracker_uuid)
-        if tracker_type == 'regex':
-            return regex_helper.regex_finditer(r_key, tracker, obj_id, content)
+        if tracker_type == 'regex':  # TODO Improve word detection -> word delimiter
+            regex_match = regex_helper.regex_finditer(r_key, tracker, obj_id, content)
+            for match in regex_match:
+                extracted.append([int(match[0]), int(match[1]), match[2], f'tracker:{tracker_uuid}'])
        elif tracker_type == 'yara':
            rule = Tracker.get_yara_rule_by_uuid(tracker_uuid)
            rule.match(data=content, callback=_get_yara_match,
@ -90,22 +109,39 @ def get_tracker_match(obj_id, content):
-            extracted = []
+            # Keep appending to the shared list: resetting it here would drop
+            # the matches already collected from previous trackers.
            for match in yara_match:
                start, end, value = match.split(':', 2)
-                extracted.append((int(start), int(end), value))
-            return extracted
+                extracted.append([int(start), int(end), value, f'tracker:{tracker_uuid}'])

-        # elif tracker_type == 'term': # TODO
-        #
-        # elif tracker_type == '':
-    return []
+        elif tracker_type == 'word' or tracker_type == 'set':
+            if tracker_type == 'set':
+                # Drop everything after the last ';' before splitting the words
+                tracker = tracker.rsplit(';', 1)[0]
+                words = tracker.split(',')
+            else:
+                words = [tracker]
+            for word in words:
+                regex = _get_word_regex(word)
+                regex_match = regex_helper.regex_finditer(r_key, regex, obj_id, content)
+                for match in regex_match:
+                    extracted.append([int(match[0]), int(match[1]), match[2], f'tracker:{tracker_uuid}'])

+    # Save In Cache
+    if extracted:
+        extracted_dump = json.dumps(extracted)
+        r_cache.set(f'extractor:cache:{obj_id}', extracted_dump)
+        r_cache.expire(f'extractor:cache:{obj_id}', 300)  # TODO Reduce CACHE ???????????????
+
+    return extracted
+
+# Type:subtype:id
+# tag:iban
+# tracker:uuid

 def extract(obj_id, content=None):
    item = Item(obj_id)
    if not content:
        content = item.get_content()
-    extracted = []

-    extracted = extracted + get_tracker_match(obj_id, content)
+    extracted = get_tracker_match(obj_id, content)

    # print(item.get_tags())
    for tag in item.get_tags():
@ -116,33 +152,70 @@ def extract(obj_id, content=None):
            if matches:
                extracted = extracted + matches

-    for obj_t in ['cve', 'cryptocurrency', 'username']: # Decoded, PGP->extract bloc
+    for obj_t in ['cve', 'cryptocurrency', 'username']:  # Decoded, PGP->extract bloc
        matches = get_correl_match(obj_t, obj_id, content)
        if matches:
            extracted = extracted + matches

-    from operator import itemgetter
-
+    # SORT By Start Pos
    extracted = sorted(extracted, key=itemgetter(0))
-    print(extracted)
+    # print(extracted)
    return extracted

+# TODO ADD LINK UI
+def get_extracted_by_match(extracted):
+    """Group extracted rows by the object string found in their 4th field.
+
+    Returns a dict keyed by that string. Each entry holds ``subtype``,
+    ``id``, ``icon``, ``link`` and ``matches`` (a list of
+    ``[start, end, value]``). Tags and trackers get a fixed icon and an
+    empty link; any other type asks ``ail_objects`` for both.
+    """
+    grouped = {}
+    for start, end, value, str_obj in extracted:
+        entry = grouped.get(str_obj)

-if __name__ == '__main__':
-    t0 = time.time()
-    obj_id = 'crawled/2022/09/15/circl.lu179c7903-5b21-452e-9f25-4b61d9934e2b'
-    obj_id = 'crawled/2022/09/15/circl.lu1e4f9721-06dc-404f-aabf-3c3bd0b533bd'
-    obj_id = 'submitted/2022/09/13/submitted_ba3ee771-c91c-4f50-9d6a-8558cdac7aeb.gz'
-    # obj_id = 'tests/2021/01/01/credit_cards.gz'
-    # obj_id = 'crawled/2020/07/20/circl.luc9301321-f1b1-4d91-9082-5eb452b946c5'
-    obj_id = 'submitted/2019/09/22/97172282-e4c2-4a1e-b82c-c4fb9490a56e.gz'
-    obj_id = 'submitted/2019/09/20/4fb7f02d-1241-4ef4-b17e-80ae76038835.gz'
+        if entry is None:
+            # First row for this object: build its description once
+            ob_type, row_id = str_obj.split(':', 1)
+            if ob_type == 'tag':  # TODO put me in object class
+                entry = {
+                    'subtype': 'tag',
+                    'id': row_id,
+                    'icon': {'style': 'fas', 'icon': '\uf02b', 'color': '#28a745', 'radius': 5},
+                    'link': '',
+                }
+            elif ob_type == 'tracker':  # TODO put me in object class
+                entry = {
+                    'subtype': 'tracker',
+                    'id': row_id,
+                    'icon': {'style': 'fas', 'icon': '\uf05b', 'color': '#ffc107', 'radius': 5},
+                    'link': '',
+                }
+            else:
+                subtype, sep, obj_id = row_id.partition(':')
+                if not sep:  # no subtype part: the whole remainder is the id
+                    subtype, obj_id = '', row_id
+                entry = {
+                    'subtype': subtype,
+                    'id': obj_id,
+                    'icon': ail_objects.get_object_svg(ob_type, subtype, obj_id),
+                    'link': ail_objects.get_object_link(ob_type, subtype, obj_id),
+                }

-    extract(obj_id)
+            entry['matches'] = []
+            grouped[str_obj] = entry

-    # get_obj_correl('cve', obj_id, content)
-    # r = get_tracker_match(obj_id, content)
-    # print(r)
+        entry['matches'].append([start, end, value])
+    return grouped

-    print(time.time() - t0)
+
+# if __name__ == '__main__':
+#     t0 = time.time()
+#     obj_id = 'crawled/2022/09/15/circl.lu179c7903-5b21-452e-9f25-4b61d9934e2b'
+#     obj_id = 'crawled/2022/09/15/circl.lu1e4f9721-06dc-404f-aabf-3c3bd0b533bd'
+#     obj_id = 'submitted/2022/09/13/submitted_ba3ee771-c91c-4f50-9d6a-8558cdac7aeb.gz'
+#     # obj_id = 'tests/2021/01/01/credit_cards.gz'
+#     # obj_id = 'crawled/2020/07/20/circl.luc9301321-f1b1-4d91-9082-5eb452b946c5'
+#     obj_id = 'submitted/2019/09/22/97172282-e4c2-4a1e-b82c-c4fb9490a56e.gz'
+#     obj_id = 'submitted/2019/09/20/4fb7f02d-1241-4ef4-b17e-80ae76038835.gz'
+#     obj_id = 'crawled/2023/02/21/circl.lu1c300acb-0cbe-480f-917e-9afe3ec958e8'
+#
+#     extract(obj_id)
+#
+#     # get_obj_correl('cve', obj_id, content)
+#     # r = get_tracker_match(obj_id, content)
+#     # print(r)
+#
+#     print(time.time() - t0)

--- a/bin/modules/CreditCards.py
+++ b/bin/modules/CreditCards.py
@ -64,7 +64,7 @@ class CreditCards(AbstractModule):
        for card in cards:
            start, end, value = card
            if self.get_valid_card(value):
-                extracted.append(card)
+                extracted.append([start, end, value, f'tag:{tag}'])
        return extracted

    def compute(self, message, r_result=False):
--- a/bin/modules/Iban.py
+++ b/bin/modules/Iban.py
@ -69,8 +69,7 @@ class Iban(AbstractModule):
            start, end, value = iban
            value = ''.join(e for e in value if e.isalnum())
            if self.is_valid_iban(value):
-                print(value)
-                extracted.append(iban)
+                extracted.append([start, end, value, f'tag:{tag}'])
        return extracted

    def compute(self, message):
--- a/bin/modules/Mail.py
+++ b/bin/modules/Mail.py
@ -130,7 +130,7 @@ class Mail(AbstractModule):
            mxdomains[mxdomain].append(mail)
        for mx in self.check_mx_record(mxdomains.keys()):
            for row in mxdomains[mx]:
-                extracted.append(row)
+                extracted.append([row[0], row[1], row[2], f'tag:{tag}'])
        return extracted

    # # TODO: sanitize mails
--- a/bin/modules/Onion.py
+++ b/bin/modules/Onion.py
@ -62,7 +62,7 @@ class Onion(AbstractModule):
            url_unpack = crawlers.unpack_url(value)
            domain = url_unpack['domain']
            if crawlers.is_valid_onion_domain(domain):
-                extracted.append(onion)
+                extracted.append([start, end, value, f'tag:{tag}'])
        return extracted

    def compute(self, message):
--- a/bin/modules/Tools.py
+++ b/bin/modules/Tools.py
@ -409,8 +409,12 @@ class Tools(AbstractModule):
        return TOOLS.keys()

    def extract(self, obj_id, content, tag):
+        """Return the rows matched by the tool regex named in ``tag``.
+
+        Each row from ``self.regex_finditer`` is copied as
+        ``[row[0], row[1], row[2], 'tag:<tag>']``.
+        """
+        # The tool name sits between the last two double quotes of the tag;
+        # the trailing 5 characters of that quoted value are dropped.
        tool_name = tag.rsplit('"', 2)[1][:-5]
-        return self.regex_finditer(TOOLS[tool_name]['regex'], obj_id, content)
+        found = self.regex_finditer(TOOLS[tool_name]['regex'], obj_id, content)
+        return [[row[0], row[1], row[2], f'tag:{tag}'] for row in found]

    def compute(self, message):
        item = Item(message)
--- a/var/www/blueprints/objects_item.py
+++ b/var/www/blueprints/objects_item.py
@ -67,7 +67,7 @@ def showItem():  # # TODO: support post
        abort(404)

    item = Item(item_id)
-    meta = item.get_meta(options=['content', 'crawler', 'duplicates', 'lines', 'size'])
+    meta = item.get_meta(options={'content', 'crawler', 'duplicates', 'lines', 'size'})

    meta['name'] = meta['id'].replace('/', ' / ')
    meta['father'] = item_basic.get_item_parent(item_id)
@ -76,11 +76,13 @@ def showItem():  # # TODO: support post
    meta['hive_case'] = Export.get_item_hive_cases(item_id)

    extracted = module_extractor.extract(item.id, content=meta['content'])
+    extracted_matches = module_extractor.get_extracted_by_match(extracted)

    return render_template("show_item.html", bootstrap_label=bootstrap_label,
-                            modal_add_tags=Tag.get_modal_add_tags(meta['id'], object_type='item'),
-                            is_hive_connected=Export.get_item_hive_cases(item_id),
-                            meta=meta, extracted=extracted)
+                           modal_add_tags=Tag.get_modal_add_tags(meta['id'], object_type='item'),
+                           is_hive_connected=Export.get_item_hive_cases(item_id),
+                           meta=meta,
+                           extracted=extracted, extracted_matches=extracted_matches)

    # kvrocks data

--- a/var/www/templates/objects/item/show_item.html
+++ b/var/www/templates/objects/item/show_item.html
@ -20,6 +20,9 @@
  <script src="{{ url_for('static', filename='js/tags.js') }}"></script>

  <style>
+    .popover{
+        max-width: 100%;
+    }
 	.rotate{
 		-moz-transition: all 0.1s linear;
 		-webkit-transition: all 0.1s linear;
@ -349,10 +352,62 @@
  {% endif %}


-  {% if extracted %}
-      {% for row in extracted %}
-          <div><a href="#{{ row[0] }}:{{ row[1] }}">{{ row[2] }}</a></div>
-      {% endfor  %}
+  {% if extracted_matches %}
+  <div id="accordion_extracted" class="mb-3 mx-3">
+      <div class="card">
+          <div class="card-header py-1" id="heading_extracted">
+              <div class="row">
+                  <div class="col-11">
+                      <div class="mt-2">
+                          <img id="misp-logo" src="{{ url_for('static', filename='image/ail-icon.png')}}" height="32"> Extracted&nbsp;&nbsp;
+                          <div class="badge badge-warning">{{extracted_matches|length}}</div>
+                      </div>
+                  </div>
+                  <div class="col-1">
+                      <button class="btn btn-link btn-lg py-2 float-right rotate down" data-toggle="collapse" data-target="#collapse_extracted" aria-expanded="true" aria-controls="collapse_extracted">
+                          <i class="fas fa-chevron-circle-down"></i>
+                      </button>
+                  </div>
+              </div>
+          </div>
+
+          <div id="collapse_extracted" class="collapse show" aria-labelledby="heading_extracted" data-parent="#accordion_extracted">
+              <div class="card-body">
+                  <table id="table_extracted" class="table table-striped">
+                      <thead class="thead-dark">
+                      <tr>
+                          <th>Type</th>
+                          <th>ID</th>
+                          <th>Extracted</th>
+                      </tr>
+                      </thead>
+                      <tbody>
+                      {% for match in extracted_matches %}
+                          <tr>
+                              <td>
+                                  <svg height="26" width="26">
+                                      <g class="nodes">
+                                          <circle cx="13" cy="13" r="13" fill="{{ extracted_matches[match]['icon']['color'] }}"></circle>
+                                          <text x="13" y="13" text-anchor="middle" dominant-baseline="central" class="graph_node_icon {{ extracted_matches[match]['icon']['style'] }}" font-size="16px">{{ extracted_matches[match]['icon']['icon'] }}</text>
+                                      </g>
+                                  </svg>
+                                  {{ extracted_matches[match]['subtype'] }}
+                              </td>
+                              <td>{{ extracted_matches[match]['id'] }}</td>
+                              <td>
+                                  {% for row in extracted_matches[match]['matches'] %}
+                                      <a href="#{{ row[0] }}:{{row[1] }}">{{ row[2] }}</a><br>
+                                  {% endfor %}
+                              </td>
+                          </tr>
+                      {% endfor %}
+                      </tbody>
+                  </table>
+              </div>
+          </div>
+
+      </div>
+  </div>
  {% endif %}


@ -382,7 +437,7 @@
            {% if not extracted %}
                <p class="my-0"> <pre class="border">{{ meta['content'] }}</pre></p>
            {% else %}
-                <p class="my-0"> <pre class="border">{{ meta['content'][:extracted[0][0]] }}{% for row in extracted %}<span class="hg-text" id="{{ row[0] }}:{{ row[1] }}">{{ meta['content'][row[0]:row[1]] }}</span>{% if loop.index + 1 > extracted|length %}{{ meta['content'][extracted[-1][1]:] }}{% else %}{{ meta['content'][row[1]:extracted[loop.index][0]] }}{% endif %}{% endfor %}</pre></p>
+                <p class="my-0"> <pre class="border">{{ meta['content'][:extracted[0][0]] }}{% for row in extracted %}<span class="hg-text" data-toggle="popover" data-trigger="hover" data-html="true" title="<svg height=&quot;26&quot; width=&quot;26&quot;><g class=&quot;nodes&quot;><circle cx=&quot;13&quot; cy=&quot;13&quot; r=&quot;13&quot; fill=&quot;{{ extracted_matches[row[3]]['icon']['color'] }}&quot;></circle><text x=&quot;13&quot; y=&quot;13&quot; text-anchor=&quot;middle&quot; dominant-baseline=&quot;central&quot; class=&quot;graph_node_icon {{ extracted_matches[row[3]]['icon']['style'] }}&quot; font-size=&quot;16px&quot;>{{ extracted_matches[row[3]]['icon']['icon'] }}</text></g></svg> {{ extracted_matches[row[3]]['subtype'] }}" data-content="{{ extracted_matches[row[3]]['id'] }}" id="{{ row[0] }}:{{ row[1] }}">{{ meta['content'][row[0]:row[1]] }}</span>{% if loop.index + 1 > extracted|length %}{{ meta['content'][extracted[-1][1]:] }}{% else %}{{ meta['content'][row[1]:extracted[loop.index][0]] }}{% endif %}{% endfor %}</pre></p>
            {% endif %}
        </div>
        <div class="tab-pane fade" id="pills-html2text" role="tabpanel" aria-labelledby="pills-html2text-tab">
@ -395,19 +450,22 @@
    </div>
  </div>

-  <script>
-      var ltags
-      var ltagsgalaxies
-      $(document).ready(function(){
-          $('#tableDup').DataTable();
+<script>
+    var ltags
+    var ltagsgalaxies
+    $(document).ready(function(){
+        $('#tableDup').DataTable();
          // $('#tableb64').DataTable({
    			// 	"aLengthMenu": [[5, 10, 15, -1], [5, 10, 15, "All"]],
    			// 	"iDisplayLength": 5,
    			// 	"order": [[ 1, "asc" ]]
    			// });
-          $(".rotate").click(function(){
-        	 $(this).toggleClass("down")  ;
-        	})
+        {% if extracted %}
+            $('#table_extracted').DataTable();
+            // Popovers are opt-in: initialise them here so the highlighted
+            // matches get their popover on every item, crawled or not.
+            $('[data-toggle="popover"]').popover();
+        {% endif %}
+        $(".rotate").click(function(){
+            $(this).toggleClass("down");
+        })
      });

      $('#pills-html2text-tab').on('shown.bs.tab', function (e) {
@ -418,7 +476,7 @@

        }
      });
-  </script>
+</script>

 {% if meta['crawler'] %}
  <script>
@ -470,6 +528,9 @@
  }

  blocks.addEventListener('change', pixelate, false);
+  $(function () {
+    $('[data-toggle="popover"]').popover()
+  })
  </script>
 {% endif %}