AIL-framework/bin/lib/module_extractor.py

#!/usr/bin/env python3
# -*-coding:UTF-8 -*
import json
import logging
import os
import sys

import yara

from hashlib import sha256
from operator import itemgetter

sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from lib.objects import ail_objects
from lib.objects.Items import Item
from lib.objects.Titles import Title
from lib import correlations_engine
from lib import regex_helper
from lib.ConfigLoader import ConfigLoader

from lib import Tracker

from modules.CreditCards import CreditCards
from modules.Iban import Iban
from modules.Mail import Mail
from modules.Onion import Onion
from modules.Phone import Phone
from modules.Tools import Tools

logger = logging.getLogger()

config_loader = ConfigLoader()
r_cache = config_loader.get_redis_conn("Redis_Cache")
config_loader = None

r_key = regex_helper.generate_redis_cache_key('extractor')

# TODO UI Link

MODULES = {
    'infoleak:automatic-detection="credit-card"': CreditCards(queue=False),
    'infoleak:automatic-detection="iban"': Iban(queue=False),
    'infoleak:automatic-detection="mail"': Mail(queue=False),
    'infoleak:automatic-detection="onion"': Onion(queue=False),
    'infoleak:automatic-detection="phone-number"': Phone(queue=False),
    # APIkey ???
    # Credentials
    # Zerobins
    # CERTIFICATE + KEYS ???
    # SQL Injetction / Libinjection ???

}
tools = Tools(queue=False)
for tool_name in tools.get_tools():
    MODULES[f'infoleak:automatic-detection="{tool_name}-tool"'] = tools

def get_correl_match(extract_type, obj_id, content):
    extracted = []
    correl = correlations_engine.get_correlation_by_correl_type('item', '', obj_id, extract_type)
    to_extract = []
    map_subtype = {}
    map_value_id = {}
    for c in correl:
        subtype, value = c.split(':', 1)
        if extract_type == 'title':
            title = Title(value).get_content()
            to_extract.append(title)
            sha256_val = sha256(title.encode()).hexdigest()
        else:
            map_subtype[value] = subtype
            to_extract.append(value)
            sha256_val = sha256(value.encode()).hexdigest()
        map_value_id[sha256_val] = value
    if to_extract:
        objs = regex_helper.regex_finditer(r_key, '|'.join(to_extract), obj_id, content)
        for obj in objs:
            if map_subtype.get(obj[2]):
                subtype = map_subtype[obj[2]]
            else:
                subtype = ''
                sha256_val = sha256(obj[2].encode()).hexdigest()
            value_id = map_value_id.get(sha256_val)
            if not value_id:
                logger.critical(f'Error module extractor: {sha256_val}\n{extract_type}\n{subtype}\n{value_id}\n{map_value_id}\n{objs}')
                value_id = 'ERROR'
            extracted.append([obj[0], obj[1], obj[2], f'{extract_type}:{subtype}:{value_id}'])
    return extracted

def _get_yara_match(data):
    for string_match in data.get('strings'):
        for string_match_instance in string_match.instances:
            start = string_match_instance.offset
            value = string_match_instance.matched_data.decode()
            end = start + string_match_instance.matched_length
            r_cache.sadd(f'extractor:yara:match:{r_key}', f'{start}:{end}:{value}')
            r_cache.expire(f'extractor:yara:match:{r_key}', 300)
    return yara.CALLBACK_CONTINUE

def _get_word_regex(word):
    return '(?:^|(?<=[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]))' + word + '(?:$|(?=[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]))'

def convert_byte_offset_to_string(b_content, offset):
    byte_chunk = b_content[:offset + 1]
    try:
        string_chunk = byte_chunk.decode()
        offset = len(string_chunk) - 1
        return offset
    except UnicodeDecodeError as e:
        logger.error(f'Yara offset converter error, {str(e)}\n{offset}/{len(b_content)}')
        return convert_byte_offset_to_string(b_content, offset - 1)


# TODO RETRO HUNTS
# TODO TRACKER TYPE IN UI
def get_tracker_match(obj_id, content):
    extracted = []
    extracted_yara = []
    trackers = Tracker.get_obj_trackers('item', '', obj_id)
    for tracker_uuid in trackers:
        tracker = Tracker.Tracker(tracker_uuid)
        tracker_type = tracker.get_type()
        # print(tracker_type)
        tracked = tracker.get_tracked()
        if tracker_type == 'regex':  # TODO Improve word detection -> word delimiter
            regex_match = regex_helper.regex_finditer(r_key, tracked, obj_id, content)
            for match in regex_match:
                extracted.append([int(match[0]), int(match[1]), match[2], f'tracker:{tracker.uuid}'])
        elif tracker_type == 'yara':
            rule = tracker.get_rule()
            rule.match(data=content.encode(), callback=_get_yara_match,
                       which_callbacks=yara.CALLBACK_MATCHES, timeout=30)
            yara_match = r_cache.smembers(f'extractor:yara:match:{r_key}')
            r_cache.delete(f'extractor:yara:match:{r_key}')
            extracted = []
            for match in yara_match:
                start, end, value = match.split(':', 2)
                extracted_yara.append([int(start), int(end), value, f'tracker:{tracker.uuid}'])

        elif tracker_type == 'word' or tracker_type == 'set':
            if tracker_type == 'set':
                tracked = tracked.rsplit(';', 1)[0]
                words = tracked.split(',')
            else:
                words = [tracked]
            for word in words:
                regex = _get_word_regex(word)
                regex_match = regex_helper.regex_finditer(r_key, regex, obj_id, content)
                # print(regex_match)
                for match in regex_match:
                    extracted.append([int(match[0]), int(match[1]), match[2], f'tracker:{tracker.uuid}'])

    # Retro Hunt
    retro_hunts = Tracker.get_obj_retro_hunts('item', '', obj_id)
    for retro_uuid in retro_hunts:
        retro_hunt = Tracker.RetroHunt(retro_uuid)
        rule = retro_hunt.get_rule(r_compile=True)
        rule.match(data=content.encode(), callback=_get_yara_match,
                   which_callbacks=yara.CALLBACK_MATCHES, timeout=30)
        yara_match = r_cache.smembers(f'extractor:yara:match:{r_key}')
        r_cache.delete(f'extractor:yara:match:{r_key}')
        extracted = []
        for match in yara_match:
            start, end, value = match.split(':', 2)
            extracted_yara.append([int(start), int(end), value, f'retro_hunt:{retro_hunt.uuid}'])

    # Convert byte offset to string offset
    if extracted_yara:
        b_content = content.encode()
        if len(b_content) == len(content):
            extracted[0:0] = extracted_yara
        else:
            for yara_m in extracted_yara:
                start = convert_byte_offset_to_string(b_content, yara_m[0])
                end = convert_byte_offset_to_string(b_content, yara_m[1])
                extracted.append([int(start), int(end), yara_m[2], yara_m[3]])

    return extracted

# Type:subtype:id
# tag:iban
# tracker:uuid

def extract(obj_id, content=None):
    item = Item(obj_id)
    if not item.exists():
        return []

    # CHECK CACHE
    cached = r_cache.get(f'extractor:cache:{obj_id}')
    # cached = None
    if cached:
        r_cache.expire(f'extractor:cache:{obj_id}', 300)
        return json.loads(cached)

    if not content:
        content = item.get_content()

    extracted = get_tracker_match(obj_id, content)

    # print(item.get_tags())
    for tag in item.get_tags():
        if MODULES.get(tag):
            # print(tag)
            module = MODULES.get(tag)
            matches = module.extract(obj_id, content, tag)
            if matches:
                extracted = extracted + matches

    for obj_t in ['cve', 'cryptocurrency', 'title', 'username']:  # Decoded, PGP->extract bloc
        matches = get_correl_match(obj_t, obj_id, content)
        if matches:
            extracted = extracted + matches

    # SORT By Start Pos
    extracted = sorted(extracted, key=itemgetter(0))
    # print(extracted)

    # Save In Cache
    if extracted:
        extracted_dump = json.dumps(extracted)
        r_cache.set(f'extractor:cache:{obj_id}', extracted_dump)
        r_cache.expire(f'extractor:cache:{obj_id}', 300)  # TODO Reduce CACHE ???????????????

    return extracted

# TODO ADD LINK UI
def get_extracted_by_match(extracted):
    matches = {}
    for start, end, value, str_obj in extracted:

        if str_obj not in matches:
            matches[str_obj] = {}
            ob_type, row_id = str_obj.split(':', 1)
            if ob_type == 'tag':  # TODO put me in object class
                matches[str_obj]['subtype'] = 'tag'
                matches[str_obj]['id'] = row_id
                matches[str_obj]['icon'] = {'style': 'fas', 'icon': '\uf02b', 'color': '#28a745', 'radius': 5}
                matches[str_obj]['link'] = ''
            elif ob_type == 'tracker':  # TODO put me in object class
                matches[str_obj]['subtype'] = 'tracker'
                matches[str_obj]['id'] = row_id
                matches[str_obj]['icon'] = {'style': 'fas', 'icon': '\uf05b', 'color': '#ffc107', 'radius': 5}
                matches[str_obj]['link'] = ''
            elif ob_type == 'retro_hunt':  # TODO put me in object class
                matches[str_obj]['subtype'] = 'retro_hunt'
                matches[str_obj]['id'] = row_id
                matches[str_obj]['icon'] = {'style': 'fas', 'icon': '\uf05b', 'color': '#008107', 'radius': 5}
                matches[str_obj]['link'] = ''
            else:
                row_id = row_id.split(':', 1)
                if len(row_id) == 2:
                    subtype = row_id[0]
                    obj_id = row_id[1]
                else:
                    subtype = ''
                    obj_id = row_id[0]
                matches[str_obj]['subtype'] = subtype
                matches[str_obj]['id'] = obj_id
                matches[str_obj]['icon'] = ail_objects.get_object_svg(ob_type, subtype, obj_id)
                matches[str_obj]['link'] = ail_objects.get_object_link(ob_type, subtype, obj_id)

            matches[str_obj]['matches'] = []

        match = [start, end, value]
        matches[str_obj]['matches'].append(match)
    return matches


# if __name__ == '__main__':
#     t0 = time.time()
#     obj_id = 'crawled/2022/09/15/circl.lu179c7903-5b21-452e-9f25-4b61d9934e2b'
#     obj_id = 'crawled/2022/09/15/circl.lu1e4f9721-06dc-404f-aabf-3c3bd0b533bd'
#     obj_id = 'submitted/2022/09/13/submitted_ba3ee771-c91c-4f50-9d6a-8558cdac7aeb.gz'
#     # obj_id = 'tests/2021/01/01/credit_cards.gz'
#     # obj_id = 'crawled/2020/07/20/circl.luc9301321-f1b1-4d91-9082-5eb452b946c5'
#     obj_id = 'submitted/2019/09/22/97172282-e4c2-4a1e-b82c-c4fb9490a56e.gz'
#     obj_id = 'submitted/2019/09/20/4fb7f02d-1241-4ef4-b17e-80ae76038835.gz'
#     obj_id = 'crawled/2023/02/21/circl.lu1c300acb-0cbe-480f-917e-9afe3ec958e8'
#
#     extract(obj_id)
#
#     # get_obj_correl('cve', obj_id, content)
#     # r = get_tracker_match(obj_id, content)
#     # print(r)
#
#     print(time.time() - t0)