AIL-framework/bin/lib/module_extractor.py

#!/usr/bin/env python3
# -*-coding:UTF-8 -*
import json
import logging
import os
import re
import sys

import yara

from hashlib import sha256
from operator import itemgetter

sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from lib.objects import ail_objects
from lib.objects.Items import Item
from lib.objects.Titles import Title
from lib import correlations_engine
from lib import regex_helper
from lib.ConfigLoader import ConfigLoader

from lib import Tracker

from modules.CreditCards import CreditCards
from modules.Iban import Iban
from modules.Mail import Mail
from modules.Onion import Onion
from modules.Phone import Phone
from modules.Tools import Tools

# Module-wide logger (root logger: getLogger() is called without a name).
logger = logging.getLogger()

# Redis connection used for the extractor result cache and for the temporary yara match set.
config_loader = ConfigLoader()
r_cache = config_loader.get_redis_conn("Redis_Cache")
# The loader is only needed to obtain the connection; drop the reference afterwards.
config_loader = None

# Key passed to regex_helper.regex_finditer() and used as suffix of the
# 'extractor:yara:match:<r_key>' Redis set written by _get_yara_match().
r_key = regex_helper.generate_redis_cache_key('extractor')

# TODO UI Link

# Correlation types searched for in an object's content, keyed by the type of the object
# being extracted. extract() iterates the list matching obj.type and calls
# get_correl_match() for each entry.
CORRELATION_TO_EXTRACT = {
    'item': ['cve', 'cryptocurrency', 'title', 'username'],
    'message': ['cve', 'cryptocurrency', 'username']
}

# Maps a tag to the module instance able to re-extract the matching strings.
# extract() looks up every tag of the object here and calls module.extract(obj, content, tag).
# NOTE(review): queue=False presumably instantiates the module without its processing queue - confirm.
MODULES = {
    'infoleak:automatic-detection="credit-card"': CreditCards(queue=False),
    'infoleak:automatic-detection="iban"': Iban(queue=False),
    'infoleak:automatic-detection="mail"': Mail(queue=False),
    'infoleak:automatic-detection="onion"': Onion(queue=False),
    'infoleak:automatic-detection="phone-number"': Phone(queue=False),
    # Candidates not handled yet:
    # APIkey ???
    # Credentials
    # Zerobins
    # CERTIFICATE + KEYS ???
    # SQL Injection / Libinjection ???

}
# A single Tools instance serves every '<tool_name>-tool' tag.
tools = Tools(queue=False)
for tool_name in tools.get_tools():
    MODULES[f'infoleak:automatic-detection="{tool_name}-tool"'] = tools

def merge_overlap(extracted):
    """Merge overlapping (or touching) matches into single spans.

    :param extracted: matches as ``(start, end, matched_string, obj_ref)`` sequences,
                      already sorted by ``start``.
    :return: list of ``(start, end, merged_string, refs)`` tuples, where ``refs`` is a list of
             ``(obj_ref, matched_string)`` pairs, one per original match folded into the span.
             An empty input returns an empty list.
    """
    # Nothing matched: avoid indexing extracted[0] on an empty list
    if not extracted:
        return []

    merged = []
    curr_start, curr_end, curr_string_match, curr_obj_ref = extracted[0]
    curr_obj_ref = [(curr_obj_ref, curr_string_match)]

    for start, end, mstring, ref in extracted[1:]:
        # overlap
        if start <= curr_end:
            # Only append the part of the new match that extends past the current span
            curr_string_match += mstring[curr_end - start:]
            curr_end = max(curr_end, end)
            curr_obj_ref.append((ref, mstring))
        else:
            merged.append((curr_start, curr_end, curr_string_match, curr_obj_ref))
            curr_start, curr_end, curr_string_match, curr_obj_ref = start, end, mstring, [(ref, mstring)]

    # Flush the last open span
    merged.append((curr_start, curr_end, curr_string_match, curr_obj_ref))
    return merged

def get_correl_match(extract_type, obj, content):
    """Locate in ``content`` the values of the objects correlated with ``obj``.

    :param extract_type: correlation type to search for (e.g. 'cve', 'title', 'username').
    :param obj: object whose correlations are looked up (uses ``type``, ``id``,
                ``get_subtype()`` and ``get_global_id()``).
    :param content: text to search in.
    :return: list of ``[start, end, matched_string, '<extract_type>:<subtype>:<id>']``.
             For titles only the first match is kept.
    """
    correl = correlations_engine.get_correlation_by_correl_type(obj.type, obj.get_subtype(r_str=True), obj.id, extract_type)
    to_extract = []
    map_subtype = {}
    map_value_id = {}
    for c in correl:
        subtype, value = c.split(':', 1)
        if extract_type == 'title':
            # Titles are searched by their content, not by their id
            searched = Title(value).get_content()
        else:
            map_subtype[value] = subtype
            searched = value
        to_extract.append(searched)
        # The hash of the searched string lets a match be mapped back to the object id
        map_value_id[sha256(searched.encode()).hexdigest()] = value

    if not to_extract:
        return []

    # Values are literal strings, not patterns: escape them so regex metacharacters
    # (e.g. '.', '+', '(', '|') in a title or username cannot alter or break the regex
    regex = '|'.join(re.escape(searched) for searched in to_extract)
    objs = regex_helper.regex_finditer(r_key, regex, obj.get_global_id(), content)
    if extract_type == 'title' and objs:
        objs = [objs[0]]

    extracted = []
    for ob in objs:
        subtype = map_subtype.get(ob[2]) or ''
        # Always hash the current match: reusing the hash left over from a previous
        # iteration attributes the match to the wrong object id
        sha256_val = sha256(ob[2].encode()).hexdigest()
        value_id = map_value_id.get(sha256_val)
        if not value_id:
            logger.critical(f'Error module extractor: {sha256_val}\n{extract_type}\n{subtype}\n{value_id}\n{map_value_id}\n{objs}')
            value_id = 'ERROR'
        extracted.append([ob[0], ob[1], ob[2], f'{extract_type}:{subtype}:{value_id}'])
    return extracted

def _get_yara_match(data):
    """yara match callback: record every matched string instance in the Redis cache.

    Each instance is stored as ``'<start>:<end>:<value>'`` in the set
    ``extractor:yara:match:<r_key>``, whose expiry is reset to 300 seconds on every write.

    :param data: match data passed by yara; its 'strings' entry is iterated.
    :return: ``yara.CALLBACK_CONTINUE``.
    """
    cache_key = f'extractor:yara:match:{r_key}'
    for matched_string in data.get('strings'):
        for instance in matched_string.instances:
            begin = instance.offset
            text = instance.matched_data.decode()
            finish = begin + instance.matched_length
            r_cache.sadd(cache_key, f'{begin}:{finish}:{text}')
            r_cache.expire(cache_key, 300)
    return yara.CALLBACK_CONTINUE

def _get_word_regex(word):
    return '(?i)(?:^|(?<=[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]))' + word + '(?:$|(?=[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]))'

def convert_byte_offset_to_string(b_content, offset):
    """Convert an offset in encoded bytes into the matching offset in the decoded string.

    The result is the number of complete characters located before ``offset``, so it is
    valid both for a match start and for an exclusive match end (including the end of
    the content).

    :param b_content: encoded content (bytes).
    :param offset: byte offset inside ``b_content``.
    :return: character offset in the decoded string.
    """
    byte_chunk = b_content[:offset]
    try:
        return len(byte_chunk.decode())
    except UnicodeDecodeError as e:
        # The offset falls inside a multi-byte character: ignore the truncated
        # character so the offset maps to the character that contains it
        logger.error(f'Yara offset converter error, {str(e)}\n{offset}/{len(b_content)}')
        return len(byte_chunk.decode(errors='ignore'))


# TODO RETRO HUNTS
# TODO TRACKER TYPE IN UI
def _get_yara_rule_match(rule, content, obj_ref):
    """Run a yara rule on ``content`` and return the matches recorded by _get_yara_match().

    :return: list of ``[start, end, value, obj_ref]``; offsets are byte offsets in the
             encoded content.
    """
    rule.match(data=content.encode(), callback=_get_yara_match,
               which_callbacks=yara.CALLBACK_MATCHES, timeout=30)
    cache_key = f'extractor:yara:match:{r_key}'
    yara_match = r_cache.smembers(cache_key)
    r_cache.delete(cache_key)
    matches = []
    for match in yara_match:
        # value may itself contain ':', so split on the first two separators only
        start, end, value = match.split(':', 2)
        matches.append([int(start), int(end), value, obj_ref])
    return matches


def get_tracker_match(obj, content):
    """Locate in ``content`` the strings matched by the trackers and retro hunts of ``obj``.

    :param obj: object whose trackers / retro hunts are looked up.
    :param content: text to search in.
    :return: list of ``[start, end, matched_string, ref]`` with string offsets, where ``ref``
             is ``'tracker:<uuid>'`` or ``'retro_hunt:<uuid>'``.
    """
    extracted = []
    extracted_yara = []
    obj_gid = obj.get_global_id()
    trackers = Tracker.get_obj_trackers(obj.type, obj.get_subtype(r_str=True), obj.id)
    for tracker_uuid in trackers:
        tracker = Tracker.Tracker(tracker_uuid)
        tracker_type = tracker.get_type()
        tracked = tracker.get_tracked()
        obj_ref = f'tracker:{tracker.uuid}'
        if tracker_type == 'regex':  # TODO Improve word detection -> word delimiter
            regex_match = regex_helper.regex_finditer(r_key, tracked, obj_gid, content)
            for match in regex_match:
                extracted.append([int(match[0]), int(match[1]), match[2], obj_ref])
        elif tracker_type == 'yara':
            # Matches of the other trackers collected so far must be kept:
            # yara matches go to their own list until their offsets are converted
            extracted_yara.extend(_get_yara_rule_match(tracker.get_rule(), content, obj_ref))
        elif tracker_type == 'word' or tracker_type == 'set':
            if tracker_type == 'set':
                # NOTE(review): the part after the last ';' is presumably the set
                # threshold, only the comma separated words are searched - confirm
                tracked = tracked.rsplit(';', 1)[0]
                words = tracked.split(',')
            else:
                words = [tracked]
            for word in words:
                regex = _get_word_regex(word)
                regex_match = regex_helper.regex_finditer(r_key, regex, obj_gid, content)
                for match in regex_match:
                    extracted.append([int(match[0]), int(match[1]), match[2], obj_ref])

    # Retro Hunt
    retro_hunts = Tracker.get_obj_retro_hunts(obj.type, obj.get_subtype(r_str=True), obj.id)
    for retro_uuid in retro_hunts:
        retro_hunt = Tracker.RetroHunt(retro_uuid)
        rule = retro_hunt.get_rule(r_compile=True)
        extracted_yara.extend(_get_yara_rule_match(rule, content, f'retro_hunt:{retro_hunt.uuid}'))

    # Convert byte offset to string offset
    if extracted_yara:
        b_content = content.encode()
        if len(b_content) == len(content):
            # Same length once encoded: byte offsets and string offsets are identical
            extracted[0:0] = extracted_yara
        else:
            for yara_m in extracted_yara:
                start = convert_byte_offset_to_string(b_content, yara_m[0])
                end = convert_byte_offset_to_string(b_content, yara_m[1])
                extracted.append([int(start), int(end), yara_m[2], yara_m[3]])

    return extracted

# Type:subtype:id
# tag:iban
# tracker:uuid
# def extract(obj_id, content=None):
def extract(obj_type, subtype, obj_id, content=None):
    """Return the merged list of strings to highlight in an object's content.

    Matches come from the object's trackers / retro hunts, from the modules registered in
    MODULES for the object's tags, and from its correlations (CORRELATION_TO_EXTRACT).
    A non-empty result is cached in Redis for 300 seconds.

    :param obj_type: object type.
    :param subtype: object subtype.
    :param obj_id: object id.
    :param content: content to search in; fetched from the object when empty.
    :return: matches sorted by start position with overlapping ones merged (see merge_overlap()),
             or an empty list if the object does not exist or nothing matched.
    """
    obj = ail_objects.get_object(obj_type, subtype, obj_id)
    if not obj.exists():
        return []
    obj_gid = obj.get_global_id()
    cache_key = f'extractor:cache:{obj_gid}'

    # CHECK CACHE
    cached = r_cache.get(cache_key)
    if cached:
        # Cache hit: extend its lifetime
        r_cache.expire(cache_key, 300)
        return json.loads(cached)

    if not content:
        content = obj.get_content()

    extracted = get_tracker_match(obj, content)

    for tag in obj.get_tags():
        module = MODULES.get(tag)
        if module:
            matches = module.extract(obj, content, tag)
            if matches:
                extracted = extracted + matches

    # Object types without configured correlations are skipped instead of raising KeyError
    for obj_t in CORRELATION_TO_EXTRACT.get(obj.type, []):
        matches = get_correl_match(obj_t, obj, content)
        if matches:
            extracted = extracted + matches

    # Nothing matched: nothing to sort, merge or cache
    if not extracted:
        return []

    # SORT By Start Pos
    extracted = sorted(extracted, key=itemgetter(0))
    extracted = merge_overlap(extracted)

    # Save In Cache, value and expiry set in a single command
    r_cache.set(cache_key, json.dumps(extracted), ex=300)  # TODO Reduce CACHE ???????????????

    return extracted

# TODO ADD LINK UI
def get_extracted_by_match(extracted):
    """Group merged matches by the object / tag / tracker that produced them.

    :param extracted: merged matches as ``(start, end, value, refs)``, where ``refs`` is a
                      list of ``(obj_ref, matched_string)`` pairs.
    :return: dict keyed by ``obj_ref``; each entry holds 'subtype', 'id', 'icon', 'link'
             and 'matches', a list of ``[start, end, matched_string]``.
    """
    # Fixed icons for references that are not AIL objects  # TODO put me in object class
    static_icons = {
        'tag': {'style': 'fas', 'icon': '\uf02b', 'color': '#28a745', 'radius': 5},
        'tracker': {'style': 'fas', 'icon': '\uf05b', 'color': '#ffc107', 'radius': 5},
        'retro_hunt': {'style': 'fas', 'icon': '\uf05b', 'color': '#008107', 'radius': 5},
    }

    by_ref = {}
    for start, end, value, raw_objs in extracted:
        for ref, matched in raw_objs:
            entry = by_ref.get(ref)
            if entry is None:
                # First time this reference is seen: build its description
                entry = by_ref[ref] = {}
                kind, remainder = ref.split(':', 1)
                if kind in static_icons:
                    entry['subtype'] = kind
                    entry['id'] = remainder
                    entry['icon'] = dict(static_icons[kind])
                    entry['link'] = ''
                else:
                    # AIL object reference: '<type>:<subtype>:<id>' or '<type>:<id>'
                    head, separator, tail = remainder.partition(':')
                    if separator:
                        subtype, obj_id = head, tail
                    else:
                        subtype, obj_id = '', head
                    entry['subtype'] = subtype
                    entry['id'] = obj_id
                    entry['icon'] = ail_objects.get_object_svg(kind, subtype, obj_id)
                    entry['link'] = ail_objects.get_object_link(kind, subtype, obj_id)
                entry['matches'] = []

            entry['matches'].append([start, end, matched])
    return by_ref


# if __name__ == '__main__':
#     t0 = time.time()
#     obj_id = 'crawled/2023/02/21/circl.lu1c300acb-0cbe-480f-917e-9afe3ec958e8'
#     extract(obj_id)
#
#     # get_obj_correl('cve', obj_id, content)
#     # r = get_tracker_match(obj_id, content)
#     # print(r)
#
#     print(time.time() - t0)