AIL-framework/bin/lib/module_extractor.py

248 lines
8.9 KiB
Python
Executable File

#!/usr/bin/env python3
# -*-coding:UTF-8 -*
import json
import os
import sys
import time
import yara
from operator import itemgetter
sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from lib.objects import ail_objects
from lib.objects.Items import Item
from lib import correlations_engine
from lib import regex_helper
from lib.ConfigLoader import ConfigLoader
from lib import Tracker
from modules.CreditCards import CreditCards
from modules.Iban import Iban
from modules.Mail import Mail
from modules.Onion import Onion
from modules.Tools import Tools
config_loader = ConfigLoader()
r_cache = config_loader.get_redis_conn("Redis_Cache")
config_loader = None
r_key = regex_helper.generate_redis_cache_key('extractor')
# TODO UI Link
MODULES = {
'infoleak:automatic-detection="credit-card"': CreditCards(queue=False),
'infoleak:automatic-detection="iban"': Iban(queue=False),
'infoleak:automatic-detection="mail"': Mail(queue=False),
'infoleak:automatic-detection="onion"': Onion(queue=False),
# APIkey ???
# Credentials
# Zerobins
# CERTIFICATE + KEYS ???
# SQL Injetction / Libinjection ???
}
tools = Tools(queue=False)
for tool_name in tools.get_tools():
MODULES[f'infoleak:automatic-detection="{tool_name}-tool"'] = tools
def get_correl_match(extract_type, obj_id, content):
extracted = []
correl = correlations_engine.get_correlation_by_correl_type('item', '', obj_id, extract_type)
to_extract = []
map_subtype = {}
for c in correl:
subtype, value = c.split(':', 1)
map_subtype[value] = subtype
to_extract.append(value)
if to_extract:
objs = regex_helper.regex_finditer(r_key, '|'.join(to_extract), obj_id, content)
for obj in objs:
if map_subtype[obj[2]]:
subtype = map_subtype[obj[2]]
else:
subtype = ''
extracted.append([obj[0], obj[1], obj[2], f'{extract_type}:{subtype}:{obj[2]}'])
return extracted
def _get_yara_match(data):
for string_match in data.get('strings'):
for string_match_instance in string_match.instances:
start = string_match_instance.offset
value = string_match_instance.matched_data.decode()
end = start + string_match_instance.matched_length
r_cache.sadd(f'extractor:yara:match:{r_key}', f'{start}:{end}:{value}')
r_cache.expire(f'extractor:yara:match:{r_key}', 300)
return yara.CALLBACK_CONTINUE
def _get_word_regex(word):
return '(?:^|(?<=[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]))' + word + '(?:$|(?=[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]))'
def convert_byte_offset_to_string(b_content, offset):
byte_chunk = b_content[:offset + 1]
string_chunk = byte_chunk.decode()
offset = len(string_chunk) - 1
return offset
# TODO RETRO HUNTS
# TODO TRACKER TYPE IN UI
def get_tracker_match(obj_id, content):
extracted = []
extracted_yara = []
trackers = Tracker.get_obj_trackers('item', '', obj_id)
for tracker_uuid in trackers:
tracker = Tracker.Tracker(tracker_uuid)
tracker_type = tracker.get_type()
# print(tracker_type)
tracked = tracker.get_tracked()
if tracker_type == 'regex': # TODO Improve word detection -> word delimiter
regex_match = regex_helper.regex_finditer(r_key, tracked, obj_id, content)
for match in regex_match:
extracted.append([int(match[0]), int(match[1]), match[2], f'tracker:{tracker.uuid}'])
elif tracker_type == 'yara':
rule = tracker.get_rule()
rule.match(data=content.encode(), callback=_get_yara_match,
which_callbacks=yara.CALLBACK_MATCHES, timeout=30)
yara_match = r_cache.smembers(f'extractor:yara:match:{r_key}')
r_cache.delete(f'extractor:yara:match:{r_key}')
extracted = []
for match in yara_match:
start, end, value = match.split(':', 2)
extracted_yara.append([int(start), int(end), value, f'tracker:{tracker.uuid}'])
elif tracker_type == 'word' or tracker_type == 'set':
if tracker_type == 'set':
tracked = tracked.rsplit(';', 1)[0]
words = tracked.split(',')
else:
words = [tracked]
for word in words:
regex = _get_word_regex(word)
regex_match = regex_helper.regex_finditer(r_key, regex, obj_id, content)
# print(regex_match)
for match in regex_match:
extracted.append([int(match[0]), int(match[1]), match[2], f'tracker:{tracker.uuid}'])
# Convert byte offset to string offset
if extracted_yara:
b_content = content.encode()
if len(b_content) == len(content):
extracted[0:0] = extracted_yara
else:
for yara_m in extracted_yara:
start = convert_byte_offset_to_string(b_content, yara_m[0])
end = convert_byte_offset_to_string(b_content, yara_m[1])
extracted.append([int(start), int(end), yara_m[2], yara_m[3]])
return extracted
# Type:subtype:id
# tag:iban
# tracker:uuid
def extract(obj_id, content=None):
item = Item(obj_id)
if not item.exists():
return []
# CHECK CACHE
cached = r_cache.get(f'extractor:cache:{obj_id}')
if cached:
r_cache.expire(f'extractor:cache:{obj_id}', 300)
return json.loads(cached)
if not content:
content = item.get_content()
extracted = get_tracker_match(obj_id, content)
# print(item.get_tags())
for tag in item.get_tags():
if MODULES.get(tag):
# print(tag)
module = MODULES.get(tag)
matches = module.extract(obj_id, content, tag)
if matches:
extracted = extracted + matches
for obj_t in ['cve', 'cryptocurrency', 'username']: # Decoded, PGP->extract bloc
matches = get_correl_match(obj_t, obj_id, content)
if matches:
extracted = extracted + matches
# SORT By Start Pos
extracted = sorted(extracted, key=itemgetter(0))
# print(extracted)
# Save In Cache
if extracted:
extracted_dump = json.dumps(extracted)
r_cache.set(f'extractor:cache:{obj_id}', extracted_dump)
r_cache.expire(f'extractor:cache:{obj_id}', 300) # TODO Reduce CACHE ???????????????
return extracted
# TODO ADD LINK UI
def get_extracted_by_match(extracted):
matches = {}
for start, end, value, str_obj in extracted:
if str_obj not in matches:
matches[str_obj] = {}
ob_type, row_id = str_obj.split(':', 1)
if ob_type == 'tag': # TODO put me in object class
matches[str_obj]['subtype'] = 'tag'
matches[str_obj]['id'] = row_id
matches[str_obj]['icon'] = {'style': 'fas', 'icon': '\uf02b', 'color': '#28a745', 'radius': 5}
matches[str_obj]['link'] = ''
elif ob_type == 'tracker': # TODO put me in object class
matches[str_obj]['subtype'] = 'tracker'
matches[str_obj]['id'] = row_id
matches[str_obj]['icon'] = {'style': 'fas', 'icon': '\uf05b', 'color': '#ffc107', 'radius': 5}
matches[str_obj]['link'] = ''
else:
row_id = row_id.split(':', 1)
if len(row_id) == 2:
subtype = row_id[0]
obj_id = row_id[1]
else:
subtype = ''
obj_id = row_id[0]
matches[str_obj]['subtype'] = subtype
matches[str_obj]['id'] = obj_id
matches[str_obj]['icon'] = ail_objects.get_object_svg(ob_type, subtype, obj_id)
matches[str_obj]['link'] = ail_objects.get_object_link(ob_type, subtype, obj_id)
matches[str_obj]['matches'] = []
match = [start, end, value]
matches[str_obj]['matches'].append(match)
return matches
# if __name__ == '__main__':
# t0 = time.time()
# obj_id = 'crawled/2022/09/15/circl.lu179c7903-5b21-452e-9f25-4b61d9934e2b'
# obj_id = 'crawled/2022/09/15/circl.lu1e4f9721-06dc-404f-aabf-3c3bd0b533bd'
# obj_id = 'submitted/2022/09/13/submitted_ba3ee771-c91c-4f50-9d6a-8558cdac7aeb.gz'
# # obj_id = 'tests/2021/01/01/credit_cards.gz'
# # obj_id = 'crawled/2020/07/20/circl.luc9301321-f1b1-4d91-9082-5eb452b946c5'
# obj_id = 'submitted/2019/09/22/97172282-e4c2-4a1e-b82c-c4fb9490a56e.gz'
# obj_id = 'submitted/2019/09/20/4fb7f02d-1241-4ef4-b17e-80ae76038835.gz'
# obj_id = 'crawled/2023/02/21/circl.lu1c300acb-0cbe-480f-917e-9afe3ec958e8'
#
# extract(obj_id)
#
# # get_obj_correl('cve', obj_id, content)
# # r = get_tracker_match(obj_id, content)
# # print(r)
#
# print(time.time() - t0)