AIL-framework/bin/lib/module_extractor.py

272 lines
10 KiB
Python
Executable File

#!/usr/bin/env python3
# -*-coding:UTF-8 -*
import json
import logging
import os
import sys
import yara
from hashlib import sha256
from operator import itemgetter
sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from lib.objects import ail_objects
from lib.objects.Items import Item
from lib.objects.Titles import Title
from lib import correlations_engine
from lib import regex_helper
from lib.ConfigLoader import ConfigLoader
from lib import Tracker
from modules.CreditCards import CreditCards
from modules.Iban import Iban
from modules.Mail import Mail
from modules.Onion import Onion
from modules.Phone import Phone
from modules.Tools import Tools
# logging.getLogger() without a name returns the root logger.
logger = logging.getLogger()
# Fetch the Redis connection registered as "Redis_Cache", then drop the loader
# reference since nothing else in this module reads the configuration.
config_loader = ConfigLoader()
r_cache = config_loader.get_redis_conn("Redis_Cache")
config_loader = None
# Generated once at import time. It is passed to every regex_helper.regex_finditer
# call below and suffixes the key used to collect yara matches in r_cache.
r_key = regex_helper.generate_redis_cache_key('extractor')
# TODO UI Link
# Registry: automatic-detection tag -> module instance whose extract() is called
# for items carrying that tag. Every instance is created with queue=False.
MODULES = {
    'infoleak:automatic-detection="credit-card"': CreditCards(queue=False),
    'infoleak:automatic-detection="iban"': Iban(queue=False),
    'infoleak:automatic-detection="mail"': Mail(queue=False),
    'infoleak:automatic-detection="onion"': Onion(queue=False),
    'infoleak:automatic-detection="phone-number"': Phone(queue=False),
    # Candidates not registered yet:
    # APIkey ???
    # Credentials
    # Zerobins
    # CERTIFICATE + KEYS ???
    # SQL Injection / Libinjection ???
}

# One Tools instance is shared by every '<tool>-tool' tag it reports.
tools = Tools(queue=False)
MODULES.update(dict.fromkeys(
    (f'infoleak:automatic-detection="{tool_name}-tool"' for tool_name in tools.get_tools()),
    tools,
))
def get_correl_match(extract_type, obj_id, content):
    """Find, inside ``content``, the values of objects correlated with an item.

    The correlations of type ``extract_type`` attached to item ``obj_id`` are
    fetched as ``'subtype:value'`` strings. For ``'title'`` correlations the
    searched text is the content of the Title object identified by ``value``;
    for every other type the value itself is searched.

    :param extract_type: correlation type to look up (e.g. 'cve', 'title').
    :param obj_id: id of the item the correlations belong to.
    :param content: text in which the values are searched.
    :return: list of ``[obj[0], obj[1], obj[2], 'type:subtype:id']`` rows, where
        ``obj`` is a result of ``regex_helper.regex_finditer`` (presumably
        start offset, end offset and matched text -- confirm against that helper).
    """
    import re  # local import: the module does not import re at top level

    correl = correlations_engine.get_correlation_by_correl_type('item', '', obj_id, extract_type)
    to_extract = []
    map_subtype = {}
    map_value_id = {}
    for c in correl:
        subtype, value = c.split(':', 1)
        if extract_type == 'title':
            searched = Title(value).get_content()
        else:
            map_subtype[value] = subtype
            searched = value
        # An empty alternative would match at every position of the content.
        if not searched:
            continue
        # The values are literal strings, not patterns: escape them so characters
        # such as '|', '(' or '?' cannot alter (or break) the alternation.
        to_extract.append(re.escape(searched))
        # The matched text is mapped back to its object id through its sha256.
        map_value_id[sha256(searched.encode()).hexdigest()] = value

    extracted = []
    if to_extract:
        objs = regex_helper.regex_finditer(r_key, '|'.join(to_extract), obj_id, content)
        for obj in objs:
            subtype = map_subtype.get(obj[2]) or ''
            sha256_val = sha256(obj[2].encode()).hexdigest()
            value_id = map_value_id.get(sha256_val)
            if not value_id:
                # The matched text is not one of the searched values.
                logger.critical(f'Error module extractor: {sha256_val}\n{extract_type}\n{subtype}\n{value_id}\n{map_value_id}\n{objs}')
                value_id = 'ERROR'
            extracted.append([obj[0], obj[1], obj[2], f'{extract_type}:{subtype}:{value_id}'])
    return extracted
def _get_yara_match(data):
    """yara match callback: store each matched string instance in the cache.

    Every instance is added as ``'start:end:value'`` to the Redis set
    ``extractor:yara:match:<r_key>``, whose expiry is reset to 300 seconds
    after each insertion.

    :param data: match description passed by yara; its 'strings' entry is iterated.
    :return: yara.CALLBACK_CONTINUE, so that scanning goes on.
    """
    cache_key = f'extractor:yara:match:{r_key}'
    for matched_string in data.get('strings'):
        for instance in matched_string.instances:
            begin = instance.offset
            text = instance.matched_data.decode()
            stop = begin + instance.matched_length
            r_cache.sadd(cache_key, f'{begin}:{stop}:{text}')
            r_cache.expire(cache_key, 300)
    return yara.CALLBACK_CONTINUE
def _get_word_regex(word):
return '(?:^|(?<=[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]))' + word + '(?:$|(?=[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]))'
def convert_byte_offset_to_string(b_content, offset):
    """Convert a byte offset in encoded content into a character offset.

    The result is the number of characters fully encoded before ``offset``.
    That is the index of the character starting at (or containing) that byte,
    and the exclusive end index when ``offset`` is the end of a match.

    Decoding ``b_content[:offset + 1]`` and subtracting one is off by one
    whenever the character at ``offset`` is encoded on several bytes, and
    returns -1 when that character is the first one; hence this approach.

    :param b_content: content encoded with the default codec of bytes.decode
        (UTF-8); assumed to be valid, as produced by ``str.encode()``.
    :param offset: byte offset inside ``b_content``.
    :return: the matching character offset; 0 for a zero or negative offset.
    """
    if offset <= 0:
        return 0
    # errors='ignore' drops a multi-byte sequence cut by the slice, so an offset
    # falling inside a character maps to the index of that character.
    return len(b_content[:offset].decode(errors='ignore'))
# TODO RETRO HUNTS
# TODO TRACKER TYPE IN UI
def get_tracker_match(obj_id, content):
    """Locate in ``content`` what each tracker attached to an item matched.

    Trackers of type 'regex', 'yara', 'word' and 'set' are handled; any other
    type is ignored. yara reports byte offsets on the encoded content, so its
    matches are collected apart and converted to character offsets when the
    content contains multi-byte characters.

    :param obj_id: id of the item whose trackers are looked up.
    :param content: text of the item.
    :return: list of ``[start, end, value, 'tracker:<uuid>']`` rows.
    """
    extracted = []
    extracted_yara = []
    trackers = Tracker.get_obj_trackers('item', '', obj_id)
    for tracker_uuid in trackers:
        tracker = Tracker.Tracker(tracker_uuid)
        tracker_type = tracker.get_type()
        tracked = tracker.get_tracked()
        if tracker_type == 'regex':  # TODO Improve word detection -> word delimiter
            regex_match = regex_helper.regex_finditer(r_key, tracked, obj_id, content)
            for match in regex_match:
                extracted.append([int(match[0]), int(match[1]), match[2], f'tracker:{tracker.uuid}'])
        elif tracker_type == 'yara':
            rule = tracker.get_rule()
            # _get_yara_match pushes every match into the cache set read below.
            rule.match(data=content.encode(), callback=_get_yara_match,
                       which_callbacks=yara.CALLBACK_MATCHES, timeout=30)
            yara_match = r_cache.smembers(f'extractor:yara:match:{r_key}')
            r_cache.delete(f'extractor:yara:match:{r_key}')
            # Do not reset `extracted` here: it holds the matches of the
            # trackers already processed in this loop.
            for match in yara_match:
                # The value may itself contain ':', hence the split limit.
                start, end, value = match.split(':', 2)
                extracted_yara.append([int(start), int(end), value, f'tracker:{tracker.uuid}'])
        elif tracker_type == 'word' or tracker_type == 'set':
            if tracker_type == 'set':
                # Everything after the last ';' is dropped, the rest is a
                # comma-separated list of words.
                tracked = tracked.rsplit(';', 1)[0]
                words = tracked.split(',')
            else:
                words = [tracked]
            for word in words:
                regex = _get_word_regex(word)
                regex_match = regex_helper.regex_finditer(r_key, regex, obj_id, content)
                for match in regex_match:
                    extracted.append([int(match[0]), int(match[1]), match[2], f'tracker:{tracker.uuid}'])

    # Convert byte offset to string offset
    if extracted_yara:
        b_content = content.encode()
        if len(b_content) == len(content):
            # Same length once encoded: byte and character offsets coincide.
            extracted[0:0] = extracted_yara
        else:
            for yara_m in extracted_yara:
                start = convert_byte_offset_to_string(b_content, yara_m[0])
                end = convert_byte_offset_to_string(b_content, yara_m[1])
                extracted.append([int(start), int(end), yara_m[2], yara_m[3]])

    return extracted
# Type:subtype:id
# tag:iban
# tracker:uuid
def extract(obj_id, content=None):
    """Return every known match located in the content of an item.

    Matches come from the trackers of the item, from the modules registered in
    MODULES for the tags of the item, and from its 'cve', 'cryptocurrency',
    'title' and 'username' correlations. A non-empty result is cached as JSON
    for 300 seconds under ``extractor:cache:<obj_id>``; a cache hit refreshes
    that delay and is returned without looking at ``content``.

    :param obj_id: id of the item.
    :param content: text to search; the item content is loaded when falsy.
    :return: list of matches sorted by their first field (start position);
        an empty list when the item does not exist.
    """
    item = Item(obj_id)
    if not item.exists():
        return []

    # CHECK CACHE
    cache_key = f'extractor:cache:{obj_id}'
    cached = r_cache.get(cache_key)
    if cached:
        r_cache.expire(cache_key, 300)
        return json.loads(cached)

    if not content:
        content = item.get_content()

    extracted = get_tracker_match(obj_id, content)

    # Tags handled by a registered module
    for tag in item.get_tags():
        module = MODULES.get(tag)
        if not module:
            continue
        found = module.extract(obj_id, content, tag)
        if found:
            extracted = extracted + found

    # Correlated objects
    for obj_t in ['cve', 'cryptocurrency', 'title', 'username']:  # Decoded, PGP->extract bloc
        found = get_correl_match(obj_t, obj_id, content)
        if found:
            extracted = extracted + found

    # SORT By Start Pos
    extracted = sorted(extracted, key=itemgetter(0))

    # Save In Cache
    if extracted:
        r_cache.set(cache_key, json.dumps(extracted))
        r_cache.expire(cache_key, 300)  # TODO Reduce CACHE ???????????????

    return extracted
# TODO ADD LINK UI
def get_extracted_by_match(extracted):
    """Group extracted rows by the object that produced them.

    :param extracted: iterable of ``(start, end, value, str_obj)`` rows, where
        ``str_obj`` is 'tag:<tag>', 'tracker:<uuid>' or '<type>:<subtype>:<id>'.
    :return: dict keyed by ``str_obj``; each entry holds 'subtype', 'id',
        'icon', 'link' and 'matches', the list of ``[start, end, value]`` rows
        of that object in input order.
    """
    # Tags and trackers get a fixed icon and no link.  # TODO put me in object class
    static_icons = {
        'tag': {'style': 'fas', 'icon': '\uf02b', 'color': '#28a745', 'radius': 5},
        'tracker': {'style': 'fas', 'icon': '\uf05b', 'color': '#ffc107', 'radius': 5},
    }

    grouped = {}
    for start, end, value, str_obj in extracted:
        entry = grouped.get(str_obj)
        if entry is None:
            ob_type, row_id = str_obj.split(':', 1)
            if ob_type in static_icons:
                entry = {
                    'subtype': ob_type,
                    'id': row_id,
                    # Copy, so that entries never share the same icon dict.
                    'icon': dict(static_icons[ob_type]),
                    'link': '',
                }
            else:
                # '<subtype>:<id>', or only '<id>' when there is no separator.
                subtype, separator, obj_id = row_id.partition(':')
                if not separator:
                    subtype, obj_id = '', row_id
                entry = {
                    'subtype': subtype,
                    'id': obj_id,
                    'icon': ail_objects.get_object_svg(ob_type, subtype, obj_id),
                    'link': ail_objects.get_object_link(ob_type, subtype, obj_id),
                }
            entry['matches'] = []
            grouped[str_obj] = entry
        entry['matches'].append([start, end, value])
    return grouped
# if __name__ == '__main__':
# t0 = time.time()
# obj_id = 'crawled/2022/09/15/circl.lu179c7903-5b21-452e-9f25-4b61d9934e2b'
# obj_id = 'crawled/2022/09/15/circl.lu1e4f9721-06dc-404f-aabf-3c3bd0b533bd'
# obj_id = 'submitted/2022/09/13/submitted_ba3ee771-c91c-4f50-9d6a-8558cdac7aeb.gz'
# # obj_id = 'tests/2021/01/01/credit_cards.gz'
# # obj_id = 'crawled/2020/07/20/circl.luc9301321-f1b1-4d91-9082-5eb452b946c5'
# obj_id = 'submitted/2019/09/22/97172282-e4c2-4a1e-b82c-c4fb9490a56e.gz'
# obj_id = 'submitted/2019/09/20/4fb7f02d-1241-4ef4-b17e-80ae76038835.gz'
# obj_id = 'crawled/2023/02/21/circl.lu1c300acb-0cbe-480f-917e-9afe3ec958e8'
#
# extract(obj_id)
#
# # get_obj_correl('cve', obj_id, content)
# # r = get_tracker_match(obj_id, content)
# # print(r)
#
# print(time.time() - t0)