2022-12-19 16:38:20 +01:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
# -*-coding:UTF-8 -*
|
2023-02-23 16:25:15 +01:00
|
|
|
import json
|
2023-05-30 10:11:12 +02:00
|
|
|
import logging
|
2022-12-19 16:38:20 +01:00
|
|
|
import os
|
|
|
|
import sys
|
|
|
|
|
|
|
|
import yara
|
|
|
|
|
2023-05-30 10:11:12 +02:00
|
|
|
from hashlib import sha256
|
2023-02-23 16:25:15 +01:00
|
|
|
from operator import itemgetter
|
|
|
|
|
2022-12-19 16:38:20 +01:00
|
|
|
sys.path.append(os.environ['AIL_BIN'])
|
|
|
|
##################################
|
|
|
|
# Import Project packages
|
|
|
|
##################################
|
2023-02-23 16:25:15 +01:00
|
|
|
from lib.objects import ail_objects
|
2022-12-19 16:38:20 +01:00
|
|
|
from lib.objects.Items import Item
|
2023-05-25 14:33:12 +02:00
|
|
|
from lib.objects.Titles import Title
|
2022-12-19 16:38:20 +01:00
|
|
|
from lib import correlations_engine
|
|
|
|
from lib import regex_helper
|
|
|
|
from lib.ConfigLoader import ConfigLoader
|
|
|
|
|
|
|
|
from lib import Tracker
|
|
|
|
|
|
|
|
from modules.CreditCards import CreditCards
|
|
|
|
from modules.Iban import Iban
|
|
|
|
from modules.Mail import Mail
|
|
|
|
from modules.Onion import Onion
|
2023-05-24 10:48:29 +02:00
|
|
|
from modules.Phone import Phone
|
2022-12-19 16:38:20 +01:00
|
|
|
from modules.Tools import Tools
|
|
|
|
|
2023-05-30 10:11:12 +02:00
|
|
|
# Module-level logger (root logger; handlers/level are configured by the caller)
logger = logging.getLogger()

config_loader = ConfigLoader()
# Redis cache connection: used for extraction-result caching and for the
# yara callback <-> caller match exchange below
r_cache = config_loader.get_redis_conn("Redis_Cache")
config_loader = None

# Unique redis key prefix for this module's regex_helper runs
r_key = regex_helper.generate_redis_cache_key('extractor')
|
|
|
|
|
2023-02-23 16:25:15 +01:00
|
|
|
# TODO UI Link

# Correlation object types to extract/highlight, per container object type
CORRELATION_TO_EXTRACT = {
    'item': ['cve', 'cryptocurrency', 'title', 'username'],
    'message': ['cve', 'cryptocurrency', 'username']
}
|
|
|
|
|
2022-12-19 16:38:20 +01:00
|
|
|
# Map each infoleak detection tag to the (queue-less) detection module that
# can re-extract the matched values from an object's content.
MODULES = {
    'infoleak:automatic-detection="credit-card"': CreditCards(queue=False),
    'infoleak:automatic-detection="iban"': Iban(queue=False),
    'infoleak:automatic-detection="mail"': Mail(queue=False),
    'infoleak:automatic-detection="onion"': Onion(queue=False),
    'infoleak:automatic-detection="phone-number"': Phone(queue=False),
    # APIkey ???
    # Credentials
    # Zerobins
    # CERTIFICATE + KEYS ???
    # SQL Injection / Libinjection ???
}

tools = Tools(queue=False)
# Every known tool has its own detection tag; all are handled by the one
# shared Tools module instance.
for tool_name in tools.get_tools():
    MODULES[f'infoleak:automatic-detection="{tool_name}-tool"'] = tools
|
|
|
|
|
2024-03-27 16:30:29 +01:00
|
|
|
def merge_overlap(extracted):
    """Merge overlapping extraction ranges into single entries.

    :param extracted: list of (start, end, string_match, obj_ref) entries,
                      SORTED by start position
    :return: list of (start, end, merged_string, [(obj_ref, string_match), ...])
             tuples where ranges that overlap are merged into one entry
    """
    # Guard: original code raised IndexError on an empty input list
    if not extracted:
        return []

    merged = []
    curr_start, curr_end, curr_string_match, curr_obj_ref = extracted[0]
    curr_obj_ref = [(curr_obj_ref, curr_string_match)]

    for start, end, mstring, ref in extracted[1:]:
        # overlap with the current merged range
        if start <= curr_end:
            # append only the non-overlapping tail of the new match string
            curr_string_match += mstring[curr_end - start:]
            curr_end = max(curr_end, end)
            curr_obj_ref.append((ref, mstring))
        else:
            merged.append((curr_start, curr_end, curr_string_match, curr_obj_ref))
            curr_start, curr_end, curr_string_match, curr_obj_ref = start, end, mstring, [(ref, mstring)]

    merged.append((curr_start, curr_end, curr_string_match, curr_obj_ref))
    return merged
|
|
|
|
|
2024-03-27 13:42:15 +01:00
|
|
|
def get_correl_match(extract_type, obj, content):
    """Locate, in `content`, every value of type `extract_type` that is
    already correlated with `obj` (cve, cryptocurrency, title, username).

    :param extract_type: correlation object type to search for
    :param obj: AIL object whose existing correlations are looked up
    :param content: decoded content to scan
    :return: list of [start, end, matched_string, '<type>:<subtype>:<value_id>']
    """
    extracted = []
    correl = correlations_engine.get_correlation_by_correl_type(obj.type, obj.get_subtype(r_str=True), obj.id, extract_type)
    to_extract = []
    map_subtype = {}
    # sha256(searched text) -> object value: maps a matched string back to
    # the id of the correlated object it came from
    map_value_id = {}
    for c in correl:
        subtype, value = c.split(':', 1)
        if extract_type == 'title':
            # titles are correlated by id: fetch the title text to search for
            title = Title(value).get_content()
            to_extract.append(title)
            sha256_val = sha256(title.encode()).hexdigest()
        else:
            map_subtype[value] = subtype
            to_extract.append(value)
            sha256_val = sha256(value.encode()).hexdigest()
        map_value_id[sha256_val] = value
    if to_extract:
        # single alternation regex over all correlated values
        objs = regex_helper.regex_finditer(r_key, '|'.join(to_extract), obj.get_global_id(), content)
        if extract_type == 'title' and objs:
            # keep only the first title occurrence
            objs = [objs[0]]
        for ob in objs:
            if map_subtype.get(ob[2]):
                subtype = map_subtype[ob[2]]
            else:
                subtype = ''
            sha256_val = sha256(ob[2].encode()).hexdigest()
            value_id = map_value_id.get(sha256_val)
            if not value_id:
                # matched string does not map back to any known value
                logger.critical(f'Error module extractor: {sha256_val}\n{extract_type}\n{subtype}\n{value_id}\n{map_value_id}\n{objs}')
                value_id = 'ERROR'
            extracted.append([ob[0], ob[1], ob[2], f'{extract_type}:{subtype}:{value_id}'])
    return extracted
|
2022-12-19 16:38:20 +01:00
|
|
|
|
|
|
|
def _get_yara_match(data):
    """Yara match callback.

    Push every matched string instance (byte offsets + decoded value) into a
    short-lived redis set so the caller can read the matches back after
    `rule.match()` returns.
    """
    cache_key = f'extractor:yara:match:{r_key}'
    for matched_string in data.get('strings'):
        for instance in matched_string.instances:
            begin = instance.offset
            finish = begin + instance.matched_length
            matched_value = instance.matched_data.decode()
            r_cache.sadd(cache_key, f'{begin}:{finish}:{matched_value}')
            r_cache.expire(cache_key, 300)
    return yara.CALLBACK_CONTINUE
|
|
|
|
|
2023-02-23 16:25:15 +01:00
|
|
|
def _get_word_regex(word):
|
2024-03-27 13:42:15 +01:00
|
|
|
return '(?i)(?:^|(?<=[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]))' + word + '(?:$|(?=[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]))'
|
2023-02-23 16:25:15 +01:00
|
|
|
|
2023-04-20 16:05:58 +02:00
|
|
|
def convert_byte_offset_to_string(b_content, offset):
    """Convert a yara BYTE offset into the corresponding CHARACTER offset
    of the decoded string.

    If `offset` lands in the middle of a multi-byte sequence, step back one
    byte at a time (logging each failure) until the prefix decodes cleanly.
    """
    while True:
        prefix = b_content[:offset + 1]
        try:
            decoded = prefix.decode()
        except UnicodeDecodeError as e:
            # partial multi-byte sequence: retry one byte earlier
            logger.error(f'Yara offset converter error, {str(e)}\n{offset}/{len(b_content)}')
            offset = offset - 1
            continue
        return len(decoded) - 1
|
2023-04-20 16:05:58 +02:00
|
|
|
|
|
|
|
|
2022-12-19 16:38:20 +01:00
|
|
|
# TODO RETRO HUNTS
# TODO TRACKER TYPE IN UI
def get_tracker_match(obj, content):
    """Extract all tracker and retro-hunt matches from an object's content.

    :param obj: AIL object being inspected
    :param content: decoded content to scan
    :return: list of [start, end, matched_string, 'tracker:<uuid>'] and
             [start, end, matched_string, 'retro_hunt:<uuid>'] entries,
             with yara byte offsets converted to string offsets

    Fix: the previous implementation reset ``extracted = []`` inside the
    yara-tracker branch and inside the retro-hunt loop, silently discarding
    all regex/word matches accumulated by earlier trackers. Those resets
    are removed; yara matches are collected in `extracted_yara` and merged
    at the end, as before.
    """
    extracted = []
    extracted_yara = []  # yara matches keep BYTE offsets until converted below
    obj_gid = obj.get_global_id()
    trackers = Tracker.get_obj_trackers(obj.type, obj.get_subtype(r_str=True), obj.id)
    for tracker_uuid in trackers:
        tracker = Tracker.Tracker(tracker_uuid)
        tracker_type = tracker.get_type()
        # print(tracker_type)
        tracked = tracker.get_tracked()
        if tracker_type == 'regex':  # TODO Improve word detection -> word delimiter
            regex_match = regex_helper.regex_finditer(r_key, tracked, obj_gid, content)
            for match in regex_match:
                extracted.append([int(match[0]), int(match[1]), match[2], f'tracker:{tracker.uuid}'])
        elif tracker_type == 'yara':
            rule = tracker.get_rule()
            # matches are exchanged through redis by the _get_yara_match callback
            rule.match(data=content.encode(), callback=_get_yara_match,
                       which_callbacks=yara.CALLBACK_MATCHES, timeout=30)
            yara_match = r_cache.smembers(f'extractor:yara:match:{r_key}')
            r_cache.delete(f'extractor:yara:match:{r_key}')
            for match in yara_match:
                start, end, value = match.split(':', 2)
                extracted_yara.append([int(start), int(end), value, f'tracker:{tracker.uuid}'])

        elif tracker_type == 'word' or tracker_type == 'set':
            if tracker_type == 'set':
                # a set is stored as 'w1,w2,...;<nb words to match>'
                tracked = tracked.rsplit(';', 1)[0]
                words = tracked.split(',')
            else:
                words = [tracked]
            for word in words:
                regex = _get_word_regex(word)
                regex_match = regex_helper.regex_finditer(r_key, regex, obj_gid, content)
                # print(regex_match)
                for match in regex_match:
                    extracted.append([int(match[0]), int(match[1]), match[2], f'tracker:{tracker.uuid}'])

    # Retro Hunt
    retro_hunts = Tracker.get_obj_retro_hunts(obj.type, obj.get_subtype(r_str=True), obj.id)
    for retro_uuid in retro_hunts:
        retro_hunt = Tracker.RetroHunt(retro_uuid)
        rule = retro_hunt.get_rule(r_compile=True)
        rule.match(data=content.encode(), callback=_get_yara_match,
                   which_callbacks=yara.CALLBACK_MATCHES, timeout=30)
        yara_match = r_cache.smembers(f'extractor:yara:match:{r_key}')
        r_cache.delete(f'extractor:yara:match:{r_key}')
        for match in yara_match:
            start, end, value = match.split(':', 2)
            extracted_yara.append([int(start), int(end), value, f'retro_hunt:{retro_hunt.uuid}'])

    # Convert byte offset to string offset
    if extracted_yara:
        b_content = content.encode()
        if len(b_content) == len(content):
            # pure single-byte content: byte offsets == string offsets
            extracted[0:0] = extracted_yara
        else:
            for yara_m in extracted_yara:
                start = convert_byte_offset_to_string(b_content, yara_m[0])
                end = convert_byte_offset_to_string(b_content, yara_m[1])
                extracted.append([int(start), int(end), yara_m[2], yara_m[3]])

    return extracted
|
2022-12-19 16:38:20 +01:00
|
|
|
|
2023-02-23 16:25:15 +01:00
|
|
|
# Type:subtype:id
# tag:iban
# tracker:uuid
# def extract(obj_id, content=None):
def extract(obj_type, subtype, obj_id, content=None):
    """Extract every highlight-able match from an object's content:
    tracker/retro-hunt hits, infoleak module re-detections (by tag) and
    already-correlated objects.

    :param obj_type: AIL object type (e.g. 'item', 'message')
    :param subtype: object subtype ('' when not applicable)
    :param obj_id: object id
    :param content: optional pre-fetched content; fetched from the object
                    when not provided
    :return: matches sorted by start position with overlaps merged
             (see merge_overlap); [] when the object does not exist.
             Non-empty results are cached in redis for 300s.
    """
    obj = ail_objects.get_object(obj_type, subtype, obj_id)
    if not obj.exists():
        return []
    obj_gid = obj.get_global_id()

    # CHECK CACHE
    cached = r_cache.get(f'extractor:cache:{obj_gid}')
    # cached = None
    if cached:
        # refresh the TTL on access
        r_cache.expire(f'extractor:cache:{obj_gid}', 300)
        return json.loads(cached)

    if not content:
        content = obj.get_content()

    # tracker + retro hunt matches
    extracted = get_tracker_match(obj, content)

    # print(item.get_tags())
    # module re-extraction, driven by the object's infoleak detection tags
    for tag in obj.get_tags():
        if MODULES.get(tag):
            # print(tag)
            module = MODULES.get(tag)
            matches = module.extract(obj, content, tag)
            if matches:
                extracted = extracted + matches

    # correlated objects (cve, cryptocurrency, ...) present in the content
    for obj_t in CORRELATION_TO_EXTRACT[obj.type]:
        matches = get_correl_match(obj_t, obj, content)
        if matches:
            extracted = extracted + matches

    # SORT By Start Pos
    if extracted:
        extracted = sorted(extracted, key=itemgetter(0))
        extracted = merge_overlap(extracted)

    # Save In Cache
    if extracted:
        extracted_dump = json.dumps(extracted)
        r_cache.set(f'extractor:cache:{obj_gid}', extracted_dump)
        r_cache.expire(f'extractor:cache:{obj_gid}', 300)  # TODO Reduce CACHE ???????????????

    return extracted
|
|
|
|
|
2023-02-23 16:25:15 +01:00
|
|
|
# TODO ADD LINK UI
def get_extracted_by_match(extracted):
    """Group extraction results by the object/tracker/tag that produced them.

    :param extracted: merged matches as produced by extract():
                      (start, end, value, [(str_obj, str_match), ...])
    :return: dict mapping str_obj -> {'subtype', 'id', 'icon', 'link',
             'matches': [[start, end, str_match], ...]}
    """
    matches = {}
    for start, end, value, raw_objs in extracted:
        for str_obj, str_match in raw_objs:
            if str_obj not in matches:
                ob_type, row_id = str_obj.split(':', 1)
                if ob_type == 'tag':  # TODO put me in object class
                    meta = {'subtype': 'tag',
                            'id': row_id,
                            'icon': {'style': 'fas', 'icon': '\uf02b', 'color': '#28a745', 'radius': 5},
                            'link': ''}
                elif ob_type == 'tracker':  # TODO put me in object class
                    meta = {'subtype': 'tracker',
                            'id': row_id,
                            'icon': {'style': 'fas', 'icon': '\uf05b', 'color': '#ffc107', 'radius': 5},
                            'link': ''}
                elif ob_type == 'retro_hunt':  # TODO put me in object class
                    meta = {'subtype': 'retro_hunt',
                            'id': row_id,
                            'icon': {'style': 'fas', 'icon': '\uf05b', 'color': '#008107', 'radius': 5},
                            'link': ''}
                else:
                    # correlation object reference: 'type:subtype:id' or 'type:id'
                    parts = row_id.split(':', 1)
                    if len(parts) == 2:
                        subtype, obj_id = parts
                    else:
                        subtype = ''
                        obj_id = parts[0]
                    meta = {'subtype': subtype,
                            'id': obj_id,
                            'icon': ail_objects.get_object_svg(ob_type, subtype, obj_id),
                            'link': ail_objects.get_object_link(ob_type, subtype, obj_id)}
                meta['matches'] = []
                matches[str_obj] = meta

            matches[str_obj]['matches'].append([start, end, str_match])

    return matches
|
|
|
|
|
|
|
|
|
|
|
|
# if __name__ == '__main__':
|
|
|
|
# t0 = time.time()
|
|
|
|
# obj_id = 'crawled/2023/02/21/circl.lu1c300acb-0cbe-480f-917e-9afe3ec958e8'
|
|
|
|
# extract(obj_id)
|
|
|
|
#
|
|
|
|
# # get_obj_correl('cve', obj_id, content)
|
|
|
|
# # r = get_tracker_match(obj_id, content)
|
|
|
|
# # print(r)
|
|
|
|
#
|
|
|
|
# print(time.time() - t0)
|
2022-12-19 16:38:20 +01:00
|
|
|
|