#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
Extractor: locate, inside an item's content, the strings matched by trackers,
retro hunts, tagging modules and correlated objects.

Every match is a list ``[start, end, value, str_obj]`` where ``str_obj`` is one of:
    tag:<name>
    tracker:<uuid>
    retro_hunt:<uuid>
    <object type>:<subtype>:<id>
"""

import json
import logging
import os
import re
import sys

import yara

from hashlib import sha256
from operator import itemgetter

sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from lib.objects import ail_objects
from lib.objects.Items import Item
from lib.objects.Titles import Title
from lib import correlations_engine
from lib import regex_helper
from lib.ConfigLoader import ConfigLoader
from lib import Tracker
from modules.CreditCards import CreditCards
from modules.Iban import Iban
from modules.Mail import Mail
from modules.Onion import Onion
from modules.Phone import Phone
from modules.Tools import Tools

logger = logging.getLogger()

config_loader = ConfigLoader()
r_cache = config_loader.get_redis_conn("Redis_Cache")
config_loader = None

r_key = regex_helper.generate_redis_cache_key('extractor')

# TODO UI Link

# Tag -> module instance able to re-extract the values that produced the tag.
MODULES = {
    'infoleak:automatic-detection="credit-card"': CreditCards(queue=False),
    'infoleak:automatic-detection="iban"': Iban(queue=False),
    'infoleak:automatic-detection="mail"': Mail(queue=False),
    'infoleak:automatic-detection="onion"': Onion(queue=False),
    'infoleak:automatic-detection="phone-number"': Phone(queue=False),
    # APIkey ???
    # Credentials
    # Zerobins
    # CERTIFICATE + KEYS ???
    # SQL Injection / Libinjection ???
}
# A single Tools instance serves every "<tool>-tool" tag.
tools = Tools(queue=False)
for tool_name in tools.get_tools():
    MODULES[f'infoleak:automatic-detection="{tool_name}-tool"'] = tools

# Characters accepted as a word boundary by the word/set trackers.
# Raw string: the regex engine (not the Python parser) interprets the escapes,
# which avoids "invalid escape sequence" warnings while matching the same set.
_WORD_DELIMITERS = r'[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]'

# Static icons of the pseudo object types that have no object class yet.
# TODO put me in object class
_STATIC_MATCH_ICONS = {
    'tag': {'style': 'fas', 'icon': '\uf02b', 'color': '#28a745', 'radius': 5},
    'tracker': {'style': 'fas', 'icon': '\uf05b', 'color': '#ffc107', 'radius': 5},
    'retro_hunt': {'style': 'fas', 'icon': '\uf05b', 'color': '#008107', 'radius': 5},
}


def get_correl_match(extract_type, obj_id, content):
    """
    Find, in content, the values of the objects correlated to an item.

    :param extract_type: correlated object type ('cve', 'title', ...)
    :param obj_id: item id
    :param content: item content
    :return: list of [start, end, value, '<extract_type>:<subtype>:<id>']
    """
    extracted = []
    correl = correlations_engine.get_correlation_by_correl_type('item', '', obj_id, extract_type)
    patterns = []
    map_subtype = {}
    # sha256(searched text) -> object id: for titles the text searched in the
    # content is the title content, not the object id.
    map_value_id = {}
    for c in correl:
        subtype, value = c.split(':', 1)
        if extract_type == 'title':
            text = Title(value).get_content()
        else:
            text = value
            map_subtype[value] = subtype
        # An empty alternative would match at every position of the content.
        if not text:
            continue
        # The values are literals, not patterns: escape regex metacharacters.
        patterns.append(re.escape(text))
        map_value_id[sha256(text.encode()).hexdigest()] = value

    if not patterns:
        return extracted

    objs = regex_helper.regex_finditer(r_key, '|'.join(patterns), obj_id, content)
    for obj in objs:
        subtype = map_subtype.get(obj[2]) or ''
        sha256_val = sha256(obj[2].encode()).hexdigest()
        value_id = map_value_id.get(sha256_val)
        if not value_id:
            logger.critical(f'Error module extractor: {sha256_val}\n{extract_type}\n{subtype}\n{value_id}\n{map_value_id}\n{objs}')
            value_id = 'ERROR'
        # int(): same normalisation as get_tracker_match, keeps the final
        # sort by start position homogeneous.
        extracted.append([int(obj[0]), int(obj[1]), obj[2], f'{extract_type}:{subtype}:{value_id}'])
    return extracted


def _get_yara_match(data):
    """
    YARA match callback: push every matched string instance into the cache set
    as '<byte start>:<byte end>:<value>'.
    """
    cache_key = f'extractor:yara:match:{r_key}'
    for string_match in data.get('strings'):
        for string_match_instance in string_match.instances:
            start = string_match_instance.offset
            end = start + string_match_instance.matched_length
            # A match can cut a multi-byte character: never raise from the callback.
            value = string_match_instance.matched_data.decode(errors='replace')
            r_cache.sadd(cache_key, f'{start}:{end}:{value}')
            r_cache.expire(cache_key, 300)
    return yara.CALLBACK_CONTINUE


def _get_word_regex(word):
    """Return a regex matching word (taken literally) surrounded by delimiters or content boundaries."""
    return '(?:^|(?<=' + _WORD_DELIMITERS + '))' + re.escape(word) + '(?:$|(?=' + _WORD_DELIMITERS + '))'


def convert_byte_offset_to_string(b_content, offset):
    """
    Convert an offset in the encoded content into an offset in the decoded string.

    :param b_content: encoded (UTF-8) content
    :param offset: byte offset (a match start, or an exclusive match end)
    :return: number of complete characters located before the byte offset
    """
    if offset <= 0:
        return 0
    # errors='ignore' drops a trailing partial character when the offset falls
    # in the middle of a multi-byte sequence.
    return len(b_content[:offset].decode(errors='ignore'))


def _run_yara_rule(rule, b_content, str_obj):
    """Run a compiled YARA rule on the encoded content, return [byte start, byte end, value, str_obj] matches."""
    cache_key = f'extractor:yara:match:{r_key}'
    rule.match(data=b_content, callback=_get_yara_match,
               which_callbacks=yara.CALLBACK_MATCHES, timeout=30)
    yara_match = r_cache.smembers(cache_key)
    r_cache.delete(cache_key)
    matches = []
    for match in yara_match:
        start, end, value = match.split(':', 2)
        matches.append([int(start), int(end), value, str_obj])
    return matches


# TODO RETRO HUNTS
# TODO TRACKER TYPE IN UI
def get_tracker_match(obj_id, content):
    """
    Find, in content, the strings matched by the trackers and retro hunts of an item.

    :param obj_id: item id
    :param content: item content
    :return: list of [start, end, value, 'tracker:<uuid>' | 'retro_hunt:<uuid>']
    """
    extracted = []
    extracted_yara = []
    b_content = None  # encoded lazily, only if a YARA rule has to run

    trackers = Tracker.get_obj_trackers('item', '', obj_id)
    for tracker_uuid in trackers:
        tracker = Tracker.Tracker(tracker_uuid)
        tracker_type = tracker.get_type()
        tracked = tracker.get_tracked()
        if tracker_type == 'regex':  # TODO Improve word detection -> word delimiter
            regex_match = regex_helper.regex_finditer(r_key, tracked, obj_id, content)
            for match in regex_match:
                extracted.append([int(match[0]), int(match[1]), match[2], f'tracker:{tracker.uuid}'])
        elif tracker_type == 'yara':
            if b_content is None:
                b_content = content.encode()
            # NOTE: the matches already collected in extracted must be kept.
            extracted_yara.extend(_run_yara_rule(tracker.get_rule(), b_content, f'tracker:{tracker.uuid}'))
        elif tracker_type == 'word' or tracker_type == 'set':
            if tracker_type == 'set':
                # drop what follows the last ';', the words are comma separated
                tracked = tracked.rsplit(';', 1)[0]
                words = tracked.split(',')
            else:
                words = [tracked]
            for word in words:
                regex = _get_word_regex(word)
                regex_match = regex_helper.regex_finditer(r_key, regex, obj_id, content)
                for match in regex_match:
                    extracted.append([int(match[0]), int(match[1]), match[2], f'tracker:{tracker.uuid}'])

    # Retro Hunt
    retro_hunts = Tracker.get_obj_retro_hunts('item', '', obj_id)
    for retro_uuid in retro_hunts:
        retro_hunt = Tracker.RetroHunt(retro_uuid)
        rule = retro_hunt.get_rule(r_compile=True)
        if b_content is None:
            b_content = content.encode()
        extracted_yara.extend(_run_yara_rule(rule, b_content, f'retro_hunt:{retro_hunt.uuid}'))

    # Convert byte offset to string offset
    if extracted_yara:
        if len(b_content) == len(content):
            # single-byte characters only: byte offsets are string offsets
            extracted[0:0] = extracted_yara
        else:
            for start, end, value, str_obj in extracted_yara:
                start = convert_byte_offset_to_string(b_content, start)
                end = convert_byte_offset_to_string(b_content, end)
                extracted.append([start, end, value, str_obj])

    return extracted


# Type:subtype:id
# tag:iban
# tracker:uuid
def extract(obj_id, content=None):
    """
    Return every match found in an item, sorted by start position.

    The result is cached for 300 seconds; a cache hit refreshes the expiration.

    :param obj_id: item id
    :param content: item content, loaded from the item if not provided
    :return: list of [start, end, value, str_obj], empty if the item doesn't exist
    """
    item = Item(obj_id)
    if not item.exists():
        return []

    # CHECK CACHE
    cache_key = f'extractor:cache:{obj_id}'
    cached = r_cache.get(cache_key)
    if cached:
        r_cache.expire(cache_key, 300)
        return json.loads(cached)

    if not content:
        content = item.get_content()

    extracted = get_tracker_match(obj_id, content)

    for tag in item.get_tags():
        module = MODULES.get(tag)
        if module:
            matches = module.extract(obj_id, content, tag)
            if matches:
                extracted = extracted + matches

    for obj_t in ['cve', 'cryptocurrency', 'title', 'username']:  # Decoded, PGP->extract bloc
        matches = get_correl_match(obj_t, obj_id, content)
        if matches:
            extracted = extracted + matches

    # SORT By Start Pos
    extracted = sorted(extracted, key=itemgetter(0))

    # Save In Cache
    if extracted:
        extracted_dump = json.dumps(extracted)
        r_cache.set(cache_key, extracted_dump)
        r_cache.expire(cache_key, 300)  # TODO Reduce CACHE ???????????????

    return extracted


def _get_match_object_meta(str_obj):
    """Build the description (subtype, id, icon, link, empty matches list) of a matched object."""
    ob_type, row_id = str_obj.split(':', 1)
    icon = _STATIC_MATCH_ICONS.get(ob_type)
    if icon is not None:
        # tag / tracker / retro_hunt: the type is used as subtype, no link
        return {'subtype': ob_type, 'id': row_id, 'icon': dict(icon), 'link': '', 'matches': []}

    # <type>:<subtype>:<id> or <type>:<id>
    row_id = row_id.split(':', 1)
    if len(row_id) == 2:
        subtype, obj_id = row_id
    else:
        subtype = ''
        obj_id = row_id[0]
    return {'subtype': subtype,
            'id': obj_id,
            'icon': ail_objects.get_object_svg(ob_type, subtype, obj_id),
            'link': ail_objects.get_object_link(ob_type, subtype, obj_id),
            'matches': []}


# TODO ADD LINK UI
def get_extracted_by_match(extracted):
    """
    Group the matches by matched object.

    :param extracted: list of [start, end, value, str_obj]
    :return: dict str_obj -> {'subtype', 'id', 'icon', 'link', 'matches': [[start, end, value], ...]}
    """
    matches = {}
    for start, end, value, str_obj in extracted:
        if str_obj not in matches:
            matches[str_obj] = _get_match_object_meta(str_obj)
        matches[str_obj]['matches'].append([start, end, value])
    return matches


# if __name__ == '__main__':
#     t0 = time.time()
#     obj_id = 'crawled/2022/09/15/circl.lu179c7903-5b21-452e-9f25-4b61d9934e2b'
#     obj_id = 'crawled/2022/09/15/circl.lu1e4f9721-06dc-404f-aabf-3c3bd0b533bd'
#     obj_id = 'submitted/2022/09/13/submitted_ba3ee771-c91c-4f50-9d6a-8558cdac7aeb.gz'
#     # obj_id = 'tests/2021/01/01/credit_cards.gz'
#     # obj_id = 'crawled/2020/07/20/circl.luc9301321-f1b1-4d91-9082-5eb452b946c5'
#     obj_id = 'submitted/2019/09/22/97172282-e4c2-4a1e-b82c-c4fb9490a56e.gz'
#     obj_id = 'submitted/2019/09/20/4fb7f02d-1241-4ef4-b17e-80ae76038835.gz'
#     obj_id = 'crawled/2023/02/21/circl.lu1c300acb-0cbe-480f-917e-9afe3ec958e8'
#
#     extract(obj_id)
#
#     # get_obj_correl('cve', obj_id, content)
#     # r = get_tracker_match(obj_id, content)
#     # print(r)
#
#     print(time.time() - t0)