chg: [ocr] add cache + correlation ocr-chats-messages + launch ocr extractor by default

ocr
terrtia 2024-04-24 14:43:11 +02:00
parent 8bd1ae3815
commit c25ccb8618
No known key found for this signature in database
GPG Key ID: 1E1B1F50D84613D0
9 changed files with 122 additions and 26 deletions

View File

@ -275,8 +275,11 @@ function launching_scripts {
screen -S "Script_AIL" -X screen -t "MISP_Thehive_Auto_Push" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./MISP_Thehive_Auto_Push.py; read x" screen -S "Script_AIL" -X screen -t "MISP_Thehive_Auto_Push" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./MISP_Thehive_Auto_Push.py; read x"
sleep 0.1 sleep 0.1
# IMAGES
screen -S "Script_AIL" -X screen -t "Exif" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Exif.py; read x" screen -S "Script_AIL" -X screen -t "Exif" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./Exif.py; read x"
sleep 0.1 sleep 0.1
screen -S "Script_AIL" -X screen -t "OcrExtractor" bash -c "cd ${AIL_BIN}/modules; ${ENV_PY} ./OcrExtractor.py; read x"
sleep 0.1
################################## ##################################
# TRACKERS MODULES # # TRACKERS MODULES #

View File

@ -41,26 +41,26 @@ config_loader = None
################################## ##################################
CORRELATION_TYPES_BY_OBJ = { CORRELATION_TYPES_BY_OBJ = {
"chat": ["chat-subchannel", "chat-thread", "image", "user-account"], # message or direct correlation like cve, bitcoin, ... ??? "chat": ["chat-subchannel", "chat-thread", "image", "message", "ocr", "user-account"], # message or direct correlation like cve, bitcoin, ... ???
"chat-subchannel": ["chat", "chat-thread", "image", "message", "user-account"], "chat-subchannel": ["chat", "chat-thread", "image", "message", "ocr", "user-account"],
"chat-thread": ["chat", "chat-subchannel", "image", "message", "user-account"], # TODO user account "chat-thread": ["chat", "chat-subchannel", "image", "message", "ocr", "user-account"], # TODO user account
"cookie-name": ["domain"], "cookie-name": ["domain"],
"cryptocurrency": ["domain", "item", "message"], "cryptocurrency": ["domain", "item", "message", "ocr"],
"cve": ["domain", "item", "message"], "cve": ["domain", "item", "message", "ocr"],
"decoded": ["domain", "item", "message"], "decoded": ["domain", "item", "message", "ocr"],
"domain": ["cve", "cookie-name", "cryptocurrency", "decoded", "etag", "favicon", "hhhash", "item", "pgp", "title", "screenshot", "username"], "domain": ["cve", "cookie-name", "cryptocurrency", "decoded", "etag", "favicon", "hhhash", "item", "pgp", "title", "screenshot", "username"],
"etag": ["domain"], "etag": ["domain"],
"favicon": ["domain", "item"], # TODO Decoded "favicon": ["domain", "item"], # TODO Decoded
"file-name": ["chat", "message"], "file-name": ["chat", "message"],
"hhhash": ["domain"], "hhhash": ["domain"],
"image": ["chat", "message", "ocr", "user-account"], "image": ["chat", "chat-subchannel", "chat-thread", "message", "ocr", "user-account"], # TODO subchannel + threads ????
"item": ["cve", "cryptocurrency", "decoded", "domain", "favicon", "pgp", "screenshot", "title", "username"], # chat ??? "item": ["cve", "cryptocurrency", "decoded", "domain", "favicon", "pgp", "screenshot", "title", "username"], # chat ???
"message": ["chat", "chat-subchannel", "chat-thread", "cve", "cryptocurrency", "decoded", "file-name", "image", "pgp", "user-account"], # chat ?? "message": ["chat", "chat-subchannel", "chat-thread", "cve", "cryptocurrency", "decoded", "file-name", "image", "ocr", "pgp", "user-account"],
"ocr": ["image"], "ocr": ["chat", "chat-subchannel", "chat-thread", "cve", "cryptocurrency", "decoded", "image", "message", "pgp", "user-account"],
"pgp": ["domain", "item", "message"], "pgp": ["domain", "item", "message", "ocr"],
"screenshot": ["domain", "item"], "screenshot": ["domain", "item"],
"title": ["domain", "item"], "title": ["domain", "item"],
"user-account": ["chat", "chat-subchannel", "chat-thread", "image", "message", "username"], "user-account": ["chat", "chat-subchannel", "chat-thread", "image", "message", "ocr", "username"],
"username": ["domain", "item", "message", "user-account"], "username": ["domain", "item", "message", "user-account"],
} }

View File

@ -209,7 +209,7 @@ class Domain(AbstractObject):
def get_screenshot(self): def get_screenshot(self):
last_item = self.get_last_item_root() last_item = self.get_last_item_root()
if last_item: if last_item:
screenshot = self._get_external_correlation('item', '', last_item, 'screenshot').get('screenshot') screenshot = self.get_obj_correlations('item', '', last_item, ['screenshot']).get('screenshot')
if screenshot: if screenshot:
return screenshot.pop()[1:] return screenshot.pop()[1:]
@ -392,7 +392,7 @@ class Domain(AbstractObject):
print(har) print(har)
_write_in_zip_buffer(zf, os.path.join(hars_dir, har), f'{basename}.json.gz') _write_in_zip_buffer(zf, os.path.join(hars_dir, har), f'{basename}.json.gz')
# Screenshot # Screenshot
screenshot = self._get_external_correlation('item', '', item_id, 'screenshot') screenshot = self.get_obj_correlations('item', '', item_id, ['screenshot'])
if screenshot and screenshot['screenshot']: if screenshot and screenshot['screenshot']:
screenshot = screenshot['screenshot'].pop()[1:] screenshot = screenshot['screenshot'].pop()[1:]
screenshot = os.path.join(screenshot[0:2], screenshot[2:4], screenshot[4:6], screenshot[6:8], screenshot = os.path.join(screenshot[0:2], screenshot[2:4], screenshot[4:6], screenshot[6:8],

View File

@ -141,7 +141,7 @@ class Message(AbstractObject):
# TODO get thread ID # TODO get thread ID
def _get_image_ocr(self, obj_id): def _get_image_ocr(self, obj_id):
return bool(self._get_external_correlation('image', '', obj_id, 'ocr').get('ocr')) return bool(self.get_correlation('ocr').get('ocr'))
def get_images(self): def get_images(self):
images = [] images = []

View File

@ -228,6 +228,14 @@ class Ocr(AbstractObject):
def remove(self, val): def remove(self, val):
return r_object.srem(f'ocr:{self.id}', val) return r_object.srem(f'ocr:{self.id}', val)
def update_correlation(self):
    """
    Re-attach to this object every correlation of the image sharing its id.

    The correlations of the image ('image', '', self.id) are fetched and
    each correlated object is added as a correlation of this object.
    Entries of type 'ocr' are skipped.
    """
    correlations = self.get_obj_correlations('image', '', self.id)
    for correl_type, raw_objs in correlations.items():
        # Do not copy the image -> ocr links
        if correl_type == 'ocr':
            continue
        for raw in raw_objs:
            # Raw value is '<subtype>:<id>': only the first ':' is a separator
            subtype, correl_id = raw.split(':', 1)
            self.add_correlation(correl_type, subtype, correl_id)
def create(self, extracted_texts, tags=[]): def create(self, extracted_texts, tags=[]):
r_object.sadd(f'{self.type}:all', self.id) r_object.sadd(f'{self.type}:all', self.id)
for extracted in extracted_texts: for extracted in extracted_texts:
@ -235,6 +243,9 @@ class Ocr(AbstractObject):
if len(text) > 1: if len(text) > 1:
str_coords = self.create_coord_str(bbox) str_coords = self.create_coord_str(bbox)
self.add(str_coords, text) self.add(str_coords, text)
# Correlations
self.update_correlation()
self.add_correlation('image', '', self.id) self.add_correlation('image', '', self.id)
for tag in tags: for tag in tags:

View File

@ -225,11 +225,11 @@ class AbstractObject(ABC):
## Correlation ## ## Correlation ##
def _get_external_correlation(self, req_type, req_subtype, req_id, obj_type): def get_obj_correlations(self, obj_type, obj_subtype, obj_id, filter_types=[]):
""" """
Get object correlation Get object correlation
""" """
return get_correlations(req_type, req_subtype, req_id, filter_types=[obj_type]) return get_correlations(obj_type, obj_subtype, obj_id, filter_types=filter_types)
def get_correlation(self, obj_type): def get_correlation(self, obj_type):
""" """

View File

@ -17,6 +17,7 @@ sys.path.append(os.environ['AIL_BIN'])
# Import Project packages # Import Project packages
################################## ##################################
from modules.abstract_module import AbstractModule from modules.abstract_module import AbstractModule
from lib.ConfigLoader import ConfigLoader
from lib import chats_viewer from lib import chats_viewer
from lib.objects import Messages from lib.objects import Messages
from lib.objects import Ocrs from lib.objects import Ocrs
@ -68,30 +69,45 @@ class OcrExtractor(AbstractModule):
# Waiting time in seconds between to message processed # Waiting time in seconds between to message processed
self.pending_seconds = 1 self.pending_seconds = 1
config_loader = ConfigLoader()
self.r_cache = config_loader.get_redis_conn("Redis_Cache")
# Send module state to logs # Send module state to logs
self.logger.info(f'Module {self.module_name} initialized') self.logger.info(f'Module {self.module_name} initialized')
def is_cached(self):
    """
    Check whether the 'ocr:no:<id>' key of the current object is in the cache.

    :return: the result of the cache `exists` call for that key
    """
    cache_key = f'ocr:no:{self.obj.id}'
    return self.r_cache.exists(cache_key)
def add_to_cache(self):
    """
    Store the 'ocr:no:<id>' key of the current object in the cache.

    The key is written with `setex`, a value of 0 and an expiry of 86400.
    """
    cache_key = f'ocr:no:{self.obj.id}'
    expire = 86400  # presumably seconds (one day), per Redis SETEX semantics
    self.r_cache.setex(cache_key, expire, 0)
def compute(self, message): def compute(self, message):
image = self.get_obj() image = self.get_obj()
path = image.get_filepath() print(image.id)
print(image)
languages = get_model_languages(image)
print(languages)
ocr = Ocrs.Ocr(image.id) ocr = Ocrs.Ocr(image.id)
ocr.delete() if self.is_cached():
return None
if not ocr.exists(): if not ocr.exists():
path = image.get_filepath()
languages = get_model_languages(image)
print(languages)
texts = Ocrs.extract_text(path, languages) texts = Ocrs.extract_text(path, languages)
if texts: if texts:
print('create')
ocr = Ocrs.create(image.id, texts) ocr = Ocrs.create(image.id, texts)
self.add_message_to_queue(ocr) self.add_message_to_queue(ocr)
# Save in cache
else:
print('no text detected')
self.add_to_cache()
else:
print('update correlation')
ocr.update_correlation()
if __name__ == '__main__': if __name__ == '__main__':
module = OcrExtractor() module = OcrExtractor()
module.run() module.run()
# from lib.objects import Images
# module.obj = Images.Image('')
# module.compute('')

26
update/v5.5/Update.py Executable file
View File

@ -0,0 +1,26 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
import os
import sys
sys.path.append(os.environ['AIL_HOME'])
##################################
# Import Project packages
##################################
from update.bin.ail_updater import AIL_Updater
from lib import ail_updates
from lib import chats_viewer
class Updater(AIL_Updater):
    """Updater for this release: adds nothing on top of AIL_Updater."""

    def __init__(self, version):
        # Only forwards the version string to the parent updater
        super().__init__(version)
if __name__ == '__main__':
    # Run the chats_viewer correlation fix before the regular update steps
    chats_viewer.fix_correlations_subchannel_message()
    Updater('v5.5').run_update()

40
update/v5.5/Update.sh Executable file
View File

@ -0,0 +1,40 @@
#!/bin/bash
#
# AIL v5.5 update script.
#
# Steps: shut AIL down (LAUNCH.sh -ks), refresh the git submodules, upgrade
# the easyocr python package, call LAUNCH.sh with -lrv and -lkv, then run
# update/v5.5/Update.py.
#
# Requires AIL_HOME, AIL_REDIS, AIL_BIN and AIL_FLASK to be set.

# Abort early when a required environment variable is missing.
[ -z "$AIL_HOME" ] && echo "Needs the env var AIL_HOME. Run the script from the virtual environment." && exit 1;
[ -z "$AIL_REDIS" ] && echo "Needs the env var AIL_REDIS. Run the script from the virtual environment." && exit 1;
# The message previously named AIL_ARDB although AIL_BIN is the variable tested.
[ -z "$AIL_BIN" ] && echo "Needs the env var AIL_BIN. Run the script from the virtual environment." && exit 1;
[ -z "$AIL_FLASK" ] && echo "Needs the env var AIL_FLASK. Run the script from the virtual environment." && exit 1;

export PATH="$AIL_HOME:$PATH"
export PATH="$AIL_REDIS:$PATH"
export PATH="$AIL_BIN:$PATH"
export PATH="$AIL_FLASK:$PATH"

# ANSI colour escapes, interpreted by `echo -e`.
GREEN="\\033[1;32m"
DEFAULT="\\033[0;39m"

# Colour codes contain '[': keep them quoted so the shell never globs them.
echo -e "${GREEN}Shutting down AIL ...${DEFAULT}"
bash "${AIL_BIN}/LAUNCH.sh" -ks
wait

# SUBMODULES #
git submodule update

echo ""
echo -e "${GREEN}Updating python packages ...${DEFAULT}"
echo ""
pip install -U easyocr

bash "${AIL_BIN}/LAUNCH.sh" -lrv
bash "${AIL_BIN}/LAUNCH.sh" -lkv

echo ""
echo -e "${GREEN}Updating AIL VERSION ...${DEFAULT}"
echo ""
python "${AIL_HOME}/update/v5.5/Update.py"
wait

echo ""
echo ""

exit 0