diff --git a/bin/lib/ail_core.py b/bin/lib/ail_core.py index 9530b75d..945e8634 100755 --- a/bin/lib/ail_core.py +++ b/bin/lib/ail_core.py @@ -18,14 +18,14 @@ config_loader = None AIL_OBJECTS = sorted({'chat', 'chat-subchannel', 'chat-thread', 'cookie-name', 'cve', 'cryptocurrency', 'decoded', 'domain', 'etag', 'favicon', 'file-name', 'hhhash', - 'item', 'image', 'message', 'pgp', 'screenshot', 'title', 'user-account', 'username'}) + 'item', 'image', 'message', 'ocr', 'pgp', 'screenshot', 'title', 'user-account', 'username'}) AIL_OBJECTS_WITH_SUBTYPES = {'chat', 'chat-subchannel', 'cryptocurrency', 'pgp', 'username', 'user-account'} # TODO by object TYPE ???? AIL_OBJECTS_CORRELATIONS_DEFAULT = sorted({'chat', 'chat-subchannel', 'chat-thread', 'cve', 'cryptocurrency', 'decoded', 'domain', 'favicon', 'file-name', - 'item', 'image', 'message', 'pgp', 'screenshot', 'title', 'user-account', 'username'}) + 'item', 'image', 'message', 'ocr', 'pgp', 'screenshot', 'title', 'user-account', 'username'}) def get_ail_uuid(): ail_uuid = r_serv_db.get('ail:uuid') @@ -105,7 +105,7 @@ def unpack_obj_global_id(global_id, r_type='tuple'): obj = global_id.split(':', 2) return {'type': obj[0], 'subtype': obj[1], 'id': obj[2]} else: # tuple(type, subtype, id) - return global_id.split(':', 2) + return global_id.split(':', 2) # TODO REPLACE get_obj_type_subtype_id_from_global_id(global_id) def unpack_objs_global_id(objs_global_id, r_type='tuple'): objs = [] diff --git a/bin/lib/correlations_engine.py b/bin/lib/correlations_engine.py index 378687bf..6afe27da 100755 --- a/bin/lib/correlations_engine.py +++ b/bin/lib/correlations_engine.py @@ -53,9 +53,10 @@ CORRELATION_TYPES_BY_OBJ = { "favicon": ["domain", "item"], # TODO Decoded "file-name": ["chat", "message"], "hhhash": ["domain"], - "image": ["chat", "message", "user-account"], + "image": ["chat", "message", "ocr", "user-account"], "item": ["cve", "cryptocurrency", "decoded", "domain", "favicon", "pgp", "screenshot", "title", 
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
OCR objects.

An Ocr object stores the text fragments extracted from an image together with
the four corners of the bounding box of each fragment.
"""

import os
import sys

from datetime import datetime, timezone
from functools import lru_cache
from io import BytesIO
from PIL import Image
from PIL import ImageDraw

from pymisp import MISPObject

sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from lib.objects.abstract_object import AbstractObject
from lib.ConfigLoader import ConfigLoader
# from lib import Language
# from lib.data_retention_engine import update_obj_date, get_obj_date_first

from flask import url_for

config_loader = ConfigLoader()
r_cache = config_loader.get_redis_conn("Redis_Cache")
r_object = config_loader.get_db_conn("Kvrocks_Objects")
baseurl = config_loader.get_config_str("Notifications", "ail_domain")
IMAGE_FOLDER = config_loader.get_files_directory('images')
config_loader = None

# Storage layout:
#   SET ocr:<id> -> members 'x1,y1:x2,y2:x3,y3:x4,y4:extracted_text'


class Ocr(AbstractObject):
    """
    AIL OCR Object.

    Holds the text extracted from one image. Each extracted fragment is
    stored as one member of the set ``ocr:<id>``, encoded as
    ``x1,y1:x2,y2:x3,y3:x4,y4:extracted_text``. The object id is also used
    to locate the source image (see get_image_path) and to correlate the
    object with the image of the same id (see create).
    """

    def __init__(self, id):
        super(Ocr, self).__init__('ocr', id)

    def exists(self):
        """Return the result of the EXISTS check on the ``ocr:<id>`` key."""
        return r_object.exists(f'ocr:{self.id}')

    @staticmethod
    def _split_extracted(extracted):
        """
        Split a stored member into (corners, text).

        The split is limited to 4 separators so that any ':' inside the
        extracted text is kept as part of the text.

        :return: (list of 'x,y' corner strings, extracted text)
        """
        parts = extracted.split(':', 4)
        return parts[:-1], parts[-1]

    def get_content(self, r_type='str'):
        """
        Returns content: every extracted text, each one preceded by a newline.

        The content is read from the cache when available, otherwise it is
        rebuilt from the stored members and cached for 300 seconds.

        :param r_type: 'str' or 'bytes'
        :return: the content as str for 'str'; the encoded content for
                 'bytes' (None if the content is empty); None for any
                 other r_type
        """
        cache_key = f'content:{self.get_global_id()}'
        content = r_cache.get(cache_key)
        if not content:
            content = ''.join(f'\n{self._split_extracted(extracted)[1]}'
                              for extracted in r_object.smembers(f'ocr:{self.id}'))
            # Set Cache
            if content:
                # value and TTL are written in a single command, so the key
                # can never be left without an expiration
                r_cache.set(cache_key, content, ex=300)

        if r_type == 'str':
            return content
        if r_type == 'bytes' and content:
            return content.encode()
        return None

    def get_date(self):  # TODO
        """
        Return the object date as 'YYYYMMDD' (UTC).

        NOTE(review): get_timestamp() is not defined in this class; it is
        presumably expected from the parent class - confirm.
        """
        timestamp = self.get_timestamp()
        # timezone-aware equivalent of the deprecated datetime.utcfromtimestamp()
        return datetime.fromtimestamp(float(timestamp), tz=timezone.utc).strftime('%Y%m%d')

    def get_link(self, flask_context=False):
        """
        Return the URL of the correlation page of this object.

        :param flask_context: if True, build the URL with flask url_for,
                              otherwise build it from the configured base URL
        """
        if flask_context:
            url = url_for('correlation.show_correlation', type=self.type, id=self.id)
        else:
            url = f'{baseurl}/correlation/show?type={self.type}&id={self.id}'
        return url

    def get_svg_icon(self):
        """Return the icon description (style, glyph, color, radius) of the object."""
        return {'style': 'fas', 'icon': '\uf20a', 'color': 'yellow', 'radius': 5}

    def get_image_path(self):
        """
        Return the resolved path of the source image.

        The first 12 characters of the id are used as 6 nested 2-character
        directories under IMAGE_FOLDER; the rest of the id is the file name.
        """
        directories = [self.id[i:i + 2] for i in range(0, 12, 2)]
        filename = os.path.join(IMAGE_FOLDER, *directories, self.id[12:])
        return os.path.realpath(filename)

    def get_misp_object(self):  # TODO
        """
        Return a MISP object with first_seen set from the object date.

        TODO: no attribute is exported yet (obj_attrs is empty), so the
        tagging loop below currently does nothing.
        """
        obj = MISPObject('instant-message', standalone=True)
        obj_date = self.get_date()
        if obj_date:
            obj.first_seen = obj_date
        else:
            self.logger.warning(
                f'Export error, None seen {self.type}:{self.subtype}:{self.id}, first={obj_date}')

        # TODO add attributes: first-seen, raw-data, sensor
        obj_attrs = []
        for obj_attr in obj_attrs:
            for tag in self.get_tags():
                obj_attr.add_tag(tag)
        return obj

    # options: set of optional meta fields
    def get_meta(self, options=None, timestamp=None, translation_target=''):
        """
        Return the object metadata: default meta (with tags) and content.

        Optional fields, added when their name is in options:
        'investigations', 'link', 'icon', 'img' (PNG bytes of the image with
        the bounding boxes drawn) and 'map' (see get_img_map_coords).

        timestamp and translation_target are accepted but not used yet.

        :type options: set
        :type timestamp: float
        """
        if options is None:
            options = set()
        meta = self.get_default_meta(tags=True)
        meta['content'] = self.get_content()

        # optional meta fields
        if 'investigations' in options:
            meta['investigations'] = self.get_investigations()
        if 'link' in options:
            meta['link'] = self.get_link(flask_context=True)
        if 'icon' in options:
            meta['icon'] = self.get_svg_icon()
        if 'img' in options:
            meta['img'] = self.draw_bounding_boxs()
        if 'map' in options:
            meta['map'] = self.get_img_map_coords()

        # TODO 'language' and 'translation' options
        return meta

    def get_objs_container(self):  # TODO
        """Not implemented yet: returns None."""
        return None

    def create_coord_str(self, bbox):
        """
        Encode a bounding box as 'x1,y1:x2,y2:x3,y3:x4,y4'.

        :param bbox: exactly 4 (x, y) corners; coordinates are cast to int
        :raises ValueError: if bbox does not contain exactly 4 corners
        """
        c1, c2, c3, c4 = bbox  # enforce the 4 corners expected by the storage format
        return ':'.join(f'{int(x)},{int(y)}' for x, y in (c1, c2, c3, c4))

    def _unpack_coord(self, coord):
        """Split an 'x,y' corner string into ['x', 'y']."""
        return coord.split(',', 1)

    def get_coords(self):
        """
        Return the bounding boxes of every extracted text.

        :return: list of boxes, each box being a list of (x, y) int tuples
        """
        coords = []
        for extracted in r_object.smembers(f'ocr:{self.id}'):
            corners, _ = self._split_extracted(extracted)
            box = []
            for corner in corners:
                x, y = self._unpack_coord(corner)
                box.append((int(x), int(y)))
            coords.append(box)
        return coords

    def get_img_map_coords(self):
        """
        Return, for every extracted text, its box as a flat coordinate string.

        :return: list of ('x1,y1,x2,y2,x3,y3,x4,y4', extracted text) tuples
        """
        coords = []
        for extracted in r_object.smembers(f'ocr:{self.id}'):
            extract = extracted.split(':', 4)
            flat = []
            for corner in extract[:4]:
                x, y = self._unpack_coord(corner)
                flat.extend((x, y))
            coords.append((','.join(flat), extract[4]))
        return coords

    def edit(self, coordinates, text, new_text, new_coordinates=None):
        """Not implemented yet."""
        pass

    def add(self, coordinates, text):
        """
        Store one extracted text.

        :param coordinates: box encoded by create_coord_str
        :param text: extracted text
        :return: result of the SADD command
        """
        val = f'{coordinates}:{text}'
        return r_object.sadd(f'ocr:{self.id}', val)

    def remove(self, val):
        """Remove the stored member val; return the result of the SREM command."""
        return r_object.srem(f'ocr:{self.id}', val)

    def create(self, extracted_texts, tags=None):
        """
        Store the extracted texts, correlate the object with the image of
        the same id and add the tags.

        :param extracted_texts: iterable of (bbox, text)
        :param tags: optional iterable of tags to add to the object
        """
        for bbox, text in extracted_texts:
            str_coords = self.create_coord_str(bbox)
            self.add(str_coords, text)
        self.add_correlation('image', '', self.id)

        # tags defaults to None: a list default would be shared between calls
        for tag in tags or ():
            self.add_tag(tag)

    # # WARNING: UNCLEAN DELETE /!\ TEST ONLY /!\
    def delete(self):
        """Not implemented yet."""
        pass

    def draw_bounding_boxs(self):
        """
        Draw every bounding box, in yellow, on a copy of the source image.

        :return: the resulting image, PNG encoded
        :rtype: bytes
        """
        # convert() returns a new image, so the file handle opened by
        # Image.open() can be released as soon as the copy is made
        with Image.open(self.get_image_path()) as source:
            img = source.convert("RGBA")
        draw = ImageDraw.Draw(img)
        for bbox in self.get_coords():
            c1, c2, c3, c4 = bbox
            for start, end in ((c1, c2), (c2, c3), (c3, c4), (c4, c1)):
                draw.line((tuple(start), tuple(end)), fill="yellow")
        # img.show()
        buff = BytesIO()
        img.save(buff, "PNG")
        return buff.getvalue()


def create(obj_id, detections, tags=None):
    """
    Return the Ocr object obj_id, creating it first if it does not exist.

    :param obj_id: object id
    :param detections: iterable of (bbox, text), only used on creation
    :param tags: optional iterable of tags, only used on creation
    """
    obj = Ocr(obj_id)
    if not obj.exists():
        obj.create(detections, tags=tags)
    # TODO Edit
    return obj


@lru_cache(maxsize=4)
def _get_reader(languages):
    """
    Return an easyocr Reader for a tuple of languages.

    Readers are cached, so each language combination is only loaded once
    per process instead of once per image.
    """
    import easyocr
    return easyocr.Reader(list(languages))


def extract_text(image_path, languages, threshold=0.2):
    """
    Extract the texts of an image.

    :param image_path: path of the image
    :param languages: languages passed to the easyocr Reader
    :param threshold: only detections with a score strictly greater than
                      this value are kept
    :return: list of (bbox, text)
    """
    reader = _get_reader(tuple(languages))
    return [(bbox, text) for bbox, text, score in reader.readtext(image_path) if score > threshold]

# TODO OCRS Class
a/bin/lib/objects/ail_objects.py b/bin/lib/objects/ail_objects.py index 6f4f750d..9d901a1a 100755 --- a/bin/lib/objects/ail_objects.py +++ b/bin/lib/objects/ail_objects.py @@ -9,8 +9,6 @@ sys.path.append(os.environ['AIL_BIN']) ################################## from lib.exceptions import AILObjectUnknown - - from lib.ConfigLoader import ConfigLoader from lib.ail_core import get_all_objects, get_object_all_subtypes, get_objects_with_subtypes, get_default_correlation_objects from lib import correlations_engine @@ -35,6 +33,7 @@ from lib.objects import HHHashs from lib.objects.Items import Item, get_all_items_objects, get_nb_items_objects from lib.objects import Images from lib.objects import Messages +from lib.objects import Ocrs from lib.objects import Pgps from lib.objects.Screenshots import Screenshot from lib.objects import Titles @@ -93,6 +92,8 @@ def get_object(obj_type, subtype, obj_id): return Images.Image(obj_id) elif obj_type == 'message': return Messages.Message(obj_id) + elif obj_type == 'ocr': + return Ocrs.Ocr(obj_id) elif obj_type == 'screenshot': return Screenshot(obj_id) elif obj_type == 'title': @@ -254,7 +255,7 @@ def get_objects_meta(objs, options=set(), flask_context=False): def get_object_card_meta(obj_type, subtype, id, related_btc=False): obj = get_object(obj_type, subtype, id) - meta = obj.get_meta(options={'chat', 'chats', 'created_at', 'icon', 'info', 'nb_messages', 'nb_participants', 'threads', 'username'}) + meta = obj.get_meta(options={'chat', 'chats', 'created_at', 'icon', 'info', 'map', 'nb_messages', 'nb_participants', 'threads', 'username'}) # meta['icon'] = obj.get_svg_icon() meta['svg_icon'] = obj.get_svg_icon() if subtype or obj_type == 'cookie-name' or obj_type == 'cve' or obj_type == 'etag' or obj_type == 'title' or obj_type == 'favicon' or obj_type == 'hhhash': diff --git a/bin/modules/Categ.py b/bin/modules/Categ.py index d5d2de82..b3948982 100755 --- a/bin/modules/Categ.py +++ b/bin/modules/Categ.py @@ -89,7 +89,7 @@ class 
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
The OcrExtractor Module
======================

Run OCR on image objects and create the corresponding Ocr objects.
"""

##################################
# Import External packages
##################################
import os
import sys

sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from modules.abstract_module import AbstractModule
from lib.objects import Ocrs


class OcrExtractor(AbstractModule):
    """
    OcrExtractor for AIL framework
    """

    # Languages passed to the OCR reader
    # TODO Get Language to extract -> add en by default
    DEFAULT_LANGUAGES = ['en', 'ru']

    def __init__(self):
        super(OcrExtractor, self).__init__()

        # Waiting time in seconds between two processed messages
        self.pending_seconds = 1

        self.languages = list(self.DEFAULT_LANGUAGES)

        # Send module state to logs
        self.logger.info(f'Module {self.module_name} initialized')

    def compute(self, message):
        """
        Extract the text of the current image object.

        Nothing is done if an Ocr object with the image id already exists.
        Otherwise the texts are extracted and, if any text was found, the
        Ocr object is created and passed to add_message_to_queue.

        :param message: module message (not used, the object is read with get_obj)
        """
        image = self.get_obj()
        self.logger.debug(f'OCR: {image}')

        ocr = Ocrs.Ocr(image.id)
        if ocr.exists():
            return

        # the image path is only resolved when an extraction is needed
        path = image.get_filepath()
        texts = Ocrs.extract_text(path, self.languages)
        self.logger.debug(f'OCR extracted: {texts}')
        if texts:
            ocr = Ocrs.create(image.id, texts)
            self.add_message_to_queue(ocr)


if __name__ == '__main__':

    module = OcrExtractor()
    module.run()
1bcb3415..0c0bda85 100644 --- a/requirements.txt +++ b/requirements.txt @@ -82,6 +82,9 @@ bcrypt>3.1.6 # Ail typo squatting ail_typo_squatting +# OCR +easyocr + # Tests nose2>=0.12.0 coverage>=5.5 diff --git a/var/www/Flask_server.py b/var/www/Flask_server.py index fe6d6d1d..b808f5c5 100755 --- a/var/www/Flask_server.py +++ b/var/www/Flask_server.py @@ -52,6 +52,7 @@ from blueprints.objects_etag import objects_etag from blueprints.objects_hhhash import objects_hhhash from blueprints.chats_explorer import chats_explorer from blueprints.objects_image import objects_image +from blueprints.objects_ocr import objects_ocr from blueprints.objects_favicon import objects_favicon from blueprints.api_rest import api_rest @@ -114,6 +115,7 @@ app.register_blueprint(objects_etag, url_prefix=baseUrl) app.register_blueprint(objects_hhhash, url_prefix=baseUrl) app.register_blueprint(chats_explorer, url_prefix=baseUrl) app.register_blueprint(objects_image, url_prefix=baseUrl) +app.register_blueprint(objects_ocr, url_prefix=baseUrl) app.register_blueprint(objects_favicon, url_prefix=baseUrl) app.register_blueprint(api_rest, url_prefix=baseUrl) diff --git a/var/www/blueprints/correlation.py b/var/www/blueprints/correlation.py index e23fbda4..9fc50fb1 100644 --- a/var/www/blueprints/correlation.py +++ b/var/www/blueprints/correlation.py @@ -139,6 +139,9 @@ def show_correlation(): correl_option = request.form.get('imageCheck') if correl_option: filter_types.append('image') + correl_option = request.form.get('ocrCheck') + if correl_option: + filter_types.append('ocr') correl_option = request.form.get('user_accountCheck') if correl_option: filter_types.append('user-account') diff --git a/var/www/blueprints/objects_ocr.py b/var/www/blueprints/objects_ocr.py new file mode 100644 index 00000000..d9008019 --- /dev/null +++ b/var/www/blueprints/objects_ocr.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +''' + Blueprint Flask: crawler splash endpoints: dashboard, onion 
import os
import sys

from io import BytesIO

from flask import Flask, render_template, jsonify, request, Blueprint, redirect, url_for, Response, abort, send_file, send_from_directory
from flask_login import login_required, current_user

# Import Role_Manager
from Role_Manager import login_admin, login_analyst, login_read_only, no_cache

sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from lib.objects import Ocrs

# ============ BLUEPRINT ============
objects_ocr = Blueprint('objects_ocr', __name__, template_folder=os.path.join(os.environ['AIL_FLASK'], 'templates/objects/ocr'))

# ============ VARIABLES ============
bootstrap_label = ['primary', 'success', 'danger', 'warning', 'info']

# ============ FUNCTIONS ============
@objects_ocr.route('/ocr/<path:filename>')
@login_required
@login_read_only
@no_cache
def ocr_image(filename):
    """
    Serve the image of an OCR object with its bounding boxes drawn, as PNG.

    The URL variable is a path: every '/' is removed to rebuild the object id.
    NOTE(review): the 64-70 length window presumably matches a 64-character
    id optionally split by up to 6 '/' - confirm.

    :param filename: object id, optionally containing '/'
    :return: PNG response; 404 if the name is empty, has an unexpected
             length, or the image file does not exist
    """
    if not filename:
        abort(404)
    if not 64 <= len(filename) <= 70:
        abort(404)
    filename = filename.replace('/', '')
    ocr = Ocrs.Ocr(filename)
    try:
        png = ocr.draw_bounding_boxs()
    except FileNotFoundError:
        # unknown id: no image on disk -> not found instead of a server error
        abort(404)
    return send_file(BytesIO(png), mimetype='image/png')


# ============= ROUTES ==============
+ + +

diff --git a/var/www/templates/objects/ocr/card_ocr.html b/var/www/templates/objects/ocr/card_ocr.html new file mode 100644 index 00000000..216b27bf --- /dev/null +++ b/var/www/templates/objects/ocr/card_ocr.html @@ -0,0 +1,103 @@ + + + +{% with modal_add_tags=ail_tags %} + {% include 'modals/add_tags.html' %} +{% endwith %} + +{% include 'modals/edit_tag.html' %} + + + +
+
+

{{ meta["id"] }} :

+
    +
  • + + + + + + + + + + + +
    + + + + {{ meta["svg_icon"]["icon"] }} + + + {{ meta['type'] }} +
    +
  • +
  • +
    +
    +
    + +
    +
    +
    + {% include 'objects/image/block_blur_img_slider.html' %} + + + + {% for c in meta['map'] %} + + {% endfor %} + +
    +
    +
    +
    +
  • +
  • +
    {{ meta['content'] }}
    +
  • + +
  • +
    + Tags: + {% for tag in meta['tags'] %} + + {% endfor %} + +
    +
  • +
+ + {% with obj_type='image', obj_id=meta['id'], obj_subtype='' %} + {% include 'modals/investigations_register_obj.html' %} + {% endwith %} + + +
+
\ No newline at end of file