From 4cb47e8af3915ae5928af50384a891e06f262a5a Mon Sep 17 00:00:00 2001 From: terrtia Date: Thu, 11 Apr 2024 12:15:47 +0200 Subject: [PATCH] chg: [ocr] detect and translate language + show ocr view + add languages blueprint --- bin/lib/objects/Messages.py | 13 +- bin/lib/objects/Ocrs.py | 53 ++++--- bin/lib/objects/ail_objects.py | 29 ++++ var/www/Flask_server.py | 2 + var/www/blueprints/languages_ui.py | 83 +++++++++++ var/www/blueprints/objects_ocr.py | 27 +++- .../chats_explorer/block_message.html | 7 +- var/www/templates/objects/ocr/ShowOcr.html | 133 ++++++++++++++++++ var/www/templates/objects/ocr/card_ocr.html | 71 +++++++++- 9 files changed, 386 insertions(+), 32 deletions(-) create mode 100644 var/www/blueprints/languages_ui.py create mode 100644 var/www/templates/objects/ocr/ShowOcr.html diff --git a/bin/lib/objects/Messages.py b/bin/lib/objects/Messages.py index b3b30457..ecf95cc0 100755 --- a/bin/lib/objects/Messages.py +++ b/bin/lib/objects/Messages.py @@ -140,12 +140,15 @@ class Message(AbstractObject): # TODO get channel ID # TODO get thread ID + def _get_image_ocr(self, obj_id): + return bool(self._get_external_correlation('image', '', obj_id, 'ocr').get('ocr')) + def get_images(self): images = [] for child in self.get_childrens(): obj_type, _, obj_id = child.split(':', 2) if obj_type == 'image': - images.append(obj_id) + images.append({'id': obj_id, 'ocr': self._get_image_ocr(obj_id)}) return images def get_user_account(self, meta=False): @@ -206,12 +209,6 @@ class Message(AbstractObject): else: return None - def _set_translation(self, translation): - """ - Set translated content - """ - return self._set_field('translated', translation) # translation by hash ??? -> avoid translating multiple time - # def get_ail_2_ail_payload(self): # payload = {'raw': self.get_gzip_content(b64=True)} # return payload @@ -323,7 +320,6 @@ class Message(AbstractObject): # content = self.get_content() # translated = argostranslate.translate.translate(content, 'ru', 'en') # # Save translation - # self._set_translation(translated) # return translated ## Language ## @@ -347,7 +343,6 @@ class Message(AbstractObject): if not language and content: language = self.detect_language() if translation and content: - self._set_translation(translation) self.set_translation(language, translation) for tag in tags: self.add_tag(tag) diff --git a/bin/lib/objects/Ocrs.py b/bin/lib/objects/Ocrs.py index 3df745b3..9d48b4bd 100755 --- a/bin/lib/objects/Ocrs.py +++ b/bin/lib/objects/Ocrs.py @@ -61,12 +61,18 @@ class Ocr(AbstractObject): dict_content[rounded_y].append((int(x), int(y), extracted[-1])) content = '' + new_line = True l_key = sorted(dict_content.keys()) for key in l_key: dict_content[key] = sorted(dict_content[key], key=lambda c: c[0]) for text in dict_content[key]: - content = f'{content} {text[2]}' + if new_line: + content = f'{content}{text[2]}' + new_line = False + else: + content = f'{content} {text[2]}' content = f'{content}\n' + new_line = True # Set Cache if content: @@ -94,6 +100,13 @@ class Ocr(AbstractObject): def get_basename(self): # TODO return 'ocr' + def get_language(self): + languages = self.get_languages() + if languages: + return languages.pop() + else: + return None + def get_link(self, flask_context=False): if flask_context: url = url_for('correlation.show_correlation', type=self.type, id=self.id) @@ -128,10 +141,9 @@ class Ocr(AbstractObject): return obj # options: set of optional meta fields - def get_meta(self, options=None, timestamp=None, translation_target=''): + def get_meta(self, options=None, translation_target=''): """ :type options: set - :type timestamp: float """ if options is None: options = set() @@ -144,23 +156,21 @@ class Ocr(AbstractObject): if 'link' in options: meta['link'] = self.get_link(flask_context=True) if 'icon' in options: - meta['icon'] = self.get_svg_icon() + meta['svg_icon'] = self.get_svg_icon() if 'img' in options: meta['img'] = self.draw_bounding_boxs() if 'map' in options: meta['map'] = self.get_img_map_coords() - - # # TODO - # if 'language' in options: - # meta['language'] = self.get_language() - # if 'translation' in options and translation_target: - # if meta.get('language'): - # source = meta['language'] - # else: - # source = None - # meta['translation'] = self.translate(content=meta.get('content'), source=source, target=translation_target) - # if 'language' in options: - # meta['language'] = self.get_language() + if 'language' in options: + meta['language'] = self.get_language() + if 'translation' in options and translation_target: + if meta.get('language'): + source = meta['language'] + else: + source = None + meta['translation'] = self.translate(content=meta.get('content'), source=source, target=translation_target) + if 'language' in options: + meta['language'] = self.get_language() return meta def get_objs_container(self): @@ -277,3 +287,14 @@ def get_all_ocrs_objects(filters={}): for obj_id in get_ids(): yield Ocr(obj_id) + +#### API #### +def api_get_ocr(obj_id, translation_target=None): + ocr = Ocr(obj_id) + if not ocr.exists(): + return {"status": "error", "reason": "Unknown ocr"}, 404 + meta = ocr.get_meta({'content', 'icon', 'img', 'language', 'link', 'map', 'translation'}, translation_target=translation_target) + return meta, 200 + + + diff --git a/bin/lib/objects/ail_objects.py b/bin/lib/objects/ail_objects.py index 0d55f20d..143979ec 100755 --- a/bin/lib/objects/ail_objects.py +++ b/bin/lib/objects/ail_objects.py @@ -14,6 +14,7 @@ from lib.ail_core import get_all_objects, get_object_all_subtypes, get_objects_w from lib import correlations_engine from lib import relationships_engine from lib import btc_ail +from lib import Language from lib import Tag from lib import chats_viewer @@ -275,6 +276,34 @@ def get_object_card_meta(obj_type, subtype, id, related_btc=False): meta["add_tags_modal"] = Tag.get_modal_add_tags(obj.id, obj.get_type(), obj.get_subtype(r_str=True)) return meta +#### OBJ LANGUAGES #### + +def api_detect_language(obj_type, subtype, obj_id): + obj = get_object(obj_type, subtype, obj_id) + if not obj.exists(): + return {"status": "error", "reason": "Unknown obj"}, 404 + lang = obj.detect_language() + return {"language": lang}, 200 + +def api_manually_translate(obj_type, subtype, obj_id, source, translation_target, translation): + obj = get_object(obj_type, subtype, obj_id) + if not obj.exists(): + return {"status": "error", "reason": "Unknown obj"}, 404 + if translation: + if len(translation) > 200000: # TODO REVIEW LIMIT + return {"status": "error", "reason": "Max Size reached"}, 400 + all_languages = Language.get_translation_languages() + if source not in all_languages: + return {"status": "error", "reason": "Unknown source Language"}, 400 + obj_language = obj.get_language() + if obj_language != source: + obj.edit_language(obj_language, source) + if translation: + if translation_target not in all_languages: + return {"status": "error", "reason": "Unknown target Language"}, 400 + obj.set_translation(translation_target, translation) + # TODO SANITYZE translation + return None, 200 #### OBJ FILTERS #### diff --git a/var/www/Flask_server.py b/var/www/Flask_server.py index b808f5c5..613c40da 100755 --- a/var/www/Flask_server.py +++ b/var/www/Flask_server.py @@ -35,6 +35,7 @@ import Flask_config from blueprints.root import root from blueprints.crawler_splash import crawler_splash from blueprints.correlation import correlation +from blueprints.languages_ui import languages_ui from blueprints.tags_ui import tags_ui from blueprints.import_export import import_export from blueprints.investigations_b import investigations_b @@ -98,6 +99,7 @@ app.config['MAX_CONTENT_LENGTH'] = 900 * 1024 * 1024 app.register_blueprint(root, url_prefix=baseUrl) app.register_blueprint(crawler_splash, url_prefix=baseUrl) app.register_blueprint(correlation, url_prefix=baseUrl) +app.register_blueprint(languages_ui, url_prefix=baseUrl) app.register_blueprint(tags_ui, url_prefix=baseUrl) app.register_blueprint(import_export, url_prefix=baseUrl) app.register_blueprint(investigations_b, url_prefix=baseUrl) diff --git a/var/www/blueprints/languages_ui.py b/var/www/blueprints/languages_ui.py new file mode 100644 index 00000000..17efb8e5 --- /dev/null +++ b/var/www/blueprints/languages_ui.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +""" + Blueprint Flask: crawler splash endpoints: dashboard, onion crawler ... +""" + +import os +import sys +import json + +from flask import Flask, render_template, jsonify, request, Blueprint, redirect, url_for, Response, abort +from flask_login import login_required, current_user + +# Import Role_Manager +from Role_Manager import login_admin, login_analyst, login_read_only + +sys.path.append(os.environ['AIL_BIN']) +################################## +# Import Project packages +################################## +from lib import ail_core +from lib import Language +from lib import Tag +from lib.objects import ail_objects + +# ============ BLUEPRINT ============ +languages_ui = Blueprint('languages_ui', __name__, template_folder=os.path.join(os.environ['AIL_FLASK'], 'templates/chats_explorer')) + +# ============ VARIABLES ============ +# bootstrap_label = ['primary', 'success', 'danger', 'warning', 'info'] + +def create_json_response(data, status_code): + return Response(json.dumps(data, indent=2, sort_keys=True), mimetype='application/json'), status_code + +# ============ FUNCTIONS ============ + +# ============= ROUTES ============== +@languages_ui.route("/languages/object/translate", methods=['POST']) +@login_required +@login_read_only +def translate_object(): + obj_type = request.form.get('type') + subtype = request.form.get('subtype') + obj_id = request.form.get('id') + source = request.form.get('language_target') + target = request.form.get('target') + translation = request.form.get('translation') + if target == "Don't Translate": + target = None + + resp = ail_objects.api_manually_translate(obj_type, subtype, obj_id, source, target, translation) + if resp[1] != 200: + return create_json_response(resp[0], resp[1]) + else: + if request.referrer: + return redirect(request.referrer) + else: + if obj_type == 'ocr': + return redirect(url_for('objects_ocr.object_ocr', id=obj_id, target=target)) # TODO change to support all objects + +@languages_ui.route("/languages/object/detect/language", methods=['GET']) +@login_required +@login_read_only +def detect_object_language(): + obj_type = request.args.get('type') + subtype = request.args.get('subtype') + obj_id = request.args.get('id') + target = request.args.get('target') + + resp = ail_objects.api_detect_language(obj_type, subtype, obj_id) + if resp[1] != 200: + return create_json_response(resp[0], resp[1]) + else: + if request.referrer: + return redirect(request.referrer) + else: + if obj_type == 'ocr': + return redirect(url_for('objects_ocr.object_ocr', id=obj_id, target=target)) # TODO change to support all objects + + + + diff --git a/var/www/blueprints/objects_ocr.py b/var/www/blueprints/objects_ocr.py index d9008019..eca7f1fe 100644 --- a/var/www/blueprints/objects_ocr.py +++ b/var/www/blueprints/objects_ocr.py @@ -5,9 +5,12 @@ Blueprint Flask: crawler splash endpoints: dashboard, onion crawler ... ''' +import json import os import sys +from io import BytesIO + from flask import Flask, render_template, jsonify, request, Blueprint, redirect, url_for, Response, abort, send_file, send_from_directory from flask_login import login_required, current_user @@ -18,6 +21,8 @@ sys.path.append(os.environ['AIL_BIN']) ################################## # Import Project packages ################################## +from lib import Language +from lib import Tag from lib.objects import Ocrs # ============ BLUEPRINT ============ @@ -26,7 +31,8 @@ objects_ocr = Blueprint('objects_ocr', __name__, template_folder=os.path.join(os # ============ VARIABLES ============ bootstrap_label = ['primary', 'success', 'danger', 'warning', 'info'] -from io import BytesIO +def create_json_response(data, status_code): + return Response(json.dumps(data, indent=2, sort_keys=True), mimetype='application/json'), status_code # ============ FUNCTIONS ============ @objects_ocr.route('/ocr/') @@ -43,5 +49,24 @@ def ocr_image(filename): return send_file(BytesIO(ocr.draw_bounding_boxs()), mimetype='image/png') +@objects_ocr.route("/objects/ocr", methods=['GET']) +@login_required +@login_read_only +def object_ocr(): + obj_id = request.args.get('id') + target = request.args.get('target') + if target == "Don't Translate": + target = None + meta = Ocrs.api_get_ocr(obj_id, target) + if meta[1] != 200: + return create_json_response(meta[0], meta[1]) + else: + meta = meta[0] + languages = Language.get_translation_languages() + return render_template("ShowOcr.html", meta=meta, + ail_tags=Tag.get_modal_add_tags(meta['id'], meta['type'], meta['subtype']), + translation_languages=languages, translation_target=target) + + # ============= ROUTES ============== diff --git a/var/www/templates/chats_explorer/block_message.html b/var/www/templates/chats_explorer/block_message.html index 1d7b4d75..d8d1c1e2 100644 --- a/var/www/templates/chats_explorer/block_message.html +++ b/var/www/templates/chats_explorer/block_message.html @@ -67,7 +67,12 @@ {% endif %} {% if message['images'] %} {% for message_image in message['images'] %} - + + {% if message_image['ocr'] %} + + OCR + + {% endif %} {% endfor %} {% endif %} {% if message['files-names'] %} diff --git a/var/www/templates/objects/ocr/ShowOcr.html b/var/www/templates/objects/ocr/ShowOcr.html new file mode 100644 index 00000000..d5dd3d28 --- /dev/null +++ b/var/www/templates/objects/ocr/ShowOcr.html @@ -0,0 +1,133 @@ + + + + + OCR - AIL + + + + + + + + + + + + + + + + + + + + + {% include 'nav_bar.html' %} + +
+
+ + {% include 'sidebars/sidebar_objects.html' %} + +
+ + {% with meta=meta, is_correlation=False %} + {% include 'objects/ocr/card_ocr.html' %} + {% endwith %} + + +{# {% if meta['extracted_matches'] %}#} +{#
#} +{#
#} +{#
#} +{#
#} +{#
#} +{#
#} +{# Extracted  #} +{#
{{meta['extracted_matches']|length}}
#} +{#
#} +{#
#} +{#
#} +{# #} +{#
#} +{#
#} +{#
#} +{##} +{#
#} +{#
#} +{# #} +{# #} +{# #} +{# #} +{# #} +{# #} +{# #} +{# #} +{# #} +{# {% for match in meta['extracted_matches'] %}#} +{# #} +{# #} +{# #} +{# #} +{# #} +{# {% endfor %}#} +{# #} +{#
TypeIDExtracted
#} +{# #} +{# #} +{# #} +{# {{ meta['extracted_matches'][match]['icon']['icon'] }}#} +{# #} +{# #} +{# {{ meta['extracted_matches'][match]['subtype'] }}#} +{# {{ meta['extracted_matches'][match]['id'] }}#} +{# {% for row in meta['extracted_matches'][match]['matches'] %}#} +{# {{ row[2] }}
#} +{# {% endfor %}#} +{#
#} +{#
#} +{#
#} +{##} +{#
#} +{#
#} +{# {% endif %}#} + + + {% with translate_url=url_for('objects_ocr.object_ocr', id=meta['id']), obj_id=meta['id'] %} + {% include 'chats_explorer/block_translation.html' %} + {% endwith %} +
+ +
+ +
+
+ + + + + + diff --git a/var/www/templates/objects/ocr/card_ocr.html b/var/www/templates/objects/ocr/card_ocr.html index 4b9f787d..d9cebe4b 100644 --- a/var/www/templates/objects/ocr/card_ocr.html +++ b/var/www/templates/objects/ocr/card_ocr.html @@ -19,11 +19,6 @@