chg: [ocr] detect and translate language + show ocr view + add languages blueprint

ocr
terrtia 2024-04-11 12:15:47 +02:00
parent ed13e8bca4
commit 4cb47e8af3
No known key found for this signature in database
GPG Key ID: 1E1B1F50D84613D0
9 changed files with 386 additions and 32 deletions

View File

@ -140,12 +140,15 @@ class Message(AbstractObject):
# TODO get channel ID
# TODO get thread ID
def _get_image_ocr(self, obj_id):
    """
    Tell whether an image has an 'ocr' entry in its external correlations.

    :param obj_id: ID of the image object
    :return: True when the 'ocr' entry of the correlation lookup is non-empty, else False
    """
    correlations = self._get_external_correlation('image', '', obj_id, 'ocr')
    ocr_entries = correlations.get('ocr')
    return bool(ocr_entries)
def get_images(self):
    """
    List the image objects attached to this message.

    Children are read from self.get_childrens() as 'type:subtype:id' strings
    (split on the first two ':' so the id itself may contain ':'); only
    children whose type is 'image' are kept.

    :return: list of dicts {'id': <image id>, 'ocr': <bool>}, where 'ocr' is
             the result of self._get_image_ocr() for that image
    """
    images = []
    for child in self.get_childrens():
        obj_type, _, obj_id = child.split(':', 2)
        if obj_type == 'image':
            # Exactly one dict per image: appending the bare id as well would
            # duplicate every image and mix str/dict entries, while consumers
            # index each entry with ['id'] and ['ocr'].
            images.append({'id': obj_id, 'ocr': self._get_image_ocr(obj_id)})
    return images
def get_user_account(self, meta=False):
@ -206,12 +209,6 @@ class Message(AbstractObject):
else:
return None
def _set_translation(self, translation):
    """
    Store the translated content of this object.

    :param translation: translated text, saved under the 'translated' field
    :return: whatever the underlying self._set_field() call returns
    """
    # TODO: translation by hash ??? -> avoid translating the same content multiple times
    result = self._set_field('translated', translation)
    return result
# def get_ail_2_ail_payload(self):
# payload = {'raw': self.get_gzip_content(b64=True)}
# return payload
@ -323,7 +320,6 @@ class Message(AbstractObject):
# content = self.get_content()
# translated = argostranslate.translate.translate(content, 'ru', 'en')
# # Save translation
# self._set_translation(translated)
# return translated
## Language ##
@ -347,7 +343,6 @@ class Message(AbstractObject):
if not language and content:
language = self.detect_language()
if translation and content:
self._set_translation(translation)
self.set_translation(language, translation)
for tag in tags:
self.add_tag(tag)

View File

@ -61,12 +61,18 @@ class Ocr(AbstractObject):
dict_content[rounded_y].append((int(x), int(y), extracted[-1]))
content = ''
new_line = True
l_key = sorted(dict_content.keys())
for key in l_key:
dict_content[key] = sorted(dict_content[key], key=lambda c: c[0])
for text in dict_content[key]:
content = f'{content} {text[2]}'
if new_line:
content = f'{content}{text[2]}'
new_line = False
else:
content = f'{content} {text[2]}'
content = f'{content}\n'
new_line = True
# Set Cache
if content:
@ -94,6 +100,13 @@ class Ocr(AbstractObject):
# Basename of an OCR object: currently always the constant string 'ocr' (placeholder, see TODO).
def get_basename(self): # TODO
return 'ocr'
def get_language(self):
    """
    Return a single language for this object, or None when it has none.

    The value is taken with pop() from the collection returned by
    self.get_languages(), so which element is returned depends on that
    collection's type.
    """
    detected = self.get_languages()
    if not detected:
        return None
    return detected.pop()
def get_link(self, flask_context=False):
if flask_context:
url = url_for('correlation.show_correlation', type=self.type, id=self.id)
@ -128,10 +141,9 @@ class Ocr(AbstractObject):
return obj
# options: set of optional meta fields
def get_meta(self, options=None, timestamp=None, translation_target=''):
def get_meta(self, options=None, translation_target=''):
"""
:type options: set
:type timestamp: float
"""
if options is None:
options = set()
@ -144,23 +156,21 @@ class Ocr(AbstractObject):
if 'link' in options:
meta['link'] = self.get_link(flask_context=True)
if 'icon' in options:
meta['icon'] = self.get_svg_icon()
meta['svg_icon'] = self.get_svg_icon()
if 'img' in options:
meta['img'] = self.draw_bounding_boxs()
if 'map' in options:
meta['map'] = self.get_img_map_coords()
# # TODO
# if 'language' in options:
# meta['language'] = self.get_language()
# if 'translation' in options and translation_target:
# if meta.get('language'):
# source = meta['language']
# else:
# source = None
# meta['translation'] = self.translate(content=meta.get('content'), source=source, target=translation_target)
# if 'language' in options:
# meta['language'] = self.get_language()
if 'language' in options:
meta['language'] = self.get_language()
if 'translation' in options and translation_target:
if meta.get('language'):
source = meta['language']
else:
source = None
meta['translation'] = self.translate(content=meta.get('content'), source=source, target=translation_target)
if 'language' in options:
meta['language'] = self.get_language()
return meta
def get_objs_container(self):
@ -277,3 +287,14 @@ def get_all_ocrs_objects(filters={}):
for obj_id in get_ids():
yield Ocr(obj_id)
#### API ####
def api_get_ocr(obj_id, translation_target=None):
    """
    Build the metadata of an OCR object for the API / UI.

    :param obj_id: OCR object ID
    :param translation_target: target language forwarded to Ocr.get_meta() (None: no target)
    :return: (meta dict, 200) when the object exists, otherwise (error dict, 404)
    """
    ocr = Ocr(obj_id)
    if ocr.exists():
        options = {'content', 'icon', 'img', 'language', 'link', 'map', 'translation'}
        return ocr.get_meta(options, translation_target=translation_target), 200
    return {"status": "error", "reason": "Unknown ocr"}, 404

View File

@ -14,6 +14,7 @@ from lib.ail_core import get_all_objects, get_object_all_subtypes, get_objects_w
from lib import correlations_engine
from lib import relationships_engine
from lib import btc_ail
from lib import Language
from lib import Tag
from lib import chats_viewer
@ -275,6 +276,34 @@ def get_object_card_meta(obj_type, subtype, id, related_btc=False):
meta["add_tags_modal"] = Tag.get_modal_add_tags(obj.id, obj.get_type(), obj.get_subtype(r_str=True))
return meta
#### OBJ LANGUAGES ####
def api_detect_language(obj_type, subtype, obj_id):
    """
    Run language detection on an object.

    :return: ({"language": <result of obj.detect_language()>}, 200) when the
             object exists, otherwise (error dict, 404)
    """
    target_obj = get_object(obj_type, subtype, obj_id)
    if target_obj.exists():
        return {"language": target_obj.detect_language()}, 200
    return {"status": "error", "reason": "Unknown obj"}, 404
def api_manually_translate(obj_type, subtype, obj_id, source, translation_target, translation):
    """
    Manually set the source language and, optionally, a translation of an object.

    All inputs are validated before the object is modified, so a request that
    is rejected with an error never leaves a partial update behind.

    :param obj_type: object type
    :param subtype: object subtype
    :param obj_id: object ID
    :param source: source language; must be one of Language.get_translation_languages()
    :param translation_target: target language; only checked/used when a translation is given
    :param translation: translated text (optional, at most 200000 characters)
    :return: (None, 200) on success, otherwise (error dict, 404 or 400)
    """
    obj = get_object(obj_type, subtype, obj_id)
    if not obj.exists():
        return {"status": "error", "reason": "Unknown obj"}, 404
    if translation and len(translation) > 200000:  # TODO REVIEW LIMIT
        return {"status": "error", "reason": "Max Size reached"}, 400
    all_languages = Language.get_translation_languages()
    if source not in all_languages:
        return {"status": "error", "reason": "Unknown source Language"}, 400
    # Check the target before editing the language: otherwise an unknown
    # target would return a 400 after the source language was already changed.
    if translation and translation_target not in all_languages:
        return {"status": "error", "reason": "Unknown target Language"}, 400

    obj_language = obj.get_language()
    if obj_language != source:
        obj.edit_language(obj_language, source)
    if translation:
        # TODO SANITIZE translation
        obj.set_translation(translation_target, translation)
    return None, 200
#### OBJ FILTERS ####

View File

@ -35,6 +35,7 @@ import Flask_config
from blueprints.root import root
from blueprints.crawler_splash import crawler_splash
from blueprints.correlation import correlation
from blueprints.languages_ui import languages_ui
from blueprints.tags_ui import tags_ui
from blueprints.import_export import import_export
from blueprints.investigations_b import investigations_b
@ -98,6 +99,7 @@ app.config['MAX_CONTENT_LENGTH'] = 900 * 1024 * 1024
app.register_blueprint(root, url_prefix=baseUrl)
app.register_blueprint(crawler_splash, url_prefix=baseUrl)
app.register_blueprint(correlation, url_prefix=baseUrl)
app.register_blueprint(languages_ui, url_prefix=baseUrl)
app.register_blueprint(tags_ui, url_prefix=baseUrl)
app.register_blueprint(import_export, url_prefix=baseUrl)
app.register_blueprint(investigations_b, url_prefix=baseUrl)

View File

@ -0,0 +1,83 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
Blueprint Flask: languages endpoints: manual object translation, object language detection ...
"""
import os
import sys
import json
from flask import Flask, render_template, jsonify, request, Blueprint, redirect, url_for, Response, abort
from flask_login import login_required, current_user
# Import Role_Manager
from Role_Manager import login_admin, login_analyst, login_read_only
sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from lib import ail_core
from lib import Language
from lib import Tag
from lib.objects import ail_objects
# ============ BLUEPRINT ============
# Blueprint holding the language routes; its templates are looked up under $AIL_FLASK/templates/chats_explorer.
languages_ui = Blueprint('languages_ui', __name__, template_folder=os.path.join(os.environ['AIL_FLASK'], 'templates/chats_explorer'))
# ============ VARIABLES ============
# bootstrap_label = ['primary', 'success', 'danger', 'warning', 'info']
def create_json_response(data, status_code):
    """
    Wrap data in a JSON Response.

    :param data: JSON-serializable payload (dumped with indent=2 and sorted keys)
    :param status_code: HTTP status code returned alongside the Response
    :return: (Response, status_code) tuple
    """
    body = json.dumps(data, indent=2, sort_keys=True)
    json_response = Response(body, mimetype='application/json')
    return json_response, status_code
# ============ FUNCTIONS ============
# ============= ROUTES ==============
@languages_ui.route("/languages/object/translate", methods=['POST'])
@login_required
@login_read_only
def translate_object():
    """
    Update the language and/or the manual translation of an object.

    Form fields: 'type', 'subtype', 'id', 'language_target' (the source
    language selected in the form), 'target' and 'translation'.

    On error the API error is returned as JSON. On success the user is sent
    back to the referrer when there is one, else to the OCR view for OCR
    objects, else to the generic correlation view, so the view always
    returns a response.
    """
    # NOTE(review): this route modifies data but only requires the read-only role - confirm intended.
    obj_type = request.form.get('type')
    subtype = request.form.get('subtype')
    obj_id = request.form.get('id')
    source = request.form.get('language_target')
    target = request.form.get('target')
    translation = request.form.get('translation')
    if target == "Don't Translate":
        target = None

    resp = ail_objects.api_manually_translate(obj_type, subtype, obj_id, source, target, translation)
    if resp[1] != 200:
        return create_json_response(resp[0], resp[1])

    if request.referrer:
        return redirect(request.referrer)
    if obj_type == 'ocr':
        return redirect(url_for('objects_ocr.object_ocr', id=obj_id, target=target))  # TODO change to support all objects
    # No referrer and no dedicated view: falling through would return None,
    # which Flask rejects with a 500. Use the generic correlation view.
    return redirect(url_for('correlation.show_correlation', type=obj_type, subtype=subtype, id=obj_id))
@languages_ui.route("/languages/object/detect/language", methods=['GET'])
@login_required
@login_read_only
def detect_object_language():
    """
    Run language detection on an object, then go back to a page showing it.

    Query args: 'type', 'subtype', 'id' and 'target' (the target is only
    forwarded to the OCR view).

    On error the API error is returned as JSON. On success the user is sent
    back to the referrer when there is one, else to the OCR view for OCR
    objects, else to the generic correlation view, so the view always
    returns a response.
    """
    obj_type = request.args.get('type')
    subtype = request.args.get('subtype')
    obj_id = request.args.get('id')
    target = request.args.get('target')

    resp = ail_objects.api_detect_language(obj_type, subtype, obj_id)
    if resp[1] != 200:
        return create_json_response(resp[0], resp[1])

    if request.referrer:
        return redirect(request.referrer)
    if obj_type == 'ocr':
        return redirect(url_for('objects_ocr.object_ocr', id=obj_id, target=target))  # TODO change to support all objects
    # No referrer and no dedicated view: falling through would return None,
    # which Flask rejects with a 500. Use the generic correlation view.
    return redirect(url_for('correlation.show_correlation', type=obj_type, subtype=subtype, id=obj_id))

View File

@ -5,9 +5,12 @@
Blueprint Flask: OCR objects endpoints: OCR image, OCR object view ...
'''
import json
import os
import sys
from io import BytesIO
from flask import Flask, render_template, jsonify, request, Blueprint, redirect, url_for, Response, abort, send_file, send_from_directory
from flask_login import login_required, current_user
@ -18,6 +21,8 @@ sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from lib import Language
from lib import Tag
from lib.objects import Ocrs
# ============ BLUEPRINT ============
@ -26,7 +31,8 @@ objects_ocr = Blueprint('objects_ocr', __name__, template_folder=os.path.join(os
# ============ VARIABLES ============
bootstrap_label = ['primary', 'success', 'danger', 'warning', 'info']
from io import BytesIO
def create_json_response(data, status_code):
    """
    Build a JSON Response from data.

    :param data: JSON-serializable payload (dumped with indent=2 and sorted keys)
    :param status_code: HTTP status code returned next to the Response
    :return: (Response, status_code) tuple
    """
    payload = json.dumps(data, indent=2, sort_keys=True)
    return Response(payload, mimetype='application/json'), status_code
# ============ FUNCTIONS ============
@objects_ocr.route('/ocr/<path:filename>')
@ -43,5 +49,24 @@ def ocr_image(filename):
return send_file(BytesIO(ocr.draw_bounding_boxs()), mimetype='image/png')
@objects_ocr.route("/objects/ocr", methods=['GET'])
@login_required
@login_read_only
def object_ocr():
    """
    Render the OCR object page (ShowOcr.html).

    Query args: 'id' (OCR object ID) and 'target' (translation target; the
    literal "Don't Translate" means no target).
    API errors are returned as JSON with the API status code.
    """
    obj_id = request.args.get('id')
    requested_target = request.args.get('target')
    target = None if requested_target == "Don't Translate" else requested_target

    meta, status = Ocrs.api_get_ocr(obj_id, target)
    if status != 200:
        return create_json_response(meta, status)

    languages = Language.get_translation_languages()
    tags_modal = Tag.get_modal_add_tags(meta['id'], meta['type'], meta['subtype'])
    return render_template("ShowOcr.html", meta=meta,
                           ail_tags=tags_modal,
                           translation_languages=languages, translation_target=target)
# ============= ROUTES ==============

View File

@ -67,7 +67,12 @@
{% endif %}
{% if message['images'] %}
{% for message_image in message['images'] %}
<img class="object_image mb-1" src="{{ url_for('objects_image.image', filename=message_image)}}">
<img class="object_image mb-1" src="{{ url_for('objects_image.image', filename=message_image['id'])}}">
{% if message_image['ocr'] %}
<span>
<a class="btn btn-info" target="_blank" href="{{ url_for('objects_ocr.object_ocr', id=message_image['id'])}}"><i class="fas fa-expand"></i> OCR</a>
</span>
{% endif %}
{% endfor %}
{% endif %}
{% if message['files-names'] %}

View File

@ -0,0 +1,133 @@
<!DOCTYPE html>
<html>
<head>
<title>OCR - AIL</title>
<link rel="icon" href="{{ url_for('static', filename='image/ail-icon.png') }}">
<!-- Core CSS -->
<link href="{{ url_for('static', filename='css/bootstrap4.min.css') }}" rel="stylesheet">
<link href="{{ url_for('static', filename='css/font-awesome.min.css') }}" rel="stylesheet">
<link href="{{ url_for('static', filename='css/dataTables.bootstrap.min.css') }}" rel="stylesheet">
<link href="{{ url_for('static', filename='css/tags.css') }}" rel="stylesheet" type="text/css" />
<link href="{{ url_for('static', filename='css/ail-project.css') }}" rel="stylesheet">
<!-- JS -->
<script src="{{ url_for('static', filename='js/jquery.js')}}"></script>
<script src="{{ url_for('static', filename='js/popper.min.js')}}"></script>
<script src="{{ url_for('static', filename='js/bootstrap4.min.js') }}"></script>
<script src="{{ url_for('static', filename='js/jquery.dataTables.min.js')}}"></script>
<script src="{{ url_for('static', filename='js/dataTables.bootstrap.min.js')}}"></script>
<script src="{{ url_for('static', filename='js/tags.js') }}"></script>
</head>
<body>
{% include 'nav_bar.html' %}
<div class="container-fluid">
<div class="row">
{% include 'sidebars/sidebar_objects.html' %}
<div class="col-12 col-lg-10" id="core_content">
{% with meta=meta, is_correlation=False %}
{% include 'objects/ocr/card_ocr.html' %}
{% endwith %}
{# {% if meta['extracted_matches'] %}#}
{# <div id="accordion_extracted" class="mb-3 mx-3">#}
{# <div class="card">#}
{# <div class="card-header py-1" id="heading_extracted">#}
{# <div class="row">#}
{# <div class="col-11">#}
{# <div class="mt-2">#}
{# <img id="misp-logo" src="{{ url_for('static', filename='image/ail-icon.png')}}" height="32"> Extracted&nbsp;&nbsp;#}
{# <div class="badge badge-warning">{{meta['extracted_matches']|length}}</div>#}
{# </div>#}
{# </div>#}
{# <div class="col-1">#}
{# <button class="btn btn-link btn-lg py-2 float-right rotate down" data-toggle="collapse" data-target="#collapse_extracted" aria-expanded="true" aria-controls="collapseDecoded">#}
{# <i class="fas fa-chevron-circle-down"></i>#}
{# </button>#}
{# </div>#}
{# </div>#}
{# </div>#}
{##}
{# <div id="collapse_extracted" class="collapse" aria-labelledby="heading_extracted" data-parent="#accordion_extracted">#}
{# <div class="card-body">#}
{# <table id="table_extracted" class="table table-striped">#}
{# <thead class="thead-dark">#}
{# <tr>#}
{# <th>Type</th>#}
{# <th>ID</th>#}
{# <th>Extracted</th>#}
{# </tr>#}
{# </thead>#}
{# <tbody>#}
{# {% for match in meta['extracted_matches'] %}#}
{# <tr>#}
{# <td>#}
{# <svg height="26" width="26">#}
{# <g class="nodes">#}
{# <circle cx="13" cy="13" r="13" fill="{{ meta['extracted_matches'][match]['icon']['color'] }}"></circle>#}
{# <text x="13" y="13" text-anchor="middle" dominant-baseline="central" class="graph_node_icon {{ meta['extracted_matches'][match]['icon']['style'] }}" font-size="16px">{{ meta['extracted_matches'][match]['icon']['icon'] }}</text>#}
{# </g>#}
{# </svg>#}
{# {{ meta['extracted_matches'][match]['subtype'] }}#}
{# </td>#}
{# <td>{{ meta['extracted_matches'][match]['id'] }}</td>#}
{# <td>#}
{# {% for row in meta['extracted_matches'][match]['matches'] %}#}
{# <a href="#{{ row[0] }}:{{row[1] }}">{{ row[2] }}</a><br>#}
{# {% endfor %}#}
{# </td>#}
{# </tr>#}
{# {% endfor %}#}
{# </tbody>#}
{# </table>#}
{# </div>#}
{# </div>#}
{##}
{# </div>#}
{# </div>#}
{# {% endif %}#}
{% with translate_url=url_for('objects_ocr.object_ocr', id=meta['id']), obj_id=meta['id'] %}
{% include 'chats_explorer/block_translation.html' %}
{% endwith %}
<div class="mb-4"></div>
</div>
</div>
</div>
<script>
var chart = {};
$(document).ready(function(){
$("#page-Decoded").addClass("active");
$("#nav_chat").addClass("active");
});
function toggle_sidebar(){
if($('#nav_menu').is(':visible')){
$('#nav_menu').hide();
$('#side_menu').removeClass('border-right')
$('#side_menu').removeClass('col-lg-2')
$('#core_content').removeClass('col-lg-10')
}else{
$('#nav_menu').show();
$('#side_menu').addClass('border-right')
$('#side_menu').addClass('col-lg-2')
$('#core_content').addClass('col-lg-10')
}
}
</script>
</body>
</html>

View File

@ -19,11 +19,6 @@
<ul class="list-group mb-2">
<li class="list-group-item py-0">
<table class="table">
<thead class="">
<tr>
<th></th>
</tr>
</thead>
<tbody style="font-size: 15px;">
<tr>
<td>
@ -73,6 +68,59 @@
</li>
<li class="list-group-item py-0">
<pre class="my-0" style="white-space: pre-wrap;">{{ meta['content'] }}</pre>
{% if meta['translation'] %}
<hr class="m-1">
<pre class="my-0 text-secondary" style="white-space: pre-wrap;">{{ meta['translation'] }}</pre>
{% endif %}
<div class="my-1">
{% set mess_id_escape= meta['id'] | replace("/", "_") %}
<span class="btn btn-outline-dark p-0 px-1" type="button" data-toggle="collapse" data-target="#collapseTrans{{ mess_id_escape }}" aria-expanded="false" aria-controls="collapseTrans{{ mess_id_escape }}">
<i class="fas fa-language"></i> {% if meta['language'] %}{{ meta['language'] }}{% endif %}
</span>
<div class="collapse" id="collapseTrans{{ mess_id_escape }}">
<div class="card card-body">
<form method="post" action="{{ url_for('languages_ui.translate_object') }}">
<input type="text" id="type" name="type" value="{{meta['type']}}" hidden>
<input type="text" id="id" name="id" value="{{meta['id']}}" hidden>
<span class="badge badge-primary">Source:</span>
<span class="">
<select id="language_target" name="language_target" class="form-select" aria-label="Message Language" onchange="$('#translation').val('');">
<option selected value="{{ meta['language'] }}">{{ meta['language'] }}</option>
{% for language in translation_languages %}
<option value="{{ language }}">{{ translation_languages[language] }}</option>
{% endfor %}
</select>
</span>
{% if translation_target %}
<input type="text" id="target" name="target" value="{{translation_target}}" hidden>
&nbsp;&nbsp;&nbsp;&nbsp;<span class="badge badge-primary">Target:</span><span>{{translation_target}}</span>
<textarea class="form-control" id="translation" name="translation">{{ meta['translation'] }}</textarea>
<button class="btn btn-dark" type="submit">
<i class="fas fa-pen-alt"> Update Language or Translation</i>
</button>
{% else %}
<button class="btn btn-dark" type="submit">
<i class="fas fa-pen-alt"> Update Language</i>
</button>
{% endif %}
</form>
<div>
<a class="btn btn-primary" href="{{ url_for('languages_ui.detect_object_language')}}?type={{ meta['type'] }}&id={{ meta['id'] }}">
<i class="fas fa-redo"></i> Detect Language
</a>
</div>
</div>
</div>
</div>
</li>
<li class="list-group-item py-0">
@ -99,5 +147,18 @@
<i class="fas fa-microscope"></i> Investigations
</button>
<span class="mb-2 float-right">
{% if is_correlation %}
<a href="{{ url_for('objects_ocr.object_ocr')}}?subtype={{ meta['subtype'] }}&id={{ meta['id'] }}">
<button class="btn btn-info"><i class="fas fa-expand"></i> Show Object</button>
</a>
{% else %}
<a href="{{ url_for('correlation.show_correlation')}}?type={{ meta['type'] }}&subtype={{ meta['subtype'] }}&id={{ meta['id'] }}">
<button class="btn btn-info"><i class="far fa-eye"></i> Correlations &nbsp;
</button>
</a>
{% endif %}
</span>
</div>
</div>