mirror of https://github.com/CIRCL/AIL-framework
chg: [ocr] get languages model + group extracted content by line + process ocr objects + get all images
parent
61701e2fcc
commit
ed13e8bca4
|
@ -330,6 +330,11 @@ def get_obj_languages(obj_type, obj_subtype, obj_id):
|
|||
def get_obj_language_stats(obj_type, obj_subtype, obj_id):
    """
    Return all language entries recorded for an object, together with their scores.

    Reads the whole range (0 to -1) of the sorted set
    ``obj:langs:stat:<type>:<subtype>:<id>`` from ``r_lang`` with ``withscores=True``.
    """
    stats_key = f'obj:langs:stat:{obj_type}:{obj_subtype}:{obj_id}'
    return r_lang.zrange(stats_key, 0, -1, withscores=True)
|
||||
|
||||
def get_obj_main_language(obj_type, obj_subtype, obj_id):
    """
    Return the top entry of the object's language statistics, or None.

    Asks ``r_lang`` for the first element (range 0..0) of the reversed sorted set
    ``obj:langs:stat:<type>:<subtype>:<id>``.
    NOTE(review): presumably r_lang is a Redis client, in which case this is the
    language with the highest score - confirm.
    """
    stats_key = f'obj:langs:stat:{obj_type}:{obj_subtype}:{obj_id}'
    top_languages = r_lang.zrevrange(stats_key, 0, 0)
    if not top_languages:
        # No language recorded for this object
        return None
    return top_languages[0]
|
||||
|
||||
# TODO ADD language to CHAT GLOBAL SET
|
||||
def add_obj_language(language, obj_type, obj_subtype, obj_id, objs_containers=set()): # (s)
|
||||
if not obj_subtype:
|
||||
|
|
|
@ -288,6 +288,10 @@ def get_obj_chat(chat_type, chat_subtype, chat_id):
|
|||
elif chat_type == 'chat-thread':
|
||||
return ChatThreads.ChatThread(chat_id, chat_subtype)
|
||||
|
||||
def get_obj_chat_from_global_id(chat_gid):
    """
    Build a chat object from its global ID.

    The global ID is split on the first two ':' into type, subtype and ID
    (the ID part may itself contain ':'), then handed to get_obj_chat().
    Raises ValueError when the global ID has fewer than three fields.
    """
    fields = chat_gid.split(':', 2)
    obj_type, obj_subtype, obj_id = fields
    return get_obj_chat(obj_type, obj_subtype, obj_id)
|
||||
|
||||
def get_obj_chat_meta(obj_chat, new_options=set()):
|
||||
options = {}
|
||||
if obj_chat.type == 'chat':
|
||||
|
|
|
@ -50,7 +50,7 @@ class Image(AbstractDaterangeObject):
|
|||
if flask_context:
|
||||
url = url_for('correlation.show_correlation', type=self.type, id=self.id)
|
||||
else:
|
||||
url = f'{baseurl}/correlation/show?type={self.type}&id={self.id}'
|
||||
url = f'/correlation/show?type={self.type}&id={self.id}'
|
||||
return url
|
||||
|
||||
def get_svg_icon(self):
|
||||
|
@ -109,6 +109,20 @@ class Image(AbstractDaterangeObject):
|
|||
def get_screenshot_dir():
    """Return the module-level IMAGE_FOLDER value used as the image storage directory."""
    return IMAGE_FOLDER
|
||||
|
||||
def get_all_images():
    """
    List the IDs of every image file found under the screenshot directory.

    The ID is rebuilt from the file path: the IMAGE_FOLDER prefix is removed
    and every '/' is dropped, so the nested sub-directory names and the file
    name are concatenated back into a single string.
    """
    # root and file name are joined without a separator on purpose-neutral
    # grounds: all '/' are stripped right after, so the result is the same.
    return [
        f'{folder}{filename}'.replace(IMAGE_FOLDER, '').replace('/', '')
        for folder, _subdirs, filenames in os.walk(get_screenshot_dir())
        for filename in filenames
    ]
|
||||
|
||||
|
||||
def get_all_images_objects(filters={}):
    """
    Lazily yield an Image object for every ID returned by get_all_images().

    ``filters`` is accepted for signature parity with the other object
    iterators but is not used here.
    """
    yield from map(Image, get_all_images())
|
||||
|
||||
|
||||
def create(content, size_limit=5000000, b64=False, force=False):
|
||||
size = (len(content)*3) / 4
|
||||
|
@ -134,5 +148,6 @@ class Images(AbstractDaterangeObjects):
|
|||
|
||||
|
||||
# if __name__ == '__main__':
|
||||
# print(json.dumps(get_all_images()))
|
||||
# name_to_search = '29ba'
|
||||
# print(search_screenshots_by_name(name_to_search))
|
||||
|
|
|
@ -17,6 +17,7 @@ sys.path.append(os.environ['AIL_BIN'])
|
|||
##################################
|
||||
from lib.objects.abstract_object import AbstractObject
|
||||
from lib.ConfigLoader import ConfigLoader
|
||||
from packages import Date
|
||||
# from lib import Language
|
||||
# from lib.data_retention_engine import update_obj_date, get_obj_date_first
|
||||
|
||||
|
@ -49,10 +50,24 @@ class Ocr(AbstractObject):
|
|||
global_id = self.get_global_id()
|
||||
content = r_cache.get(f'content:{global_id}')
|
||||
if not content:
|
||||
content = ''
|
||||
dict_content = {}
|
||||
for extracted in r_object.smembers(f'ocr:{self.id}'):
|
||||
text = extracted.split(':', 4)[-1]
|
||||
content = f'{content}\n{text}'
|
||||
extracted = extracted.split(':', 4)
|
||||
x, y = extracted[0].split(',', 1)
|
||||
# get text line, y +- 20
|
||||
rounded_y = round(int(y) / 20) * 20
|
||||
if rounded_y not in dict_content:
|
||||
dict_content[rounded_y] = []
|
||||
dict_content[rounded_y].append((int(x), int(y), extracted[-1]))
|
||||
|
||||
content = ''
|
||||
l_key = sorted(dict_content.keys())
|
||||
for key in l_key:
|
||||
dict_content[key] = sorted(dict_content[key], key=lambda c: c[0])
|
||||
for text in dict_content[key]:
|
||||
content = f'{content} {text[2]}'
|
||||
content = f'{content}\n'
|
||||
|
||||
# Set Cache
|
||||
if content:
|
||||
global_id = self.get_global_id()
|
||||
|
@ -66,8 +81,18 @@ class Ocr(AbstractObject):
|
|||
return content.encode()
|
||||
|
||||
def get_date(self): # TODO
|
||||
timestamp = self.get_timestamp()
|
||||
return datetime.utcfromtimestamp(float(timestamp)).strftime('%Y%m%d')
|
||||
return Date.get_today_date_str()
|
||||
|
||||
def get_source(self): # TODO
|
||||
"""
|
||||
Returns source/feeder name
|
||||
"""
|
||||
return 'ocr'
|
||||
# l_source = self.id.split('/')[:-2]
|
||||
# return os.path.join(*l_source)
|
||||
|
||||
def get_basename(self): # TODO
|
||||
return 'ocr'
|
||||
|
||||
def get_link(self, flask_context=False):
|
||||
if flask_context:
|
||||
|
@ -77,7 +102,7 @@ class Ocr(AbstractObject):
|
|||
return url
|
||||
|
||||
def get_svg_icon(self):
|
||||
return {'style': 'fas', 'icon': '\uf20a', 'color': 'yellow', 'radius': 5}
|
||||
return {'style': 'fas', 'icon': '\uf065', 'color': 'yellow', 'radius': 5}
|
||||
|
||||
def get_image_path(self):
|
||||
rel_path = os.path.join(self.id[0:2], self.id[2:4], self.id[4:6], self.id[6:8], self.id[8:10], self.id[10:12], self.id[12:])
|
||||
|
@ -138,18 +163,17 @@ class Ocr(AbstractObject):
|
|||
# meta['language'] = self.get_language()
|
||||
return meta
|
||||
|
||||
def get_objs_container(self): # TODO
|
||||
pass
|
||||
# objs_containers = set()
|
||||
# # chat
|
||||
# objs_containers.add(self.get_chat())
|
||||
# subchannel = self.get_subchannel()
|
||||
# if subchannel:
|
||||
# objs_containers.add(subchannel)
|
||||
# thread = self.get_current_thread()
|
||||
# if thread:
|
||||
# objs_containers.add(thread)
|
||||
# return objs_containers
|
||||
def get_objs_container(self):
    """
    Collect the global IDs of the chat containers correlated with this object.

    Looks up the first correlated 'chat', 'chat-subchannel' and 'chat-thread'
    (in that order) through get_first_correlation() and returns the ones that
    exist as a set of '<type>:<correlation value>' strings.

    A missing correlation is skipped for every container type, including
    'chat': get_first_correlation() returns None in that case and None must
    not end up in the returned set.
    """
    objs_containers = set()
    for container_type in ('chat', 'chat-subchannel', 'chat-thread'):
        container_gid = self.get_first_correlation(container_type)
        if container_gid:
            objs_containers.add(container_gid)
    return objs_containers
|
||||
|
||||
def create_coord_str(self, bbox):
|
||||
c1, c2, c3, c4 = bbox
|
||||
|
@ -195,18 +219,20 @@ class Ocr(AbstractObject):
|
|||
return r_object.srem(f'ocr:{self.id}', val)
|
||||
|
||||
def create(self, extracted_texts, tags=[]):
|
||||
r_object.sadd(f'{self.type}:all', self.id)
|
||||
for extracted in extracted_texts:
|
||||
bbox, text = extracted
|
||||
str_coords = self.create_coord_str(bbox)
|
||||
self.add(str_coords, text)
|
||||
self.add_correlation('image', '', self.id)
|
||||
if len(text) > 1:
|
||||
str_coords = self.create_coord_str(bbox)
|
||||
self.add(str_coords, text)
|
||||
self.add_correlation('image', '', self.id)
|
||||
|
||||
for tag in tags:
|
||||
self.add_tag(tag)
|
||||
|
||||
# # WARNING: UNCLEAN DELETE /!\ TEST ONLY /!\
|
||||
def delete(self):
|
||||
pass
|
||||
r_object.delete(f'ocr:{self.id}')
|
||||
|
||||
def draw_bounding_boxs(self):
|
||||
img = Image.open(self.get_image_path()).convert("RGBA")
|
||||
|
@ -233,8 +259,9 @@ def create(obj_id, detections, tags=[]):
|
|||
# TODO preload languages
|
||||
def extract_text(image_path, languages, threshold=0.2):
|
||||
import easyocr
|
||||
reader = easyocr.Reader(languages)
|
||||
reader = easyocr.Reader(languages, verbose=False)
|
||||
texts = reader.readtext(image_path)
|
||||
# print(texts)
|
||||
extracted = []
|
||||
for bbox, text, score in texts:
|
||||
if score > threshold:
|
||||
|
@ -242,3 +269,11 @@ def extract_text(image_path, languages, threshold=0.2):
|
|||
return extracted
|
||||
|
||||
# TODO OCRS Class
|
||||
|
||||
def get_ids():
    """Return the members of the 'ocr:all' set read from r_object."""
    all_ocrs_key = 'ocr:all'
    return r_object.smembers(all_ocrs_key)
|
||||
|
||||
def get_all_ocrs_objects(filters={}):
    """
    Lazily yield an Ocr object for every ID returned by get_ids().

    ``filters`` is accepted for signature parity with the other object
    iterators but is not used here.
    """
    yield from (Ocr(ocr_id) for ocr_id in get_ids())
|
||||
|
||||
|
|
|
@ -25,7 +25,7 @@ from lib import Duplicate
|
|||
from lib.correlations_engine import get_nb_correlations, get_correlations, add_obj_correlation, delete_obj_correlation, delete_obj_correlations, exists_obj_correlation, is_obj_correlated, get_nb_correlation_by_correl_type, get_obj_inter_correlation
|
||||
from lib.Investigations import is_object_investigated, get_obj_investigations, delete_obj_investigations
|
||||
from lib.relationships_engine import get_obj_nb_relationships, add_obj_relationship
|
||||
from lib.Language import get_obj_languages, add_obj_language, remove_obj_language, detect_obj_language, get_obj_language_stats, get_obj_translation, set_obj_translation, delete_obj_translation
|
||||
from lib.Language import get_obj_languages, add_obj_language, remove_obj_language, detect_obj_language, get_obj_language_stats, get_obj_translation, set_obj_translation, delete_obj_translation, get_obj_main_language
|
||||
from lib.Tracker import is_obj_tracked, get_obj_trackers, delete_obj_trackers
|
||||
|
||||
logging.config.dictConfig(ail_logger.get_config(name='ail'))
|
||||
|
@ -237,6 +237,11 @@ class AbstractObject(ABC):
|
|||
"""
|
||||
return get_correlations(self.type, self.subtype, self.id, filter_types=[obj_type])
|
||||
|
||||
def get_first_correlation(self, obj_type):
    """
    Return one correlated object of the given type as '<obj_type>:<value>'.

    The value is pop()'ed from the collection returned by get_correlation()
    for that type; None is returned when nothing is correlated.
    NOTE(review): if that collection is a set, which element is picked is
    arbitrary - confirm callers do not rely on a specific one.
    """
    correlated = self.get_correlation(obj_type).get(obj_type)
    if not correlated:
        return None
    return f'{obj_type}:{correlated.pop()}'
|
||||
|
||||
def get_correlations(self, filter_types=[], unpack=False):
|
||||
"""
|
||||
Get object correlations
|
||||
|
@ -330,6 +335,9 @@ class AbstractObject(ABC):
|
|||
def get_obj_language_stats(self):
    """Return the language statistics of this object (delegates to the module-level get_obj_language_stats)."""
    obj_type = self.type
    obj_subtype = self.get_subtype(r_str=True)
    return get_obj_language_stats(obj_type, obj_subtype, self.id)
|
||||
|
||||
def get_main_language(self):
    """Return the main language of this object as given by get_obj_main_language(), which may be None."""
    obj_type = self.type
    obj_subtype = self.get_subtype(r_str=True)
    return get_obj_main_language(obj_type, obj_subtype, self.id)
|
||||
|
||||
def get_translation(self, language, field=''):
|
||||
return get_obj_translation(self.get_global_id(), language, field=field, objs_containers=self.get_objs_container())
|
||||
|
||||
|
|
|
@ -296,6 +296,8 @@ def is_filtered(obj, filters):
|
|||
def obj_iterator(obj_type, filters):
|
||||
if obj_type == 'decoded':
|
||||
return get_all_decodeds_objects(filters=filters)
|
||||
elif obj_type == 'image':
|
||||
return Images.get_all_images_objects(filters=filters)
|
||||
elif obj_type == 'item':
|
||||
return get_all_items_objects(filters=filters)
|
||||
elif obj_type == 'pgp':
|
||||
|
|
|
@ -128,7 +128,7 @@ class Global(AbstractModule):
|
|||
|
||||
else:
|
||||
self.logger.info(f"Empty Item: {message} not processed")
|
||||
elif self.obj.type == 'message':
|
||||
elif self.obj.type == 'message' or self.obj.type == 'ocr':
|
||||
# TODO send to specific object queue => image, ...
|
||||
self.add_message_to_queue(obj=self.obj, queue='Item')
|
||||
elif self.obj.type == 'image':
|
||||
|
|
|
@ -17,8 +17,45 @@ sys.path.append(os.environ['AIL_BIN'])
|
|||
# Import Project packages
|
||||
##################################
|
||||
from modules.abstract_module import AbstractModule
|
||||
from lib import chats_viewer
|
||||
from lib.objects import Messages
|
||||
from lib.objects import Ocrs
|
||||
|
||||
# Default to eng
|
||||
def get_model_languages(obj, add_en=True):
    """
    Choose the languages to load in the OCR model for an object.

    The result starts as {'en'} (or an empty set when add_en is False) and is
    extended with at most one extra language, taken from the first source
    that provides one:
      1. the language of the first correlated message,
      2. the main language of the first correlated chat-subchannel,
      3. the main language of the first correlated chat.

    :param obj: object exposing get_first_correlation(obj_type)
    :param add_en: include 'en' in the returned set
    :return: set of language codes
    """
    model_languages = {'en'} if add_en else set()

    # 1. Correlated message: the message ID is the last field of the global ID
    message_gid = obj.get_first_correlation('message')
    if message_gid:
        message_id = message_gid.split(':', 2)[-1]
        lang = Messages.Message(message_id).get_language()
        if lang:
            model_languages.add(lang)
            return model_languages

    # 2. and 3. Chat containers, most specific first
    for container_type in ('chat-subchannel', 'chat'):
        container_gid = obj.get_first_correlation(container_type)
        if not container_gid:
            continue
        container = chats_viewer.get_obj_chat_from_global_id(container_gid)
        lang = container.get_main_language()
        if lang:
            model_languages.add(lang)
            return model_languages

    return model_languages
|
||||
|
||||
# TODO thread
|
||||
|
||||
|
||||
class OcrExtractor(AbstractModule):
|
||||
"""
|
||||
|
@ -36,16 +73,16 @@ class OcrExtractor(AbstractModule):
|
|||
|
||||
def compute(self, message):
|
||||
image = self.get_obj()
|
||||
print(image)
|
||||
path = image.get_filepath()
|
||||
languages = ['en', 'ru']
|
||||
print(image)
|
||||
|
||||
languages = get_model_languages(image)
|
||||
print(languages)
|
||||
|
||||
ocr = Ocrs.Ocr(image.id)
|
||||
ocr.delete()
|
||||
if not ocr.exists():
|
||||
# TODO Get Language to extract -> add en by default
|
||||
|
||||
texts = Ocrs.extract_text(path, languages)
|
||||
print(texts)
|
||||
if texts:
|
||||
ocr = Ocrs.create(image.id, texts)
|
||||
self.add_message_to_queue(ocr)
|
||||
|
@ -55,3 +92,6 @@ if __name__ == '__main__':
|
|||
|
||||
module = OcrExtractor()
|
||||
module.run()
|
||||
# from lib.objects import Images
|
||||
# module.obj = Images.Image('')
|
||||
# module.compute('')
|
||||
|
|
|
@ -62,7 +62,7 @@ if __name__ == "__main__":
|
|||
obj_type = args.type
|
||||
if not is_object_type(obj_type):
|
||||
raise Exception(f'Invalid Object Type: {obj_type}')
|
||||
if obj_type not in ['item', 'message']: # TODO image
|
||||
if obj_type not in ['image', 'item', 'message']:
|
||||
raise Exception(f'Currently not supported Object Type: {obj_type}')
|
||||
|
||||
modulename = args.module
|
||||
|
|
|
@ -92,7 +92,7 @@
|
|||
</li>
|
||||
</ul>
|
||||
|
||||
{% with obj_type='image', obj_id=meta['id'], obj_subtype='' %}
|
||||
{% with obj_type='ocr', obj_id=meta['id'], obj_subtype='' %}
|
||||
{% include 'modals/investigations_register_obj.html' %}
|
||||
{% endwith %}
|
||||
<button type="button" class="btn btn-primary" data-toggle="modal" data-target="#investigations_register_obj_modal">
|
||||
|
|
Loading…
Reference in New Issue