From ed13e8bca4c1ff91e0e06cef9fa8b3e4573d4c7f Mon Sep 17 00:00:00 2001 From: terrtia Date: Wed, 10 Apr 2024 16:43:54 +0200 Subject: [PATCH] chg: [ocr] get languages model + group extracted content by line + process ocr objects + get all images --- bin/lib/Language.py | 5 ++ bin/lib/chats_viewer.py | 4 + bin/lib/objects/Images.py | 17 ++++- bin/lib/objects/Ocrs.py | 81 +++++++++++++++------ bin/lib/objects/abstract_object.py | 10 ++- bin/lib/objects/ail_objects.py | 2 + bin/modules/Global.py | 2 +- bin/modules/OcrExtractor.py | 50 +++++++++++-- tools/reprocess_objects.py | 2 +- var/www/templates/objects/ocr/card_ocr.html | 2 +- 10 files changed, 142 insertions(+), 33 deletions(-) diff --git a/bin/lib/Language.py b/bin/lib/Language.py index 4a840f02..6803cf51 100755 --- a/bin/lib/Language.py +++ b/bin/lib/Language.py @@ -330,6 +330,11 @@ def get_obj_languages(obj_type, obj_subtype, obj_id): def get_obj_language_stats(obj_type, obj_subtype, obj_id): return r_lang.zrange(f'obj:langs:stat:{obj_type}:{obj_subtype}:{obj_id}', 0, -1, withscores=True) +def get_obj_main_language(obj_type, obj_subtype, obj_id): + language = r_lang.zrevrange(f'obj:langs:stat:{obj_type}:{obj_subtype}:{obj_id}', 0, 0) + if language: + return language[0] + # TODO ADD language to CHAT GLOBAL SET def add_obj_language(language, obj_type, obj_subtype, obj_id, objs_containers=set()): # (s) if not obj_subtype: diff --git a/bin/lib/chats_viewer.py b/bin/lib/chats_viewer.py index 3554b09f..8e47ef8b 100755 --- a/bin/lib/chats_viewer.py +++ b/bin/lib/chats_viewer.py @@ -288,6 +288,10 @@ def get_obj_chat(chat_type, chat_subtype, chat_id): elif chat_type == 'chat-thread': return ChatThreads.ChatThread(chat_id, chat_subtype) +def get_obj_chat_from_global_id(chat_gid): + chat_type, chat_subtype, chat_id = chat_gid.split(':', 2) + return get_obj_chat(chat_type, chat_subtype, chat_id) + def get_obj_chat_meta(obj_chat, new_options=set()): options = {} if obj_chat.type == 'chat': diff --git a/bin/lib/objects/Images.py b/bin/lib/objects/Images.py index 9d71b7d5..391a2431 100755 --- a/bin/lib/objects/Images.py +++ b/bin/lib/objects/Images.py @@ -50,7 +50,7 @@ class Image(AbstractDaterangeObject): if flask_context: url = url_for('correlation.show_correlation', type=self.type, id=self.id) else: - url = f'{baseurl}/correlation/show?type={self.type}&id={self.id}' + url = f'/correlation/show?type={self.type}&id={self.id}' return url def get_svg_icon(self): @@ -109,6 +109,20 @@ class Image(AbstractDaterangeObject): def get_screenshot_dir(): return IMAGE_FOLDER +def get_all_images(): + images = [] + for root, dirs, files in os.walk(get_screenshot_dir()): + for file in files: + path = f'{root}{file}' + image_id = path.replace(IMAGE_FOLDER, '').replace('/', '') + images.append(image_id) + return images + + +def get_all_images_objects(filters={}): + for image_id in get_all_images(): + yield Image(image_id) + def create(content, size_limit=5000000, b64=False, force=False): size = (len(content)*3) / 4 @@ -134,5 +148,6 @@ class Images(AbstractDaterangeObjects): # if __name__ == '__main__': +# print(json.dumps(get_all_images())) # name_to_search = '29ba' # print(search_screenshots_by_name(name_to_search)) diff --git a/bin/lib/objects/Ocrs.py b/bin/lib/objects/Ocrs.py index 5dc0edab..3df745b3 100755 --- a/bin/lib/objects/Ocrs.py +++ b/bin/lib/objects/Ocrs.py @@ -17,6 +17,7 @@ sys.path.append(os.environ['AIL_BIN']) ################################## from lib.objects.abstract_object import AbstractObject from lib.ConfigLoader import ConfigLoader +from packages import Date # from lib import Language # from lib.data_retention_engine import update_obj_date, get_obj_date_first @@ -49,10 +50,24 @@ class Ocr(AbstractObject): global_id = self.get_global_id() content = r_cache.get(f'content:{global_id}') if not content: - content = '' + dict_content = {} for extracted in r_object.smembers(f'ocr:{self.id}'): - text = extracted.split(':', 4)[-1] - content = f'{content}\n{text}' + extracted = extracted.split(':', 4) + x, y = extracted[0].split(',', 1) + # get text line, y +- 20 + rounded_y = round(int(y) / 20) * 20 + if rounded_y not in dict_content: + dict_content[rounded_y] = [] + dict_content[rounded_y].append((int(x), int(y), extracted[-1])) + + content = '' + l_key = sorted(dict_content.keys()) + for key in l_key: + dict_content[key] = sorted(dict_content[key], key=lambda c: c[0]) + for text in dict_content[key]: + content = f'{content} {text[2]}' + content = f'{content}\n' + # Set Cache if content: global_id = self.get_global_id() @@ -66,8 +81,18 @@ class Ocr(AbstractObject): return content.encode() def get_date(self): # TODO - timestamp = self.get_timestamp() - return datetime.utcfromtimestamp(float(timestamp)).strftime('%Y%m%d') + return Date.get_today_date_str() + + def get_source(self): # TODO + """ + Returns source/feeder name + """ + return 'ocr' + # l_source = self.id.split('/')[:-2] + # return os.path.join(*l_source) + + def get_basename(self): # TODO + return 'ocr' def get_link(self, flask_context=False): if flask_context: @@ -77,7 +102,7 @@ class Ocr(AbstractObject): return url def get_svg_icon(self): - return {'style': 'fas', 'icon': '\uf20a', 'color': 'yellow', 'radius': 5} + return {'style': 'fas', 'icon': '\uf065', 'color': 'yellow', 'radius': 5} def get_image_path(self): rel_path = os.path.join(self.id[0:2], self.id[2:4], self.id[4:6], self.id[6:8], self.id[8:10], self.id[10:12], self.id[12:]) @@ -138,18 +163,17 @@ class Ocr(AbstractObject): # meta['language'] = self.get_language() return meta - def get_objs_container(self): # TODO - pass - # objs_containers = set() - # # chat - # objs_containers.add(self.get_chat()) - # subchannel = self.get_subchannel() - # if subchannel: - # objs_containers.add(subchannel) - # thread = self.get_current_thread() - # if thread: - # objs_containers.add(thread) - # return objs_containers + def get_objs_container(self): + objs_containers = set() + # chat + objs_containers.add(self.get_first_correlation('chat')) + subchannel = self.get_first_correlation('chat-subchannel') + if subchannel: + objs_containers.add(subchannel) + thread = self.get_first_correlation('chat-thread') + if thread: + objs_containers.add(thread) + return objs_containers def create_coord_str(self, bbox): c1, c2, c3, c4 = bbox @@ -195,18 +219,20 @@ class Ocr(AbstractObject): return r_object.srem(f'ocr:{self.id}', val) def create(self, extracted_texts, tags=[]): + r_object.sadd(f'{self.type}:all', self.id) for extracted in extracted_texts: bbox, text = extracted - str_coords = self.create_coord_str(bbox) - self.add(str_coords, text) - self.add_correlation('image', '', self.id) + if len(text) > 1: + str_coords = self.create_coord_str(bbox) + self.add(str_coords, text) + self.add_correlation('image', '', self.id) for tag in tags: self.add_tag(tag) # # WARNING: UNCLEAN DELETE /!\ TEST ONLY /!\ def delete(self): - pass + r_object.delete(f'ocr:{self.id}') def draw_bounding_boxs(self): img = Image.open(self.get_image_path()).convert("RGBA") @@ -233,8 +259,9 @@ def create(obj_id, detections, tags=[]): # TODO preload languages def extract_text(image_path, languages, threshold=0.2): import easyocr - reader = easyocr.Reader(languages) + reader = easyocr.Reader(languages, verbose=False) texts = reader.readtext(image_path) + # print(texts) extracted = [] for bbox, text, score in texts: if score > threshold: @@ -242,3 +269,11 @@ def extract_text(image_path, languages, threshold=0.2): return extracted # TODO OCRS Class + +def get_ids(): + return r_object.smembers(f'ocr:all') + +def get_all_ocrs_objects(filters={}): + for obj_id in get_ids(): + yield Ocr(obj_id) + diff --git a/bin/lib/objects/abstract_object.py b/bin/lib/objects/abstract_object.py index 192d8033..4548ba9f 100755 --- a/bin/lib/objects/abstract_object.py +++ b/bin/lib/objects/abstract_object.py @@ -25,7 +25,7 @@ from lib import Duplicate from lib.correlations_engine import get_nb_correlations, get_correlations, add_obj_correlation, delete_obj_correlation, delete_obj_correlations, exists_obj_correlation, is_obj_correlated, get_nb_correlation_by_correl_type, get_obj_inter_correlation from lib.Investigations import is_object_investigated, get_obj_investigations, delete_obj_investigations from lib.relationships_engine import get_obj_nb_relationships, add_obj_relationship -from lib.Language import get_obj_languages, add_obj_language, remove_obj_language, detect_obj_language, get_obj_language_stats, get_obj_translation, set_obj_translation, delete_obj_translation +from lib.Language import get_obj_languages, add_obj_language, remove_obj_language, detect_obj_language, get_obj_language_stats, get_obj_translation, set_obj_translation, delete_obj_translation, get_obj_main_language from lib.Tracker import is_obj_tracked, get_obj_trackers, delete_obj_trackers logging.config.dictConfig(ail_logger.get_config(name='ail')) @@ -237,6 +237,11 @@ class AbstractObject(ABC): """ return get_correlations(self.type, self.subtype, self.id, filter_types=[obj_type]) + def get_first_correlation(self, obj_type): + correlation = self.get_correlation(obj_type) + if correlation.get(obj_type): + return f'{obj_type}:{correlation[obj_type].pop()}' + def get_correlations(self, filter_types=[], unpack=False): """ Get object correlations @@ -330,6 +335,9 @@ class AbstractObject(ABC): def get_obj_language_stats(self): return get_obj_language_stats(self.type, self.get_subtype(r_str=True), self.id) + def get_main_language(self): + return get_obj_main_language(self.type, self.get_subtype(r_str=True), self.id) + def get_translation(self, language, field=''): return get_obj_translation(self.get_global_id(), language, field=field, objs_containers=self.get_objs_container()) diff --git a/bin/lib/objects/ail_objects.py b/bin/lib/objects/ail_objects.py index 59b632e0..0d55f20d 100755 --- a/bin/lib/objects/ail_objects.py +++ b/bin/lib/objects/ail_objects.py @@ -296,6 +296,8 @@ def is_filtered(obj, filters): def obj_iterator(obj_type, filters): if obj_type == 'decoded': return get_all_decodeds_objects(filters=filters) + elif obj_type == 'image': + return Images.get_all_images_objects(filters=filters) elif obj_type == 'item': return get_all_items_objects(filters=filters) elif obj_type == 'pgp': diff --git a/bin/modules/Global.py b/bin/modules/Global.py index 763a7b89..f442a226 100755 --- a/bin/modules/Global.py +++ b/bin/modules/Global.py @@ -128,7 +128,7 @@ class Global(AbstractModule): else: self.logger.info(f"Empty Item: {message} not processed") - elif self.obj.type == 'message': + elif self.obj.type == 'message' or self.obj.type == 'ocr': # TODO send to specific object queue => image, ... self.add_message_to_queue(obj=self.obj, queue='Item') elif self.obj.type == 'image': diff --git a/bin/modules/OcrExtractor.py b/bin/modules/OcrExtractor.py index 36fb6e8b..85df401f 100755 --- a/bin/modules/OcrExtractor.py +++ b/bin/modules/OcrExtractor.py @@ -17,8 +17,45 @@ sys.path.append(os.environ['AIL_BIN']) # Import Project packages ################################## from modules.abstract_module import AbstractModule +from lib import chats_viewer +from lib.objects import Messages from lib.objects import Ocrs +# Default to eng +def get_model_languages(obj, add_en=True): + if add_en: + model_languages = {'en'} + else: + model_languages = set() + + ob = obj.get_first_correlation('message') + if ob: + message = Messages.Message(ob.split(':', 2)[-1]) + lang = message.get_language() + if lang: + model_languages.add(lang) + return model_languages + + ob = obj.get_first_correlation('chat-subchannel') + if ob: + ob = chats_viewer.get_obj_chat_from_global_id(ob) + lang = ob.get_main_language() + if lang: + model_languages.add(lang) + return model_languages + + ob = obj.get_first_correlation('chat') + if ob: + ob = chats_viewer.get_obj_chat_from_global_id(ob) + lang = ob.get_main_language() + if lang: + model_languages.add(lang) + return model_languages + + return model_languages + + # TODO thread + class OcrExtractor(AbstractModule): """ @@ -36,16 +73,16 @@ class OcrExtractor(AbstractModule): def compute(self, message): image = self.get_obj() - print(image) path = image.get_filepath() - languages = ['en', 'ru'] + print(image) + + languages = get_model_languages(image) + print(languages) ocr = Ocrs.Ocr(image.id) + ocr.delete() if not ocr.exists(): - # TODO Get Language to extract -> add en by default - texts = Ocrs.extract_text(path, languages) - print(texts) if texts: ocr = Ocrs.create(image.id, texts) self.add_message_to_queue(ocr) @@ -55,3 +92,6 @@ if __name__ == '__main__': module = OcrExtractor() module.run() + # from lib.objects import Images + # module.obj = Images.Image('') + # module.compute('') diff --git a/tools/reprocess_objects.py b/tools/reprocess_objects.py index 678cf989..a832487a 100755 --- a/tools/reprocess_objects.py +++ b/tools/reprocess_objects.py @@ -62,7 +62,7 @@ if __name__ == "__main__": obj_type = args.type if not is_object_type(obj_type): raise Exception(f'Invalid Object Type: {obj_type}') - if obj_type not in ['item', 'message']: # TODO image + if obj_type not in ['image', 'item', 'message']: raise Exception(f'Currently not supported Object Type: {obj_type}') modulename = args.module diff --git a/var/www/templates/objects/ocr/card_ocr.html b/var/www/templates/objects/ocr/card_ocr.html index 216b27bf..4b9f787d 100644 --- a/var/www/templates/objects/ocr/card_ocr.html +++ b/var/www/templates/objects/ocr/card_ocr.html @@ -92,7 +92,7 @@ - {% with obj_type='image', obj_id=meta['id'], obj_subtype='' %} + {% with obj_type='ocr', obj_id=meta['id'], obj_subtype='' %} {% include 'modals/investigations_register_obj.html' %} {% endwith %}