mirror of https://github.com/CIRCL/AIL-framework
fix: [ocr] filter ocr supported languages + fix type of object accepted by the tracker
parent
26f9e84d97
commit
35502d955f
|
@ -81,10 +81,10 @@ def get_default_correlation_objects():
|
||||||
return AIL_OBJECTS_CORRELATIONS_DEFAULT
|
return AIL_OBJECTS_CORRELATIONS_DEFAULT
|
||||||
|
|
||||||
def get_obj_queued():
|
def get_obj_queued():
|
||||||
return ['item', 'image']
|
return ['item', 'image', 'message', 'ocr']
|
||||||
|
|
||||||
def get_objects_tracked():
|
def get_objects_tracked():
|
||||||
return ['decoded', 'item', 'pgp', 'message', 'title']
|
return ['decoded', 'item', 'pgp', 'message', 'ocr', 'title']
|
||||||
|
|
||||||
def get_objects_retro_hunted():
|
def get_objects_retro_hunted():
|
||||||
return ['decoded', 'item', 'message']
|
return ['decoded', 'item', 'message']
|
||||||
|
|
|
@ -296,14 +296,24 @@ def extract_text(image_path, languages, threshold=0.2):
|
||||||
extracted.append((bbox, text))
|
extracted.append((bbox, text))
|
||||||
return extracted
|
return extracted
|
||||||
|
|
||||||
# TODO OCRS Class
|
|
||||||
|
|
||||||
def get_ids():
|
def get_ocr_languages():
|
||||||
return r_object.smembers(f'ocr:all')
|
return {'af', 'ar', 'as', 'az', 'be', 'bg', 'bh', 'bs', 'cs', 'cy', 'da', 'de', 'en', 'es', 'et', 'fa', 'fr', 'ga', 'hi', 'hr', 'hu', 'id', 'is', 'it', 'ja', 'kn', 'ko', 'ku', 'la', 'lt', 'lv', 'mi', 'mn', 'mr', 'ms', 'mt', 'ne', 'nl', 'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'ru', 'sk', 'sl', 'sq', 'sr', 'sv', 'sw', 'ta', 'te', 'th', 'tl', 'tr', 'ug', 'uk', 'ur', 'uz', 'vi', 'zh'}
|
||||||
|
|
||||||
def get_all_ocrs_objects(filters={}):
|
|
||||||
for obj_id in get_ids():
|
def sanityze_ocr_languages(languages, ocr_languages=None):
|
||||||
yield Ocr(obj_id)
|
langs = set()
|
||||||
|
if not ocr_languages:
|
||||||
|
ocr_languages = get_ocr_languages()
|
||||||
|
for lang in languages:
|
||||||
|
if lang in ocr_languages:
|
||||||
|
if lang == 'zh':
|
||||||
|
langs.add('ch_sim')
|
||||||
|
elif lang == 'sr':
|
||||||
|
langs.add('rs_latin')
|
||||||
|
else:
|
||||||
|
langs.add(lang)
|
||||||
|
return langs
|
||||||
|
|
||||||
class Ocrs(AbstractDaterangeObjects):
|
class Ocrs(AbstractDaterangeObjects):
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -22,8 +22,9 @@ from lib import chats_viewer
|
||||||
from lib.objects import Messages
|
from lib.objects import Messages
|
||||||
from lib.objects import Ocrs
|
from lib.objects import Ocrs
|
||||||
|
|
||||||
|
|
||||||
# Default to eng
|
# Default to eng
|
||||||
def get_model_languages(obj, add_en=True):
|
def get_model_languages(obj, ocr_languages, add_en=True):
|
||||||
if add_en:
|
if add_en:
|
||||||
model_languages = {'en'}
|
model_languages = {'en'}
|
||||||
else:
|
else:
|
||||||
|
@ -53,6 +54,8 @@ def get_model_languages(obj, add_en=True):
|
||||||
model_languages.add(lang)
|
model_languages.add(lang)
|
||||||
return model_languages
|
return model_languages
|
||||||
|
|
||||||
|
model_languages = Ocrs.sanityze_ocr_languages(model_languages, ocr_languages=ocr_languages)
|
||||||
|
|
||||||
return model_languages
|
return model_languages
|
||||||
|
|
||||||
# TODO thread
|
# TODO thread
|
||||||
|
@ -72,6 +75,8 @@ class OcrExtractor(AbstractModule):
|
||||||
config_loader = ConfigLoader()
|
config_loader = ConfigLoader()
|
||||||
self.r_cache = config_loader.get_redis_conn("Redis_Cache")
|
self.r_cache = config_loader.get_redis_conn("Redis_Cache")
|
||||||
|
|
||||||
|
self.ocr_languages = Ocrs.get_ocr_languages()
|
||||||
|
|
||||||
# Send module state to logs
|
# Send module state to logs
|
||||||
self.logger.info(f'Module {self.module_name} initialized')
|
self.logger.info(f'Module {self.module_name} initialized')
|
||||||
|
|
||||||
|
@ -95,7 +100,7 @@ class OcrExtractor(AbstractModule):
|
||||||
|
|
||||||
if not ocr.exists():
|
if not ocr.exists():
|
||||||
path = image.get_filepath()
|
path = image.get_filepath()
|
||||||
languages = get_model_languages(image)
|
languages = get_model_languages(image, self.ocr_languages)
|
||||||
print(image.id, languages)
|
print(image.id, languages)
|
||||||
texts = Ocrs.extract_text(path, languages)
|
texts = Ocrs.extract_text(path, languages)
|
||||||
if texts:
|
if texts:
|
||||||
|
|
|
@ -140,6 +140,10 @@
|
||||||
<input class="custom-control-input" type="checkbox" name="message_obj" id="message_obj" checked="">
|
<input class="custom-control-input" type="checkbox" name="message_obj" id="message_obj" checked="">
|
||||||
<label class="custom-control-label" for="message_obj"><i class="fas fa-comment-dots"></i> Message <i class="fas fa-info-circle text-info" data-toggle="tooltip" data-placement="right" title="Messages from Chats"></i></label>
|
<label class="custom-control-label" for="message_obj"><i class="fas fa-comment-dots"></i> Message <i class="fas fa-info-circle text-info" data-toggle="tooltip" data-placement="right" title="Messages from Chats"></i></label>
|
||||||
</div>
|
</div>
|
||||||
|
{# OCR switch (form field "ocr_obj"): text extracted from images; checked by default. #}
{# The leading icon identifies the object type; the info-circle carries the tooltip, as on the Message switch. #}
<div class="custom-control custom-switch mt-1">
  <input class="custom-control-input" type="checkbox" name="ocr_obj" id="ocr_obj" checked="">
  <label class="custom-control-label" for="ocr_obj"><i class="fas fa-expand"></i> OCR <i class="fas fa-info-circle text-info" data-toggle="tooltip" data-placement="right" title="Text extracted from Images"></i></label>
</div>
|
||||||
|
|
||||||
{# <div class="custom-control custom-switch mt-1">#}
|
{# <div class="custom-control custom-switch mt-1">#}
|
||||||
{# <input class="custom-control-input" type="checkbox" name="level" id="screenshot_obj" checked="">#}
|
{# <input class="custom-control-input" type="checkbox" name="level" id="screenshot_obj" checked="">#}
|
||||||
|
|
Loading…
Reference in New Issue