diff --git a/bin/lib/objects/Images.py b/bin/lib/objects/Images.py
index 391a2431..0e8e1d7a 100755
--- a/bin/lib/objects/Images.py
+++ b/bin/lib/objects/Images.py
@@ -2,6 +2,7 @@
 # -*-coding:UTF-8 -*
 
 import base64
+import magic
 import os
 import sys
 
@@ -64,6 +65,17 @@ class Image(AbstractDaterangeObject):
         filename = os.path.join(IMAGE_FOLDER, self.get_rel_path())
         return os.path.realpath(filename)
 
+    def is_gif(self, filepath=None):
+        """Return True when the file's detected MIME type is 'image/gif'.
+
+        :param filepath: path of the file to inspect; when falsy, the path
+            returned by self.get_filepath() is used instead.
+        :return: True if magic.from_file() reports 'image/gif', else False.
+        """
+        if not filepath:
+            filepath = self.get_filepath()
+        return magic.from_file(filepath, mime=True) == 'image/gif'
+
     def get_file_content(self):
         filepath = self.get_filepath()
         with open(filepath, 'rb') as f:
diff --git a/bin/modules/OcrExtractor.py b/bin/modules/OcrExtractor.py
index deb732da..266d65a3 100755
--- a/bin/modules/OcrExtractor.py
+++ b/bin/modules/OcrExtractor.py
@@ -89,6 +89,10 @@ class OcrExtractor(AbstractModule):
         if self.is_cached():
             return None
 
+        if self.obj.is_gif():
+            self.logger.warning(f'Ignoring GIF: {self.obj.id}')
+            return None
+
         if not ocr.exists():
             path = image.get_filepath()
             languages = get_model_languages(image)
diff --git a/tools/reprocess_objects.py b/tools/reprocess_objects.py
index a832487a..b41f59e3 100755
--- a/tools/reprocess_objects.py
+++ b/tools/reprocess_objects.py
@@ -30,15 +30,20 @@ from lib.objects import ail_objects
 # from modules.Telegram import Telegram
 
 from modules.Languages import Languages
+from modules.OcrExtractor import OcrExtractor
 
 MODULES = {
-    'Languages': Languages
+    'Languages': Languages,
+    'OcrExtractor': OcrExtractor,
 }
 
 def reprocess_message_objects(object_type, module_name=None):
     if module_name:
         module = MODULES[module_name]()
         for obj in ail_objects.obj_iterator(object_type, filters={}):
+            if not obj.exists():
+                print(f'ERROR: object does not exist, {obj.id}')
+                continue
             module.obj = obj
             module.compute(None)
     else: