mirror of https://github.com/CIRCL/AIL-framework
chg: [reprocess tool] add OcrExtractor module + filter image gif
parent
c25ccb8618
commit
7fd8ae4a81
|
@ -2,6 +2,7 @@
|
|||
# -*-coding:UTF-8 -*
|
||||
|
||||
import base64
|
||||
import magic
|
||||
import os
|
||||
import sys
|
||||
|
||||
|
@ -64,6 +65,14 @@ class Image(AbstractDaterangeObject):
|
|||
filename = os.path.join(IMAGE_FOLDER, self.get_rel_path())
|
||||
return os.path.realpath(filename)
|
||||
|
||||
def is_gif(self, filepath=None):
    """
    Tell whether an image file is a GIF, based on its detected MIME type.

    :param filepath: path of the file to inspect; any falsy value
                     (None, empty string) falls back to self.get_filepath()
    :return: True when the detected MIME type is 'image/gif', else False
    """
    target = filepath or self.get_filepath()
    return magic.from_file(target, mime=True) == 'image/gif'
|
||||
|
||||
def get_file_content(self):
|
||||
filepath = self.get_filepath()
|
||||
with open(filepath, 'rb') as f:
|
||||
|
|
|
@ -89,6 +89,10 @@ class OcrExtractor(AbstractModule):
|
|||
if self.is_cached():
|
||||
return None
|
||||
|
||||
if self.obj.is_gif():
|
||||
self.logger.warning(f'Ignoring GIF: {self.obj.id}')
|
||||
return None
|
||||
|
||||
if not ocr.exists():
|
||||
path = image.get_filepath()
|
||||
languages = get_model_languages(image)
|
||||
|
|
|
@ -30,15 +30,21 @@ from lib.objects import ail_objects
|
|||
# from modules.Telegram import Telegram
|
||||
|
||||
from modules.Languages import Languages
|
||||
from modules.OcrExtractor import OcrExtractor
|
||||
|
||||
# Registry of modules available for reprocessing, keyed by module name.
# reprocess_message_objects() looks a class up here by name and instantiates it.
MODULES = {
    'Languages': Languages,
    'OcrExtractor': OcrExtractor,
}
|
||||
|
||||
def reprocess_message_objects(object_type, module_name=None):
|
||||
if module_name:
|
||||
module = MODULES[module_name]()
|
||||
for obj in ail_objects.obj_iterator(object_type, filters={}):
|
||||
if not obj.exists():
|
||||
print(f'ERROR: object does not exist, {obj.id}')
|
||||
continue
|
||||
module.obj = obj
|
||||
module.compute(None)
|
||||
else:
|
||||
|
|
Loading…
Reference in New Issue