mirror of https://github.com/CIRCL/AIL-framework
chg: [reprocess tool] add OcrExtractor module + filter image gif
parent
c25ccb8618
commit
7fd8ae4a81
|
@ -2,6 +2,7 @@
|
||||||
# -*-coding:UTF-8 -*
|
# -*-coding:UTF-8 -*
|
||||||
|
|
||||||
import base64
|
import base64
|
||||||
|
import magic
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
@ -64,6 +65,14 @@ class Image(AbstractDaterangeObject):
|
||||||
filename = os.path.join(IMAGE_FOLDER, self.get_rel_path())
|
filename = os.path.join(IMAGE_FOLDER, self.get_rel_path())
|
||||||
return os.path.realpath(filename)
|
return os.path.realpath(filename)
|
||||||
|
|
||||||
|
def is_gif(self, filepath=None):
    """
    Tell whether an image file is a GIF, based on its detected MIME type.

    :param filepath: path of the file to inspect; when omitted or empty,
                     the path returned by self.get_filepath() is used.
    :return: True if the detected MIME type is 'image/gif', False otherwise.
    """
    # Fall back to this object's own file when no explicit path is supplied.
    target = filepath or self.get_filepath()
    detected = magic.from_file(target, mime=True)
    return detected == 'image/gif'
|
||||||
|
|
||||||
def get_file_content(self):
|
def get_file_content(self):
|
||||||
filepath = self.get_filepath()
|
filepath = self.get_filepath()
|
||||||
with open(filepath, 'rb') as f:
|
with open(filepath, 'rb') as f:
|
||||||
|
|
|
@ -89,6 +89,10 @@ class OcrExtractor(AbstractModule):
|
||||||
if self.is_cached():
|
if self.is_cached():
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
if self.obj.is_gif():
|
||||||
|
self.logger.warning(f'Ignoring GIF: {self.obj.id}')
|
||||||
|
return None
|
||||||
|
|
||||||
if not ocr.exists():
|
if not ocr.exists():
|
||||||
path = image.get_filepath()
|
path = image.get_filepath()
|
||||||
languages = get_model_languages(image)
|
languages = get_model_languages(image)
|
||||||
|
|
|
@ -30,15 +30,21 @@ from lib.objects import ail_objects
|
||||||
# from modules.Telegram import Telegram
|
# from modules.Telegram import Telegram
|
||||||
|
|
||||||
from modules.Languages import Languages
|
from modules.Languages import Languages
|
||||||
|
from modules.OcrExtractor import OcrExtractor
|
||||||
|
|
||||||
MODULES = {
|
MODULES = {
|
||||||
'Languages': Languages
|
'Languages': Languages,
|
||||||
|
'OcrExtractor': OcrExtractor
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def reprocess_message_objects(object_type, module_name=None):
|
def reprocess_message_objects(object_type, module_name=None):
|
||||||
if module_name:
|
if module_name:
|
||||||
module = MODULES[module_name]()
|
module = MODULES[module_name]()
|
||||||
for obj in ail_objects.obj_iterator(object_type, filters={}):
|
for obj in ail_objects.obj_iterator(object_type, filters={}):
|
||||||
|
if not obj.exists():
|
||||||
|
print(f'ERROR: object does not exist, {obj.id}')
|
||||||
|
continue
|
||||||
module.obj = obj
|
module.obj = obj
|
||||||
module.compute(None)
|
module.compute(None)
|
||||||
else:
|
else:
|
||||||
|
|
Loading…
Reference in New Issue