chg: [reprocess tool] add OcrExtractor module + filter image gif

ocr
terrtia 2024-04-24 15:16:18 +02:00
parent c25ccb8618
commit 7fd8ae4a81
No known key found for this signature in database
GPG Key ID: 1E1B1F50D84613D0
3 changed files with 20 additions and 1 deletions

View File

@ -2,6 +2,7 @@
# -*-coding:UTF-8 -*
import base64
import magic
import os
import sys
@ -64,6 +65,14 @@ class Image(AbstractDaterangeObject):
filename = os.path.join(IMAGE_FOLDER, self.get_rel_path())
return os.path.realpath(filename)
def is_gif(self, filepath=None):
    """
    Tell whether an image file is a GIF, based on its MIME type.

    :param filepath: path of the file to inspect; when it is None or empty,
                     the path returned by self.get_filepath() is used instead
    :return: True when magic.from_file() reports 'image/gif', False otherwise
    """
    # Fall back to this object's own file when no explicit path is supplied.
    target = filepath or self.get_filepath()
    return magic.from_file(target, mime=True) == 'image/gif'
def get_file_content(self):
filepath = self.get_filepath()
with open(filepath, 'rb') as f:

View File

@ -89,6 +89,10 @@ class OcrExtractor(AbstractModule):
if self.is_cached():
return None
if self.obj.is_gif():
self.logger.warning(f'Ignoring GIF: {self.obj.id}')
return None
if not ocr.exists():
path = image.get_filepath()
languages = get_model_languages(image)

View File

@ -30,15 +30,21 @@ from lib.objects import ail_objects
# from modules.Telegram import Telegram
from modules.Languages import Languages
from modules.OcrExtractor import OcrExtractor
# Registry of reprocessable modules: reprocess_message_objects() looks up the
# requested module name here and instantiates the class it maps to.
# NOTE(review): this span is taken from a diff view whose +/- markers were
# stripped; the 'Languages' entry without a trailing comma appears to be the
# removed (pre-change) line and the comma-terminated one its replacement.
# Confirm against the actual file before treating this literal as runnable.
MODULES = {
'Languages': Languages
'Languages': Languages,
'OcrExtractor': OcrExtractor
}
def reprocess_message_objects(object_type, module_name=None):
if module_name:
module = MODULES[module_name]()
for obj in ail_objects.obj_iterator(object_type, filters={}):
if not obj.exists():
print(f'ERROR: object does not exist, {obj.id}')
continue
module.obj = obj
module.compute(None)
else: