mirror of https://github.com/CIRCL/AIL-framework
fix: [chat] fix subchannel-message correlation + fix empty message language detection
parent
b9c37167ad
commit
2db54def46
|
@ -206,8 +206,7 @@ class AbstractChatFeeder(DefaultFeeder, ABC):
|
|||
subchannel = ChatSubChannels.ChatSubChannel(f'{self.get_chat_id()}/{meta["id"]}', self.get_chat_instance_uuid())
|
||||
thread = None
|
||||
|
||||
# TODO correlation with obj = message/image
|
||||
subchannel.add(date)
|
||||
subchannel.add(date, obj)
|
||||
|
||||
if meta.get('date'): # TODO check if already exists
|
||||
subchannel.set_created_at(int(meta['date']['timestamp']))
|
||||
|
@ -358,7 +357,58 @@ class AbstractChatFeeder(DefaultFeeder, ABC):
|
|||
# CHAT
|
||||
chat_objs = self.process_chat(new_objs, obj, date, timestamp, reply_id=reply_id)
|
||||
|
||||
# Message forward
|
||||
# # TODO HANDLE OTHERS OBJECT TYPE
|
||||
# # TODO MAKE IT GENERIC FOR OTHERS CHATS !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
# # Message forward + Discussion
|
||||
# if self.get_json_meta().get('forward'):
|
||||
# discussion_id = self.get_json_meta().get('discussion')
|
||||
# forward_from = self.get_message_forward()
|
||||
#
|
||||
# if discussion_id: # TODO HANDLE FORWARDED MESSAGES FROM EXTERNAL CHANNELS
|
||||
# chat_forward_id = forward_from['from']['id']
|
||||
# message_forward_id = forward_from['from']['channel_post']
|
||||
#
|
||||
# # if chat_forward_id == discussion_id:
|
||||
# # linked_chat = Chat(chat_forward_id, self.get_chat_instance_uuid())
|
||||
# # if linked_chat.exists():
|
||||
# # # create thread
|
||||
# # # add message replies for each childrens
|
||||
#
|
||||
# # TODO HANDLE THREAD
|
||||
# # TODO Change FORWARD META FIELDS
|
||||
# # meta['forward'] = {}
|
||||
# # # CHAT ID
|
||||
# # # SUBCHANNEL ID -> can be None
|
||||
# # # Message ID
|
||||
#
|
||||
# # meta['forward']['origin']
|
||||
# # # same as 'forward'
|
||||
#
|
||||
# if self.get_json_meta().get('forward'):
|
||||
# forward = self.get_message_forward()
|
||||
# f_chat = forward['chat']
|
||||
# f_subchannel = forward.get('subchannel')
|
||||
# f_id = forward.get('id')
|
||||
# if not f_subchannel:
|
||||
# chat_forward = Chat(f_chat, self.get_chat_instance_uuid())
|
||||
# if chat_forward.exists():
|
||||
# for chat_obj in chat_objs:
|
||||
# if chat_obj.type == 'chat':
|
||||
# chat_forward.add_relationship(chat_obj.get_global_id(), 'forward')
|
||||
# # TODO LIST FORWARDED MESSAGES
|
||||
#
|
||||
#
|
||||
# # Discord -> serverID + subchannel ID + message ID
|
||||
# # Telegram -> chat ID + Message ID
|
||||
# # + ORIGIN IDs
|
||||
#
|
||||
#
|
||||
#
|
||||
# # TODO create relationships graph
|
||||
#
|
||||
#
|
||||
# # TODO REMOVE ME
|
||||
# # Message forward # TODO handle subchannel + message ID
|
||||
# if self.get_json_meta().get('forward'):
|
||||
# forward_from = self.get_message_forward()
|
||||
# print('-----------------------------------------------------------')
|
||||
|
|
|
@ -265,7 +265,10 @@ def _get_html2text(content, ignore_links=False):
|
|||
h = html2text.HTML2Text()
|
||||
h.ignore_links = ignore_links
|
||||
h.ignore_images = ignore_links
|
||||
return h.handle(content)
|
||||
content = h.handle(content)
|
||||
if content == '\n\n':
|
||||
content = ''
|
||||
return content
|
||||
|
||||
def _clean_text_to_translate(content, html=False, keys_blocks=True):
|
||||
if html:
|
||||
|
@ -482,14 +485,23 @@ class LanguagesDetector:
|
|||
return languages
|
||||
|
||||
def detect(self, content, force_gcld3=False): # TODO detect length between 20-200 ????
|
||||
if not content:
|
||||
return None
|
||||
content = _clean_text_to_translate(content, html=True)
|
||||
# print('cleaned content', content)
|
||||
# gcld3
|
||||
if len(content) < 100:
|
||||
if not content:
|
||||
return None
|
||||
# DEBUG
|
||||
# print('-------------------------------------------------------')
|
||||
# print(content)
|
||||
# print(len(content))
|
||||
# lexilang
|
||||
if len(content) < 150:
|
||||
# print('lexilang')
|
||||
languages = self.detect_lexilang(content)
|
||||
# gcld3
|
||||
else:
|
||||
# if len(content) >= 200 or not self.lt or force_gcld3:
|
||||
# print('gcld3')
|
||||
# print('gcld3')
|
||||
languages = self.detect_gcld3(content)
|
||||
# libretranslate
|
||||
# else:
|
||||
|
|
|
@ -323,7 +323,6 @@ def get_username_meta_from_global_id(username_global_id):
|
|||
username = Usernames.Username(username_id, instance_uuid)
|
||||
return username.get_meta()
|
||||
|
||||
|
||||
# TODO Filter
|
||||
## Instance type
|
||||
## Chats IDS
|
||||
|
@ -380,6 +379,22 @@ def get_nb_messages_iterator(filters={}):
|
|||
nb_messages += chat.get_nb_messages()
|
||||
return nb_messages
|
||||
|
||||
|
||||
#### FIX ####
|
||||
|
||||
def fix_correlations_subchannel_message():
    """Re-add the 'message' correlation on every subchannel for each of its messages.

    One-shot repair routine: iterates over every chat service instance, every
    chat of that instance and every subchannel of that chat, then calls
    ``add_correlation('message', '', message_id)`` on the subchannel for each
    message returned by ``subchannel._get_messages(nb=-1)``.

    Both global IDs are split with ``maxsplit=2`` so that only the first two
    ':' separators are consumed; an object ID that itself contains ':' is
    kept whole instead of breaking the 3-way unpacking with a ValueError.
    """
    for instance_uuid in get_chat_service_instances():
        for chat_id in ChatServiceInstance(instance_uuid).get_chats():
            chat = Chats.Chat(chat_id, instance_uuid)
            # subchannels
            for subchannel_gid in chat.get_subchannels():
                # keep only the third ':'-separated field (the subchannel ID)
                _, _, subchannel_id = subchannel_gid.split(':', 2)
                subchannel = ChatSubChannels.ChatSubChannel(subchannel_id, instance_uuid)
                # nb=-1: presumably "all messages, no pagination" - TODO confirm
                messages, _ = subchannel._get_messages(nb=-1)
                for mess in messages:
                    # mess[0] is presumably the message global ID
                    # ('type:subtype:id'); same split rule as the subchannel ID
                    _, _, message_id = mess[0].split(':', 2)
                    subchannel.add_correlation('message', '', message_id)
|
||||
|
||||
#### API ####
|
||||
|
||||
def api_get_chat_service_instance(chat_instance_uuid):
|
||||
|
@ -392,6 +407,7 @@ def api_get_chat(chat_id, chat_instance_uuid, translation_target=None, nb=-1, pa
|
|||
chat = Chats.Chat(chat_id, chat_instance_uuid)
|
||||
if not chat.exists():
|
||||
return {"status": "error", "reason": "Unknown chat"}, 404
|
||||
# print(chat.get_obj_language_stats())
|
||||
meta = chat.get_meta({'created_at', 'icon', 'info', 'nb_participants', 'subchannels', 'threads', 'translation', 'username'}, translation_target=translation_target)
|
||||
if meta['username']:
|
||||
meta['username'] = get_username_meta_from_global_id(meta['username'])
|
||||
|
@ -437,6 +453,7 @@ def api_get_subchannel(chat_id, chat_instance_uuid, translation_target=None, nb=
|
|||
subchannel = ChatSubChannels.ChatSubChannel(chat_id, chat_instance_uuid)
|
||||
if not subchannel.exists():
|
||||
return {"status": "error", "reason": "Unknown subchannel"}, 404
|
||||
# print(subchannel.get_obj_language_stats())
|
||||
meta = subchannel.get_meta({'chat', 'created_at', 'icon', 'nb_messages', 'nb_participants', 'threads', 'translation'}, translation_target=translation_target)
|
||||
if meta['chat']:
|
||||
meta['chat'] = get_chat_meta_from_global_id(meta['chat'])
|
||||
|
|
|
@ -113,7 +113,7 @@ class Message(AbstractObject):
|
|||
def get_subchannel(self):
|
||||
subchannel = self.get_correlation('chat-subchannel')
|
||||
if subchannel.get('chat-subchannel'):
|
||||
return f'user-account:{subchannel["chat-subchannel"].pop()}'
|
||||
return f'chat-subchannel:{subchannel["chat-subchannel"].pop()}'
|
||||
|
||||
def get_thread(self):
|
||||
for child in self.get_childrens():
|
||||
|
|
|
@ -33,6 +33,10 @@ class Languages(AbstractModule):
|
|||
for lang in obj.get_languages(min_probability=0.8, force_gcld3=True):
|
||||
print(lang)
|
||||
domain.add_language(lang)
|
||||
# Detect Chat Message Language
|
||||
# elif obj.type == 'message':
|
||||
# lang = obj.detect_language()
|
||||
# print(self.obj.id, lang)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
|
@ -20,17 +20,39 @@ from lib.ail_core import is_object_type
|
|||
from lib import ail_queues
|
||||
from lib.objects import ail_objects
|
||||
|
||||
def reprocess_message_objects(object_type):
|
||||
queue = ail_queues.AILQueue('FeederModuleImporter', -1)
|
||||
for obj in ail_objects.obj_iterator(object_type, filters={}):
|
||||
queue.send_message(obj.get_global_id(), message='reprocess')
|
||||
queue.end()
|
||||
# from modules.ApiKey import ApiKey
|
||||
# from modules.Categ import Categ
|
||||
# from modules.CreditCards import CreditCards
|
||||
# from modules.DomClassifier import DomClassifier
|
||||
# from modules.Global import Global
|
||||
# from modules.Keys import Keys
|
||||
# from modules.Onion import Onion
|
||||
# from modules.Telegram import Telegram
|
||||
|
||||
from modules.Languages import Languages
|
||||
|
||||
MODULES = {
|
||||
'Languages': Languages
|
||||
}
|
||||
|
||||
def reprocess_message_objects(object_type, module_name=None):
    """Reprocess every AIL object of the given type.

    :param object_type: AIL object type passed to ``ail_objects.obj_iterator``.
    :param module_name: optional key of ``MODULES``. When given, the module is
        instantiated once and run directly on each object. Otherwise each
        object's global ID is sent to the 'FeederModuleImporter' queue with
        the message 'reprocess'.
    """
    if not module_name:
        # No module requested: push every object back through the importer queue.
        importer_queue = ail_queues.AILQueue('FeederModuleImporter', -1)
        for ail_obj in ail_objects.obj_iterator(object_type, filters={}):
            importer_queue.send_message(ail_obj.get_global_id(), message='reprocess')
        importer_queue.end()
        return

    # A module was requested: run it in-process on each object.
    handler = MODULES[module_name]()
    for ail_obj in ail_objects.obj_iterator(object_type, filters={}):
        handler.obj = ail_obj
        handler.compute(None)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
parser = argparse.ArgumentParser(description='Reprocess AIL Objects')
|
||||
parser.add_argument('-t', '--type', type=str, help='AIL Object Type', required=True)
|
||||
parser.add_argument('-m', '--module', type=str, help='AIL Module Name')
|
||||
|
||||
args = parser.parse_args()
|
||||
if not args.type:
|
||||
|
@ -43,4 +65,7 @@ if __name__ == "__main__":
|
|||
if obj_type not in ['item', 'message']: # TODO image
|
||||
raise Exception(f'Currently not supported Object Type: {obj_type}')
|
||||
|
||||
reprocess_message_objects(obj_type)
|
||||
modulename = args.module
|
||||
if modulename not in MODULES:
|
||||
raise Exception(f'Currently not supported Module: {modulename}')
|
||||
reprocess_message_objects(obj_type, module_name=modulename)
|
||||
|
|
|
@ -10,6 +10,7 @@ sys.path.append(os.environ['AIL_HOME'])
|
|||
##################################
|
||||
from update.bin.ail_updater import AIL_Updater
|
||||
from lib import ail_updates
|
||||
from lib import chats_viewer
|
||||
|
||||
class Updater(AIL_Updater):
|
||||
"""default Updater."""
|
||||
|
@ -19,6 +20,7 @@ class Updater(AIL_Updater):
|
|||
|
||||
|
||||
if __name__ == '__main__':
|
||||
chats_viewer.fix_correlations_subchannel_message()
|
||||
updater = Updater('v5.4')
|
||||
updater.run_update()
|
||||
|
||||
|
|
Loading…
Reference in New Issue