mirror of https://github.com/CIRCL/AIL-framework
				
				
				
			fix: [chat] fix subchannel-message correlation + fix empty message language detection
							parent
							
								
									b9c37167ad
								
							
						
					
					
						commit
						2db54def46
					
				| 
						 | 
				
			
			@ -206,8 +206,7 @@ class AbstractChatFeeder(DefaultFeeder, ABC):
 | 
			
		|||
        subchannel = ChatSubChannels.ChatSubChannel(f'{self.get_chat_id()}/{meta["id"]}', self.get_chat_instance_uuid())
 | 
			
		||||
        thread = None
 | 
			
		||||
 | 
			
		||||
        # TODO correlation with obj = message/image
 | 
			
		||||
        subchannel.add(date)
 | 
			
		||||
        subchannel.add(date, obj)
 | 
			
		||||
 | 
			
		||||
        if meta.get('date'): # TODO check if already exists
 | 
			
		||||
            subchannel.set_created_at(int(meta['date']['timestamp']))
 | 
			
		||||
| 
						 | 
				
			
			@ -358,7 +357,58 @@ class AbstractChatFeeder(DefaultFeeder, ABC):
 | 
			
		|||
            # CHAT
 | 
			
		||||
            chat_objs = self.process_chat(new_objs, obj, date, timestamp, reply_id=reply_id)
 | 
			
		||||
 | 
			
		||||
            # Message forward
 | 
			
		||||
            # # TODO HANDLE OTHERS OBJECT TYPE
 | 
			
		||||
            # # TODO MAKE IT GENERIC FOR OTHERS CHATS !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 | 
			
		||||
            # # Message forward + Discussion
 | 
			
		||||
            # if self.get_json_meta().get('forward'):
 | 
			
		||||
            #     discussion_id = self.get_json_meta().get('discussion')
 | 
			
		||||
            #     forward_from = self.get_message_forward()
 | 
			
		||||
            #
 | 
			
		||||
            #     if discussion_id:       # TODO HANDLE FORWARDED MESSAGES FROM EXTERNAL CHANNELS
 | 
			
		||||
            #         chat_forward_id = forward_from['from']['id']
 | 
			
		||||
            #         message_forward_id = forward_from['from']['channel_post']
 | 
			
		||||
            #
 | 
			
		||||
            #         # if chat_forward_id == discussion_id:
 | 
			
		||||
            #         #     linked_chat = Chat(chat_forward_id, self.get_chat_instance_uuid())
 | 
			
		||||
            #         #     if linked_chat.exists():
 | 
			
		||||
            #         #         # create thread
 | 
			
		||||
            #         #         # add message replies for each childrens
 | 
			
		||||
            #
 | 
			
		||||
            # # TODO HANDLE THREAD
 | 
			
		||||
            # # TODO Change FORWARD META FIELDS
 | 
			
		||||
            # # meta['forward'] = {}
 | 
			
		||||
            # #       # CHAT ID
 | 
			
		||||
            # #       # SUBCHANNEL ID     -> can be None
 | 
			
		||||
            # #       # Message ID
 | 
			
		||||
            #
 | 
			
		||||
            # # meta['forward']['origin']
 | 
			
		||||
            # #       # same as 'forward'
 | 
			
		||||
            #
 | 
			
		||||
            # if self.get_json_meta().get('forward'):
 | 
			
		||||
            #     forward = self.get_message_forward()
 | 
			
		||||
            #     f_chat = forward['chat']
 | 
			
		||||
            #     f_subchannel = forward.get('subchannel')
 | 
			
		||||
            #     f_id = forward.get('id')
 | 
			
		||||
            #     if not f_subchannel:
 | 
			
		||||
            #         chat_forward = Chat(f_chat, self.get_chat_instance_uuid())
 | 
			
		||||
            #         if chat_forward.exists():
 | 
			
		||||
            #             for chat_obj in chat_objs:
 | 
			
		||||
            #                 if chat_obj.type == 'chat':
 | 
			
		||||
            #                     chat_forward.add_relationship(chat_obj.get_global_id(), 'forward')
 | 
			
		||||
            #             # TODO LIST FORWARDED MESSAGES
 | 
			
		||||
            #
 | 
			
		||||
            #
 | 
			
		||||
            # # Discord -> serverID + subchannel ID + message ID
 | 
			
		||||
            # # Telegram -> chat ID + Message ID
 | 
			
		||||
            # #                 + ORIGIN IDs
 | 
			
		||||
            #
 | 
			
		||||
            #
 | 
			
		||||
            #
 | 
			
		||||
            # # TODO create relationships graph
 | 
			
		||||
            #
 | 
			
		||||
            #
 | 
			
		||||
            # # TODO REMOVE ME
 | 
			
		||||
            # # Message forward  # TODO handle subchannel + message ID
 | 
			
		||||
            # if self.get_json_meta().get('forward'):
 | 
			
		||||
            #     forward_from = self.get_message_forward()
 | 
			
		||||
            #     print('-----------------------------------------------------------')
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -265,7 +265,10 @@ def _get_html2text(content, ignore_links=False):
 | 
			
		|||
    h = html2text.HTML2Text()
 | 
			
		||||
    h.ignore_links = ignore_links
 | 
			
		||||
    h.ignore_images = ignore_links
 | 
			
		||||
    return h.handle(content)
 | 
			
		||||
    content = h.handle(content)
 | 
			
		||||
    if content == '\n\n':
 | 
			
		||||
        content = ''
 | 
			
		||||
    return content
 | 
			
		||||
 | 
			
		||||
def _clean_text_to_translate(content, html=False, keys_blocks=True):
 | 
			
		||||
    if html:
 | 
			
		||||
| 
						 | 
				
			
			@ -482,14 +485,23 @@ class LanguagesDetector:
 | 
			
		|||
        return languages
 | 
			
		||||
 | 
			
		||||
    def detect(self, content, force_gcld3=False):  # TODO detect length between 20-200 ????
 | 
			
		||||
        if not content:
 | 
			
		||||
            return None
 | 
			
		||||
        content = _clean_text_to_translate(content, html=True)
 | 
			
		||||
        # print('cleaned content', content)
 | 
			
		||||
        # gcld3
 | 
			
		||||
        if len(content) < 100:
 | 
			
		||||
        if not content:
 | 
			
		||||
            return None
 | 
			
		||||
        # DEBUG
 | 
			
		||||
        # print('-------------------------------------------------------')
 | 
			
		||||
        # print(content)
 | 
			
		||||
        # print(len(content))
 | 
			
		||||
        # lexilang
 | 
			
		||||
        if len(content) < 150:
 | 
			
		||||
            # print('lexilang')
 | 
			
		||||
            languages = self.detect_lexilang(content)
 | 
			
		||||
        # gcld3
 | 
			
		||||
        else:
 | 
			
		||||
            # if len(content) >= 200 or not self.lt or force_gcld3:
 | 
			
		||||
            #     print('gcld3')
 | 
			
		||||
            # print('gcld3')
 | 
			
		||||
            languages = self.detect_gcld3(content)
 | 
			
		||||
        # libretranslate
 | 
			
		||||
        # else:
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -323,7 +323,6 @@ def get_username_meta_from_global_id(username_global_id):
 | 
			
		|||
    username = Usernames.Username(username_id, instance_uuid)
 | 
			
		||||
    return username.get_meta()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# TODO Filter
 | 
			
		||||
## Instance type
 | 
			
		||||
## Chats IDS
 | 
			
		||||
| 
						 | 
				
			
			@ -380,6 +379,22 @@ def get_nb_messages_iterator(filters={}):
 | 
			
		|||
            nb_messages += chat.get_nb_messages()
 | 
			
		||||
    return nb_messages
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#### FIX ####
 | 
			
		||||
 | 
			
		||||
def fix_correlations_subchannel_message():
    """Add the subchannel -> message correlation for every stored message.

    Iterates over every chat service instance, every chat of that instance and
    every subchannel of that chat, then registers a 'message' correlation on
    the subchannel for each message returned by the subchannel.
    """
    for instance_uuid in get_chat_service_instances():
        for chat_id in ChatServiceInstance(instance_uuid).get_chats():
            chat = Chats.Chat(chat_id, instance_uuid)
            # subchannels
            for subchannel_gid in chat.get_subchannels():
                # presumably '<type>:<subtype>:<id>'; maxsplit keeps any ':' inside the id
                _, _, subchannel_id = subchannel_gid.split(':', 2)
                subchannel = ChatSubChannels.ChatSubChannel(subchannel_id, instance_uuid)
                messages, _ = subchannel._get_messages(nb=-1)
                for mess in messages:
                    # mess[0] is split like the subchannel global id above;
                    # without maxsplit an id containing ':' would break the unpacking
                    _, _, message_id = mess[0].split(':', 2)
                    subchannel.add_correlation('message', '', message_id)
 | 
			
		||||
 | 
			
		||||
#### API ####
 | 
			
		||||
 | 
			
		||||
def api_get_chat_service_instance(chat_instance_uuid):
 | 
			
		||||
| 
						 | 
				
			
			@ -392,6 +407,7 @@ def api_get_chat(chat_id, chat_instance_uuid, translation_target=None, nb=-1, pa
 | 
			
		|||
    chat = Chats.Chat(chat_id, chat_instance_uuid)
 | 
			
		||||
    if not chat.exists():
 | 
			
		||||
        return {"status": "error", "reason": "Unknown chat"}, 404
 | 
			
		||||
    # print(chat.get_obj_language_stats())
 | 
			
		||||
    meta = chat.get_meta({'created_at', 'icon', 'info', 'nb_participants', 'subchannels', 'threads', 'translation', 'username'}, translation_target=translation_target)
 | 
			
		||||
    if meta['username']:
 | 
			
		||||
        meta['username'] = get_username_meta_from_global_id(meta['username'])
 | 
			
		||||
| 
						 | 
				
			
			@ -437,6 +453,7 @@ def api_get_subchannel(chat_id, chat_instance_uuid, translation_target=None, nb=
 | 
			
		|||
    subchannel = ChatSubChannels.ChatSubChannel(chat_id, chat_instance_uuid)
 | 
			
		||||
    if not subchannel.exists():
 | 
			
		||||
        return {"status": "error", "reason": "Unknown subchannel"}, 404
 | 
			
		||||
    # print(subchannel.get_obj_language_stats())
 | 
			
		||||
    meta = subchannel.get_meta({'chat', 'created_at', 'icon', 'nb_messages', 'nb_participants', 'threads', 'translation'}, translation_target=translation_target)
 | 
			
		||||
    if meta['chat']:
 | 
			
		||||
        meta['chat'] = get_chat_meta_from_global_id(meta['chat'])
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -113,7 +113,7 @@ class Message(AbstractObject):
 | 
			
		|||
    def get_subchannel(self):
        """Return the global id of the subchannel correlated with this message.

        Returns:
            'chat-subchannel:<value>' built from one entry of the
            'chat-subchannel' correlation, or None when the message has no
            such correlation.
        """
        subchannel = self.get_correlation('chat-subchannel')
        if subchannel.get('chat-subchannel'):
            # the prefix must match the correlated object type (chat-subchannel),
            # not 'user-account'
            return f'chat-subchannel:{subchannel["chat-subchannel"].pop()}'
        return None
 | 
			
		||||
 | 
			
		||||
    def get_thread(self):
 | 
			
		||||
        for child in self.get_childrens():
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -33,6 +33,10 @@ class Languages(AbstractModule):
 | 
			
		|||
                for lang in obj.get_languages(min_probability=0.8, force_gcld3=True):
 | 
			
		||||
                    print(lang)
 | 
			
		||||
                    domain.add_language(lang)
 | 
			
		||||
        # Detect Chat Message Language
 | 
			
		||||
        # elif obj.type == 'message':
 | 
			
		||||
        #     lang = obj.detect_language()
 | 
			
		||||
        #     print(self.obj.id, lang)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if __name__ == '__main__':
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -20,17 +20,39 @@ from lib.ail_core import is_object_type
 | 
			
		|||
from lib import ail_queues
 | 
			
		||||
from lib.objects import ail_objects
 | 
			
		||||
 | 
			
		||||
def reprocess_message_objects(object_type):
 | 
			
		||||
    queue = ail_queues.AILQueue('FeederModuleImporter', -1)
 | 
			
		||||
    for obj in ail_objects.obj_iterator(object_type, filters={}):
 | 
			
		||||
        queue.send_message(obj.get_global_id(), message='reprocess')
 | 
			
		||||
    queue.end()
 | 
			
		||||
# from modules.ApiKey import ApiKey
 | 
			
		||||
# from modules.Categ import Categ
 | 
			
		||||
# from modules.CreditCards import CreditCards
 | 
			
		||||
# from modules.DomClassifier import DomClassifier
 | 
			
		||||
# from modules.Global import Global
 | 
			
		||||
# from modules.Keys import Keys
 | 
			
		||||
# from modules.Onion import Onion
 | 
			
		||||
# from modules.Telegram import Telegram
 | 
			
		||||
 | 
			
		||||
from modules.Languages import Languages
 | 
			
		||||
 | 
			
		||||
MODULES = {
 | 
			
		||||
    'Languages': Languages
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
def reprocess_message_objects(object_type, module_name=None):
    """Reprocess every object of the given type.

    Without a module name, each object's global id is sent to the
    'FeederModuleImporter' queue with the message 'reprocess'. With a module
    name, the matching entry of MODULES is instantiated once and its
    compute() is called for each object in turn.

    :param object_type: AIL object type passed to ail_objects.obj_iterator
    :param module_name: optional key of MODULES selecting the module to run
    """
    if not module_name:
        # default path: push everything back through the importer queue
        importer_queue = ail_queues.AILQueue('FeederModuleImporter', -1)
        for ail_obj in ail_objects.obj_iterator(object_type, filters={}):
            importer_queue.send_message(ail_obj.get_global_id(), message='reprocess')
        importer_queue.end()
        return

    # single-module path: run the module directly on each object
    module = MODULES[module_name]()
    for ail_obj in ail_objects.obj_iterator(object_type, filters={}):
        module.obj = ail_obj
        module.compute(None)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if __name__ == "__main__":
 | 
			
		||||
 | 
			
		||||
    parser = argparse.ArgumentParser(description='Reprocess AIL Objects')
 | 
			
		||||
    parser.add_argument('-t', '--type', type=str, help='AIL Object Type', required=True)
 | 
			
		||||
    parser.add_argument('-m', '--module', type=str, help='AIL Module Name')
 | 
			
		||||
 | 
			
		||||
    args = parser.parse_args()
 | 
			
		||||
    if not args.type:
 | 
			
		||||
| 
						 | 
				
			
			@ -43,4 +65,7 @@ if __name__ == "__main__":
 | 
			
		|||
    if obj_type not in ['item', 'message']:  # TODO image
 | 
			
		||||
        raise Exception(f'Currently not supported Object Type: {obj_type}')
 | 
			
		||||
 | 
			
		||||
    reprocess_message_objects(obj_type)
 | 
			
		||||
    modulename = args.module
 | 
			
		||||
    if modulename not in MODULES:
 | 
			
		||||
        raise Exception(f'Currently not supported Module: {modulename}')
 | 
			
		||||
    reprocess_message_objects(obj_type, module_name=modulename)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -10,6 +10,7 @@ sys.path.append(os.environ['AIL_HOME'])
 | 
			
		|||
##################################
 | 
			
		||||
from update.bin.ail_updater import AIL_Updater
 | 
			
		||||
from lib import ail_updates
 | 
			
		||||
from lib import chats_viewer
 | 
			
		||||
 | 
			
		||||
class Updater(AIL_Updater):
 | 
			
		||||
    """default Updater."""
 | 
			
		||||
| 
						 | 
				
			
			@ -19,6 +20,7 @@ class Updater(AIL_Updater):
 | 
			
		|||
 | 
			
		||||
 | 
			
		||||
if __name__ == '__main__':
 | 
			
		||||
    chats_viewer.fix_correlations_subchannel_message()
 | 
			
		||||
    updater = Updater('v5.4')
 | 
			
		||||
    updater.run_update()
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue