From 197ff0222d6380f4f2a7a6f074d9411d7cd3e712 Mon Sep 17 00:00:00 2001 From: terrtia Date: Fri, 8 Mar 2024 15:26:06 +0100 Subject: [PATCH 1/4] chg: [lang] improve language detection + UI: manual translation and detection --- bin/lib/Language.py | 82 ++++++++++++++----- bin/lib/chats_viewer.py | 27 ++++-- bin/lib/objects/Messages.py | 15 +++- bin/lib/objects/abstract_chat_object.py | 2 +- bin/lib/objects/abstract_object.py | 12 ++- requirements.txt | 1 + update/v5.4/Update.sh | 1 + var/www/blueprints/chats_explorer.py | 15 +++- .../chats_explorer/block_message.html | 59 ++++++++----- .../chats_explorer/block_translation.html | 1 + 10 files changed, 164 insertions(+), 51 deletions(-) diff --git a/bin/lib/Language.py b/bin/lib/Language.py index 6e77d5ea..5d3d29c9 100755 --- a/bin/lib/Language.py +++ b/bin/lib/Language.py @@ -7,6 +7,7 @@ import sys import html2text import gcld3 +from lexilang.detector import detect as lexilang_detect from libretranslatepy import LibreTranslateAPI sys.path.append(os.environ['AIL_BIN']) @@ -342,18 +343,31 @@ def remove_obj_language(language, obj_type, obj_subtype, obj_id): obj_global_id = f'{obj_type}:{obj_subtype}:{obj_id}' r_lang.srem(f'obj:lang:{obj_global_id}', language) + delete_obj_translation(obj_global_id, language) + r_lang.srem(f'langs:{obj_type}:{obj_subtype}:{language}', obj_global_id) if not r_lang.exists(f'langs:{obj_type}:{obj_subtype}:{language}'): r_lang.srem(f'objs:lang:{obj_type}:{obj_subtype}', language) r_lang.srem(f'languages:{language}', f'{obj_type}:{obj_subtype}') if not r_lang.exists(f'objs:lang:{obj_type}:{obj_subtype}'): - if r_lang.scard(f'objs:langs:{obj_type}', language) <= 1: + if r_lang.scard(f'objs:langs:{obj_type}') <= 1: r_lang.srem(f'objs:langs:{obj_type}', language) -def edit_obj_language(language, obj_type, obj_subtype, obj_id): - remove_obj_language(language, obj_type, obj_subtype, obj_id) - add_obj_language(language, obj_type, obj_subtype, obj_id) - +# TODO handle fields +def detect_obj_language(obj_type, obj_subtype, obj_id, content): + detector = LanguagesDetector(nb_langs=1) + language = detector.detect(content) + if language: + language = language[0] + previous_lang = get_obj_languages(obj_type, obj_subtype, obj_id) + if previous_lang: + previous_lang = previous_lang[0] + if language != previous_lang: + remove_obj_language(language, obj_type, obj_subtype, obj_id) + add_obj_language(language, obj_type, obj_subtype, obj_id) + else: + add_obj_language(language, obj_type, obj_subtype, obj_id) + return language ## Translation def _get_obj_translation(obj_global_id, language, field=''): @@ -364,6 +378,7 @@ def get_obj_translation(obj_global_id, language, source=None, content=None, fiel Returns translated content """ translation = r_cache.get(f'translation:{language}:{obj_global_id}:{field}') + r_cache.expire(f'translation:{language}:{obj_global_id}:{field}', 0) if translation: # DEBUG # print('cache') @@ -372,7 +387,10 @@ def get_obj_translation(obj_global_id, language, source=None, content=None, fiel # TODO HANDLE FIELDS TRANSLATION translation = _get_obj_translation(obj_global_id, language, field=field) if not translation: - translation = LanguageTranslator().translate(content, source=source, target=language) + source, translation = LanguageTranslator().translate(content, source=source, target=language) + if source and translation: + obj_type, subtype, obj_id = obj_global_id.split(':', 2) + add_obj_language(source, obj_type, subtype, obj_id) if translation: r_cache.set(f'translation:{language}:{obj_global_id}:{field}', translation) r_cache.expire(f'translation:{language}:{obj_global_id}:{field}', 300) @@ -380,10 +398,14 @@ def get_obj_translation(obj_global_id, language, source=None, content=None, fiel # TODO Force to edit ???? + def set_obj_translation(obj_global_id, language, translation, field=''): r_cache.delete(f'translation:{language}:{obj_global_id}:') return r_lang.hset(f'tr:{obj_global_id}:{field}', language, translation) +def delete_obj_translation(obj_global_id, language, field=''): + r_cache.delete(f'translation:{language}:{obj_global_id}:') + r_lang.hdel(f'tr:{obj_global_id}:{field}', language) ## --LANGUAGE ENGINE-- ## @@ -410,11 +432,22 @@ class LanguagesDetector: if self.min_len > 0: if len(content) < self.min_len: return languages + # p = self.detector.FindTopNMostFreqLangs(content, num_langs=3) + # for lang in p: + # print(lang.language, lang.probability, lang.proportion, lang.is_reliable) + # print('------------------------------------------------') for lang in self.detector.FindTopNMostFreqLangs(content, num_langs=self.nb_langs): if lang.proportion >= self.min_proportion and lang.probability >= self.min_probability and lang.is_reliable: languages.append(lang.language) return languages + def detect_lexilang(self, content): # TODO clean text ??? - TODO REMOVE SEPARATOR + language, prob = lexilang_detect(content) + if prob > 0: + return [language] + else: + return [] + def detect_libretranslate(self, content): languages = [] try: @@ -431,19 +464,26 @@ class LanguagesDetector: languages.append(language) return languages - def detect(self, content, force_gcld3=False): + def detect(self, content, force_gcld3=False): # TODO detect length between 20-200 ???? + content = _clean_text_to_translate(content, html=True) + # print('cleaned content', content) # gcld3 - if len(content) >= 200 or not self.lt or force_gcld3: - language = self.detect_gcld3(content) - # libretranslate + if len(content) < 100: + languages = self.detect_lexilang(content) else: - language = self.detect_libretranslate(content) - return language + # if len(content) >= 200 or not self.lt or force_gcld3: + # print('gcld3') + languages = self.detect_gcld3(content) + # libretranslate + # else: + # languages = self.detect_libretranslate(content) + return languages class LanguageTranslator: def __init__(self): self.lt = LibreTranslateAPI(get_translator_instance()) + self.ld = LanguagesDetector(nb_langs=1) def languages(self): languages = [] @@ -473,13 +513,13 @@ class LanguageTranslator: return language[0].get('language') def detect(self, content): - # gcld3 - if len(content) >= 200: - language = self.detect_gcld3(content) - # libretranslate - else: - language = self.detect_libretranslate(content) - return language + # print('++++++++++++++++++++++++++++++++++++++++++++++++++++++') + # print(content) + language = self.ld.detect(content) + if language: + # print(language[0]) + # print('##############################################################') + return language[0] def translate(self, content, source=None, target="en"): # TODO source target if target not in get_translation_languages(): @@ -498,9 +538,9 @@ class LanguageTranslator: translation = None # TODO LOG and display error if translation == content: - print('EQUAL') + # print('EQUAL') translation = None - return translation + return source, translation LIST_LANGUAGES = {} diff --git a/bin/lib/chats_viewer.py b/bin/lib/chats_viewer.py index 0ec42772..b8d51975 100755 --- a/bin/lib/chats_viewer.py +++ b/bin/lib/chats_viewer.py @@ -404,18 +404,33 @@ def api_get_message(message_id, translation_target=None): message = Messages.Message(message_id) if not message.exists(): return {"status": "error", "reason": "Unknown uuid"}, 404 - meta = message.get_meta({'chat', 'content', 'files-names', 'icon', 'images', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'translation', 'user-account'}, translation_target=translation_target) + meta = message.get_meta({'chat', 'content', 'files-names', 'icon', 'images', 'language', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'translation', 'user-account'}, translation_target=translation_target) return meta, 200 -def api_manually_translate_message(message_id, translation_target, translation): +def api_message_detect_language(message_id): + message = Messages.Message(message_id) + if not message.exists(): + return {"status": "error", "reason": "Unknown uuid"}, 404 + lang = message.detect_language() + return {"language": lang}, 200 + +def api_manually_translate_message(message_id, source, translation_target, translation): message = Messages.Message(message_id) if not message.exists(): return {"status": "error", "reason": "Unknown uuid"}, 404 - if len(translation) > 200000: # TODO REVIEW LIMIT - return {"status": "error", "reason": "Max Size reached"}, 400 - if translation_target not in Language.get_translation_languages(): - return {"status": "error", "reason": "Unknown Language"}, 400 if translation: + if len(translation) > 200000: # TODO REVIEW LIMIT + return {"status": "error", "reason": "Max Size reached"}, 400 + all_languages = Language.get_translation_languages() + if source not in all_languages: + print(source) + return {"status": "error", "reason": "Unknown source Language"}, 400 + message_language = message.get_language() + if message_language != source: + message.edit_language(message_language, source) + if translation: + if translation_target not in all_languages: + return {"status": "error", "reason": "Unknown target Language"}, 400 message.set_translation(translation_target, translation) # TODO SANITYZE translation return None, 200 diff --git a/bin/lib/objects/Messages.py b/bin/lib/objects/Messages.py index 56281e69..395ec680 100755 --- a/bin/lib/objects/Messages.py +++ b/bin/lib/objects/Messages.py @@ -175,6 +175,13 @@ class Message(AbstractObject): # message media # flag is deleted -> event or missing from feeder pass ??? + def get_language(self): + languages = self.get_languages() + if languages: + return languages.pop() + else: + return None + def get_translation(self, content=None, source=None, target='fr'): """ Returns translated content @@ -289,8 +296,14 @@ class Message(AbstractObject): meta['files-names'] = self.get_files_names() if 'reactions' in options: meta['reactions'] = self.get_reactions() + if 'language' in options: + meta['language'] = self.get_language() if 'translation' in options and translation_target: - meta['translation'] = self.translate(content=meta.get('content'), target=translation_target) + if meta.get('language'): + source = meta['language'] + else: + source = None + meta['translation'] = self.translate(content=meta.get('content'), source=source, target=translation_target) # meta['encoding'] = None return meta diff --git a/bin/lib/objects/abstract_chat_object.py b/bin/lib/objects/abstract_chat_object.py index 9674d735..ef90b3db 100755 --- a/bin/lib/objects/abstract_chat_object.py +++ b/bin/lib/objects/abstract_chat_object.py @@ -226,7 +226,7 @@ class AbstractChatObject(AbstractSubtypeObject, ABC): def get_message_meta(self, message, timestamp=None, translation_target='', options=None): # TODO handle file message message = Messages.Message(message[9:]) if not options: - options = {'content', 'files-names', 'images', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'translation', 'user-account'} + options = {'content', 'files-names', 'images', 'language', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'translation', 'user-account'} meta = message.get_meta(options=options, timestamp=timestamp, translation_target=translation_target) return meta diff --git a/bin/lib/objects/abstract_object.py b/bin/lib/objects/abstract_object.py index 551ad155..40a114ef 100755 --- a/bin/lib/objects/abstract_object.py +++ b/bin/lib/objects/abstract_object.py @@ -25,7 +25,7 @@ from lib import Duplicate from lib.correlations_engine import get_nb_correlations, get_correlations, add_obj_correlation, delete_obj_correlation, delete_obj_correlations, exists_obj_correlation, is_obj_correlated, get_nb_correlation_by_correl_type, get_obj_inter_correlation from lib.Investigations import is_object_investigated, get_obj_investigations, delete_obj_investigations from lib.relationships_engine import get_obj_nb_relationships, add_obj_relationship -from lib.Language import get_obj_languages, add_obj_language, remove_obj_language, get_obj_translation, set_obj_translation +from lib.Language import get_obj_languages, add_obj_language, remove_obj_language, detect_obj_language, get_obj_translation, set_obj_translation, delete_obj_translation from lib.Tracker import is_obj_tracked, get_obj_trackers, delete_obj_trackers logging.config.dictConfig(ail_logger.get_config(name='ail')) @@ -313,12 +313,22 @@ class AbstractObject(ABC): def remove_language(self, language): return remove_obj_language(language, self.type, self.get_subtype(r_str=True), self.id) + def edit_language(self, old_language, new_language): + self.remove_language(old_language) + self.add_language(new_language) + + def detect_language(self, field=''): + return detect_obj_language(self.type, self.get_subtype(r_str=True), self.id, self.get_content()) + def get_translation(self, language, field=''): return get_obj_translation(self.get_global_id(), language, field=field) def set_translation(self, language, translation, field=''): return set_obj_translation(self.get_global_id(), language, translation, field=field) + def delete_translation(self, language, field=''): + return delete_obj_translation(self.get_global_id(), language, field=field) + def translate(self, content=None, field='', source=None, target='en'): global_id = self.get_global_id() if not content: diff --git a/requirements.txt b/requirements.txt index 6fc9e511..bf236ba4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -44,6 +44,7 @@ scrapy-splash>=0.7.2 # Languages gcld3 libretranslatepy +lexilang #Graph numpy>1.18.1 diff --git a/update/v5.4/Update.sh b/update/v5.4/Update.sh index 25e98c30..87f2ab7f 100755 --- a/update/v5.4/Update.sh +++ b/update/v5.4/Update.sh @@ -24,6 +24,7 @@ echo "" echo -e $GREEN"Updating python packages ..."$DEFAULT echo "" pip install -U pylacus +pip install -U lexilang bash ${AIL_BIN}/LAUNCH.sh -lrv diff --git a/var/www/blueprints/chats_explorer.py b/var/www/blueprints/chats_explorer.py index e24abe82..24c1bd84 100644 --- a/var/www/blueprints/chats_explorer.py +++ b/var/www/blueprints/chats_explorer.py @@ -240,11 +240,24 @@ def objects_message(): @login_read_only def objects_message_translate(): message_id = request.form.get('id') + source = request.form.get('language_target') target = request.form.get('target') translation = request.form.get('translation') if target == "Don't Translate": target = None - resp = chats_viewer.api_manually_translate_message(message_id, target, translation) + resp = chats_viewer.api_manually_translate_message(message_id, source, target, translation) + if resp[1] != 200: + return create_json_response(resp[0], resp[1]) + else: + return redirect(url_for('chats_explorer.objects_message', id=message_id, target=target)) + +@chats_explorer.route("/objects/message/detect/language", methods=['GET']) +@login_required +@login_read_only +def objects_message_detect_language(): + message_id = request.args.get('id') + target = request.args.get('target') + resp = chats_viewer.api_message_detect_language(message_id) if resp[1] != 200: return create_json_response(resp[0], resp[1]) else: diff --git a/var/www/templates/chats_explorer/block_message.html b/var/www/templates/chats_explorer/block_message.html index 059e973d..1a52dc67 100644 --- a/var/www/templates/chats_explorer/block_message.html +++ b/var/www/templates/chats_explorer/block_message.html @@ -81,24 +81,6 @@
{{ message['translation'] }}
- {% set mess_id_escape= message['id'] | replace("/", "_") %} - -
-
-
- - - {{translation_target}}: - - -
-
-
- {% endif %} {% for reaction in message['reactions'] %} {{ reaction }} {{ message['reactions'][reaction] }} @@ -113,10 +95,47 @@ {{ tag }} {% endfor %}
+ + {% set mess_id_escape= message['id'] | replace("/", "_") %} + +
+
+
+ + Source: + + + + {% if translation_target %} + +     Target:{{translation_target}} + + + {% else %} + + {% endif %} +
+
+ + Detect Language + +
+
+
+
- - diff --git a/var/www/templates/chats_explorer/block_translation.html b/var/www/templates/chats_explorer/block_translation.html index 6c027921..380a575b 100644 --- a/var/www/templates/chats_explorer/block_translation.html +++ b/var/www/templates/chats_explorer/block_translation.html @@ -10,6 +10,7 @@ {% else %} + {% endif %} {% for language in translation_languages %} From b9c37167ad3e84a5b8fc53b9418f12a208f1c073 Mon Sep 17 00:00:00 2001 From: terrtia Date: Mon, 25 Mar 2024 14:13:16 +0100 Subject: [PATCH 2/4] chg: [language messages] add nb languages stats by chat/subchannel objects --- bin/lib/Language.py | 43 +++++++++----- bin/lib/chats_viewer.py | 1 - bin/lib/objects/Messages.py | 58 +++++++++++-------- bin/lib/objects/abstract_object.py | 21 ++++--- var/www/blueprints/chats_explorer.py | 10 +++- .../chats_explorer/block_message.html | 2 +- 6 files changed, 87 insertions(+), 48 deletions(-) diff --git a/bin/lib/Language.py b/bin/lib/Language.py index 5d3d29c9..f6c9ebfb 100755 --- a/bin/lib/Language.py +++ b/bin/lib/Language.py @@ -324,24 +324,32 @@ def get_objs_languages(obj_type, obj_subtype=''): def get_obj_languages(obj_type, obj_subtype, obj_id): return r_lang.smembers(f'obj:lang:{obj_type}:{obj_subtype}:{obj_id}') +def get_obj_language_stats(obj_type, obj_subtype, obj_id): + return r_lang.zrange(f'obj:langs:stat:{obj_type}:{obj_subtype}:{obj_id}', 0, -1, withscores=True) + # TODO ADD language to CHAT GLOBAL SET -def add_obj_language(language, obj_type, obj_subtype, obj_id): # (s) +def add_obj_language(language, obj_type, obj_subtype, obj_id, objs_containers=set()): # (s) if not obj_subtype: obj_subtype = '' obj_global_id = f'{obj_type}:{obj_subtype}:{obj_id}' r_lang.sadd(f'objs:langs:{obj_type}', language) r_lang.sadd(f'objs:lang:{obj_type}:{obj_subtype}', language) - r_lang.sadd(f'obj:lang:{obj_global_id}', language) + new = r_lang.sadd(f'obj:lang:{obj_global_id}', language) r_lang.sadd(f'languages:{language}', f'{obj_type}:{obj_subtype}') ################### REMOVE ME ??? r_lang.sadd(f'langs:{obj_type}:{obj_subtype}:{language}', obj_global_id) -def remove_obj_language(language, obj_type, obj_subtype, obj_id): + if new: + for global_id in objs_containers: + r_lang.zincrby(f'obj:langs:stat:{global_id}', 1, language) + + +def remove_obj_language(language, obj_type, obj_subtype, obj_id, objs_containers=set()): if not obj_subtype: obj_subtype = '' obj_global_id = f'{obj_type}:{obj_subtype}:{obj_id}' - r_lang.srem(f'obj:lang:{obj_global_id}', language) + rem = r_lang.srem(f'obj:lang:{obj_global_id}', language) delete_obj_translation(obj_global_id, language) @@ -353,27 +361,33 @@ def remove_obj_language(language, obj_type, obj_subtype, obj_id): if r_lang.scard(f'objs:langs:{obj_type}') <= 1: r_lang.srem(f'objs:langs:{obj_type}', language) + if rem: + for global_id in objs_containers: + r = r_lang.zincrby(f'obj:langs:stat:{global_id}', -1, language) + if r < 1: + r_lang.zrem(f'obj:langs:stat:{global_id}', language) + # TODO handle fields -def detect_obj_language(obj_type, obj_subtype, obj_id, content): +def detect_obj_language(obj_type, obj_subtype, obj_id, content, objs_containers=set()): detector = LanguagesDetector(nb_langs=1) language = detector.detect(content) if language: language = language[0] previous_lang = get_obj_languages(obj_type, obj_subtype, obj_id) if previous_lang: - previous_lang = previous_lang[0] + previous_lang = previous_lang.pop() if language != previous_lang: - remove_obj_language(language, obj_type, obj_subtype, obj_id) - add_obj_language(language, obj_type, obj_subtype, obj_id) + remove_obj_language(previous_lang, obj_type, obj_subtype, obj_id, objs_containers=objs_containers) + add_obj_language(language, obj_type, obj_subtype, obj_id, objs_containers=objs_containers) else: - add_obj_language(language, obj_type, obj_subtype, obj_id) + add_obj_language(language, obj_type, obj_subtype, obj_id, objs_containers=objs_containers) return language ## Translation -def _get_obj_translation(obj_global_id, language, field=''): +def r_get_obj_translation(obj_global_id, language, field=''): return r_lang.hget(f'tr:{obj_global_id}:{field}', language) -def get_obj_translation(obj_global_id, language, source=None, content=None, field=''): +def _get_obj_translation(obj_global_id, language, source=None, content=None, field='', objs_containers=set()): """ Returns translated content """ @@ -385,17 +399,20 @@ def get_obj_translation(obj_global_id, language, source=None, content=None, fiel # r_cache.expire(f'translation:{language}:{obj_global_id}:{field}', 0) return translation # TODO HANDLE FIELDS TRANSLATION - translation = _get_obj_translation(obj_global_id, language, field=field) + translation = r_get_obj_translation(obj_global_id, language, field=field) if not translation: source, translation = LanguageTranslator().translate(content, source=source, target=language) if source and translation: obj_type, subtype, obj_id = obj_global_id.split(':', 2) - add_obj_language(source, obj_type, subtype, obj_id) + add_obj_language(source, obj_type, subtype, obj_id, objs_containers=objs_containers) if translation: r_cache.set(f'translation:{language}:{obj_global_id}:{field}', translation) r_cache.expire(f'translation:{language}:{obj_global_id}:{field}', 300) return translation +def get_obj_translation(obj_global_id, language, source=None, content=None, field='', objs_containers=set()): + return _get_obj_translation(obj_global_id, language, source=source, content=content, field=field, objs_containers=objs_containers) + # TODO Force to edit ???? diff --git a/bin/lib/chats_viewer.py b/bin/lib/chats_viewer.py index 397d5aca..e4b0d82c 100755 --- a/bin/lib/chats_viewer.py +++ b/bin/lib/chats_viewer.py @@ -480,7 +480,6 @@ def api_manually_translate_message(message_id, source, translation_target, trans return {"status": "error", "reason": "Max Size reached"}, 400 all_languages = Language.get_translation_languages() if source not in all_languages: - print(source) return {"status": "error", "reason": "Unknown source Language"}, 400 message_language = message.get_language() if message_language != source: diff --git a/bin/lib/objects/Messages.py b/bin/lib/objects/Messages.py index 8f1d16d4..6d57d2cf 100755 --- a/bin/lib/objects/Messages.py +++ b/bin/lib/objects/Messages.py @@ -103,8 +103,17 @@ class Message(AbstractObject): return message_id def get_chat_id(self): # TODO optimize -> use me to tag Chat - chat_id = self.get_basename().rsplit('_', 1)[0] - return chat_id + c_id = self.id.split('/') + return c_id[2] + + def get_chat(self): + c_id = self.id.split('/') + return f'chat:{c_id[0]}:{c_id[2]}' + + def get_subchannel(self): + subchannel = self.get_correlation('chat-subchannel') + if subchannel.get('chat-subchannel'): + return f'user-account:{subchannel["chat-subchannel"].pop()}' def get_thread(self): for child in self.get_childrens(): @@ -183,25 +192,6 @@ class Message(AbstractObject): else: return None - def get_translation(self, content=None, source=None, target='fr'): - """ - Returns translated content - """ - - # return self._get_field('translated') - global_id = self.get_global_id() - translation = r_cache.get(f'translation:{target}:{global_id}') - r_cache.expire(f'translation:{target}:{global_id}', 0) - if translation: - return translation - if not content: - content = self.get_content() - translation = Language.LanguageTranslator().translate(content, source=source, target=target) - if translation: - r_cache.set(f'translation:{target}:{global_id}', translation) - r_cache.expire(f'translation:{target}:{global_id}', 300) - return translation - def _set_translation(self, translation): """ Set translated content @@ -305,6 +295,8 @@ class Message(AbstractObject): else: source = None meta['translation'] = self.translate(content=meta.get('content'), source=source, target=translation_target) + if 'language' in options: + meta['language'] = self.get_language() # meta['encoding'] = None return meta @@ -318,11 +310,29 @@ class Message(AbstractObject): # self._set_translation(translated) # return translated - def create(self, content, translation=None, tags=[]): + ## Language ## + + def get_objs_container(self): + objs_containers = set() + # chat + objs_containers.add(self.get_chat()) + subchannel = self.get_subchannel() + if subchannel: + objs_containers.add(subchannel) + # thread = self.get_thread() # TODO Get current thread + # if thread: + # objs_containers.add(thread) + return objs_containers + + #- Language -# + + def create(self, content, language=None, translation=None, tags=[]): self._set_field('content', content) - # r_content.get(f'content:{self.type}:{self.get_subtype(r_str=True)}:{self.id}', content) - if translation: + if not language and content: + language = self.detect_language() + if translation and content: self._set_translation(translation) + self.set_translation(language, translation) for tag in tags: self.add_tag(tag) diff --git a/bin/lib/objects/abstract_object.py b/bin/lib/objects/abstract_object.py index 40a114ef..78a0698a 100755 --- a/bin/lib/objects/abstract_object.py +++ b/bin/lib/objects/abstract_object.py @@ -25,7 +25,7 @@ from lib import Duplicate from lib.correlations_engine import get_nb_correlations, get_correlations, add_obj_correlation, delete_obj_correlation, delete_obj_correlations, exists_obj_correlation, is_obj_correlated, get_nb_correlation_by_correl_type, get_obj_inter_correlation from lib.Investigations import is_object_investigated, get_obj_investigations, delete_obj_investigations from lib.relationships_engine import get_obj_nb_relationships, add_obj_relationship -from lib.Language import get_obj_languages, add_obj_language, remove_obj_language, detect_obj_language, get_obj_translation, set_obj_translation, delete_obj_translation +from lib.Language import get_obj_languages, add_obj_language, remove_obj_language, detect_obj_language, get_obj_language_stats, get_obj_translation, set_obj_translation, delete_obj_translation from lib.Tracker import is_obj_tracked, get_obj_trackers, delete_obj_trackers logging.config.dictConfig(ail_logger.get_config(name='ail')) @@ -302,26 +302,33 @@ class AbstractObject(ABC): ## -Relationship- ## + def get_objs_container(self): + return set() + ## Language ## def get_languages(self): return get_obj_languages(self.type, self.get_subtype(r_str=True), self.id) def add_language(self, language): - return add_obj_language(language, self.type, self.get_subtype(r_str=True), self.id) + return add_obj_language(language, self.type, self.get_subtype(r_str=True), self.id, objs_containers=self.get_objs_container()) def remove_language(self, language): - return remove_obj_language(language, self.type, self.get_subtype(r_str=True), self.id) + return remove_obj_language(language, self.type, self.get_subtype(r_str=True), self.id, objs_containers=self.get_objs_container()) def edit_language(self, old_language, new_language): - self.remove_language(old_language) + if old_language: + self.remove_language(old_language) self.add_language(new_language) def detect_language(self, field=''): - return detect_obj_language(self.type, self.get_subtype(r_str=True), self.id, self.get_content()) + return detect_obj_language(self.type, self.get_subtype(r_str=True), self.id, self.get_content(), objs_containers=self.get_objs_container()) + + def get_obj_language_stats(self): + return get_obj_language_stats(self.type, self.get_subtype(r_str=True), self.id) def get_translation(self, language, field=''): - return get_obj_translation(self.get_global_id(), language, field=field) + return get_obj_translation(self.get_global_id(), language, field=field, objs_containers=self.get_objs_container()) def set_translation(self, language, translation, field=''): return set_obj_translation(self.get_global_id(), language, translation, field=field) @@ -333,7 +340,7 @@ class AbstractObject(ABC): global_id = self.get_global_id() if not content: content = self.get_content() - translation = get_obj_translation(global_id, target, source=source, content=content, field=field) + translation = get_obj_translation(global_id, target, source=source, content=content, field=field, objs_containers=self.get_objs_container()) return translation ## -Language- ## diff --git a/var/www/blueprints/chats_explorer.py b/var/www/blueprints/chats_explorer.py index b851a5b1..ad3e3d4c 100644 --- a/var/www/blueprints/chats_explorer.py +++ b/var/www/blueprints/chats_explorer.py @@ -253,7 +253,10 @@ def objects_message_translate(): if resp[1] != 200: return create_json_response(resp[0], resp[1]) else: - return redirect(url_for('chats_explorer.objects_message', id=message_id, target=target)) + if request.referrer: + return redirect(request.referrer) + else: + return redirect(url_for('chats_explorer.objects_message', id=message_id, target=target)) @chats_explorer.route("/objects/message/detect/language", methods=['GET']) @login_required @@ -265,7 +268,10 @@ def objects_message_detect_language(): if resp[1] != 200: return create_json_response(resp[0], resp[1]) else: - return redirect(url_for('chats_explorer.objects_message', id=message_id, target=target)) + if request.referrer: + return redirect(request.referrer) + else: + return redirect(url_for('chats_explorer.objects_message', id=message_id, target=target)) @chats_explorer.route("/objects/user-account", methods=['GET']) @login_required diff --git a/var/www/templates/chats_explorer/block_message.html b/var/www/templates/chats_explorer/block_message.html index 1a52dc67..96265809 100644 --- a/var/www/templates/chats_explorer/block_message.html +++ b/var/www/templates/chats_explorer/block_message.html @@ -102,7 +102,7 @@
-
+ Source: From 2db54def46625e10eb0bf0b038c5794d826037f5 Mon Sep 17 00:00:00 2001 From: terrtia Date: Mon, 25 Mar 2024 16:36:24 +0100 Subject: [PATCH 3/4] fix: [chat] fix subchannel-message correlation + fix empty message language detection --- bin/importer/feeders/abstract_chats_feeder.py | 56 ++++++++++++++++++- bin/lib/Language.py | 22 ++++++-- bin/lib/chats_viewer.py | 19 ++++++- bin/lib/objects/Messages.py | 2 +- bin/modules/Languages.py | 4 ++ tools/reprocess_objects.py | 37 ++++++++++-- update/v5.4/Update.py | 2 + 7 files changed, 126 insertions(+), 16 deletions(-) diff --git a/bin/importer/feeders/abstract_chats_feeder.py b/bin/importer/feeders/abstract_chats_feeder.py index 6b8f1041..8b337e9f 100755 --- a/bin/importer/feeders/abstract_chats_feeder.py +++ b/bin/importer/feeders/abstract_chats_feeder.py @@ -206,8 +206,7 @@ class AbstractChatFeeder(DefaultFeeder, ABC): subchannel = ChatSubChannels.ChatSubChannel(f'{self.get_chat_id()}/{meta["id"]}', self.get_chat_instance_uuid()) thread = None - # TODO correlation with obj = message/image - subchannel.add(date) + subchannel.add(date, obj) if meta.get('date'): # TODO check if already exists subchannel.set_created_at(int(meta['date']['timestamp'])) @@ -358,7 +357,58 @@ class AbstractChatFeeder(DefaultFeeder, ABC): # CHAT chat_objs = self.process_chat(new_objs, obj, date, timestamp, reply_id=reply_id) - # Message forward + # # TODO HANDLE OTHERS OBJECT TYPE + # # TODO MAKE IT GENERIC FOR OTHERS CHATS !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + # # Message forward + Discussion + # if self.get_json_meta().get('forward'): + # discussion_id = self.get_json_meta().get('discussion') + # forward_from = self.get_message_forward() + # + # if discussion_id: # TODO HANDLE FORWARDED MESSAGES FROM EXTERNAL CHANNELS + # chat_forward_id = forward_from['from']['id'] + # message_forward_id = forward_from['from']['channel_post'] + # + # # if chat_forward_id == discussion_id: + # # linked_chat = Chat(chat_forward_id, self.get_chat_instance_uuid()) + # # if linked_chat.exists(): + # # # create thread + # # # add message replies for each childrens + # + # # TODO HANDLE THREAD + # # TODO Change FORWARD META FIELDS + # # meta['forward'] = {} + # # # CHAT ID + # # # SUBCHANNEL ID -> can be None + # # # Message ID + # + # # meta['forward']['origin'] + # # # same as 'forward' + # + # if self.get_json_meta().get('forward'): + # forward = self.get_message_forward() + # f_chat = forward['chat'] + # f_subchannel = forward.get('subchannel') + # f_id = forward.get('id') + # if not f_subchannel: + # chat_forward = Chat(f_chat, self.get_chat_instance_uuid()) + # if chat_forward.exists(): + # for chat_obj in chat_objs: + # if chat_obj.type == 'chat': + # chat_forward.add_relationship(chat_obj.get_global_id(), 'forward') + # # TODO LIST FORWARDED MESSAGES + # + # + # # Discord -> serverID + subchannel ID + message ID + # # Telegram -> chat ID + Message ID + # # + ORIGIN IDs + # + # + # + # # TODO create relationships graph + # + # + # # TODO REMOVE ME + # # Message forward # TODO handle subchannel + message ID # if self.get_json_meta().get('forward'): # forward_from = self.get_message_forward() # print('-----------------------------------------------------------') diff --git a/bin/lib/Language.py b/bin/lib/Language.py index f6c9ebfb..d7b8c1c8 100755 --- a/bin/lib/Language.py +++ b/bin/lib/Language.py @@ -265,7 +265,10 @@ def _get_html2text(content, ignore_links=False): h = html2text.HTML2Text() h.ignore_links = ignore_links h.ignore_images = ignore_links - return h.handle(content) + content = h.handle(content) + if content == '\n\n': + content = '' + return content def _clean_text_to_translate(content, html=False, keys_blocks=True): if html: @@ -482,14 +485,23 @@ class LanguagesDetector: return languages def detect(self, content, force_gcld3=False): # TODO detect length between 20-200 ???? + if not content: + return None content = _clean_text_to_translate(content, html=True) - # print('cleaned content', content) - # gcld3 - if len(content) < 100: + if not content: + return None + # DEBUG + # print('-------------------------------------------------------') + # print(content) + # print(len(content)) + # lexilang + if len(content) < 150: + # print('lexilang') languages = self.detect_lexilang(content) + # gcld3 else: # if len(content) >= 200 or not self.lt or force_gcld3: - # print('gcld3') + # print('gcld3') languages = self.detect_gcld3(content) # libretranslate # else: diff --git a/bin/lib/chats_viewer.py b/bin/lib/chats_viewer.py index e4b0d82c..6d660c9e 100755 --- a/bin/lib/chats_viewer.py +++ b/bin/lib/chats_viewer.py @@ -323,7 +323,6 @@ def get_username_meta_from_global_id(username_global_id): username = Usernames.Username(username_id, instance_uuid) return username.get_meta() - # TODO Filter ## Instance type ## Chats IDS @@ -380,6 +379,22 @@ def get_nb_messages_iterator(filters={}): nb_messages += chat.get_nb_messages() return nb_messages + +#### FIX #### + +def fix_correlations_subchannel_message(): + for instance_uuid in get_chat_service_instances(): + for chat_id in ChatServiceInstance(instance_uuid).get_chats(): + chat = Chats.Chat(chat_id, instance_uuid) + # subchannels + for subchannel_gid in chat.get_subchannels(): + _, _, subchannel_id = subchannel_gid.split(':', 2) + subchannel = ChatSubChannels.ChatSubChannel(subchannel_id, instance_uuid) + messages, _ = subchannel._get_messages(nb=-1) + for mess in messages: + _, _, message_id = mess[0].split(':', ) + subchannel.add_correlation('message', '', message_id) + #### API #### def api_get_chat_service_instance(chat_instance_uuid): @@ -392,6 +407,7 @@ def api_get_chat(chat_id, chat_instance_uuid, translation_target=None, nb=-1, pa chat = Chats.Chat(chat_id, chat_instance_uuid) if not chat.exists(): return {"status": "error", "reason": "Unknown chat"}, 404 + # print(chat.get_obj_language_stats()) meta = chat.get_meta({'created_at', 'icon', 'info', 'nb_participants', 'subchannels', 'threads', 'translation', 'username'}, translation_target=translation_target) if meta['username']: meta['username'] = get_username_meta_from_global_id(meta['username']) @@ -437,6 +453,7 @@ def api_get_subchannel(chat_id, chat_instance_uuid, translation_target=None, nb= subchannel = ChatSubChannels.ChatSubChannel(chat_id, chat_instance_uuid) if not subchannel.exists(): return {"status": "error", "reason": "Unknown subchannel"}, 404 + # print(subchannel.get_obj_language_stats()) meta = subchannel.get_meta({'chat', 'created_at', 'icon', 'nb_messages', 'nb_participants', 'threads', 'translation'}, translation_target=translation_target) if meta['chat']: meta['chat'] = get_chat_meta_from_global_id(meta['chat']) diff --git a/bin/lib/objects/Messages.py b/bin/lib/objects/Messages.py index 6d57d2cf..fbcad8f7 100755 --- a/bin/lib/objects/Messages.py +++ b/bin/lib/objects/Messages.py @@ -113,7 +113,7 @@ class Message(AbstractObject): def get_subchannel(self): subchannel = self.get_correlation('chat-subchannel') if subchannel.get('chat-subchannel'): - return f'user-account:{subchannel["chat-subchannel"].pop()}' + return f'chat-subchannel:{subchannel["chat-subchannel"].pop()}' def get_thread(self): for child in self.get_childrens(): diff --git a/bin/modules/Languages.py b/bin/modules/Languages.py index bff7b0ba..28fbdff6 100755 --- a/bin/modules/Languages.py +++ b/bin/modules/Languages.py @@ -33,6 +33,10 @@ class Languages(AbstractModule): for lang in obj.get_languages(min_probability=0.8, force_gcld3=True): print(lang) domain.add_language(lang) + # Detect Chat Message Language + # elif obj.type == 'message': + # lang = obj.detect_language() + # print(self.obj.id, lang) if __name__ == '__main__': diff --git a/tools/reprocess_objects.py b/tools/reprocess_objects.py index 6d0ffd16..678cf989 100755 --- a/tools/reprocess_objects.py +++ b/tools/reprocess_objects.py @@ -20,17 +20,39 @@ from lib.ail_core import is_object_type from lib import ail_queues from lib.objects import ail_objects -def reprocess_message_objects(object_type): - queue = ail_queues.AILQueue('FeederModuleImporter', -1) - for obj in ail_objects.obj_iterator(object_type, filters={}): - queue.send_message(obj.get_global_id(), message='reprocess') - queue.end() +# from modules.ApiKey import ApiKey +# from modules.Categ import Categ +# from modules.CreditCards import CreditCards +# from modules.DomClassifier import DomClassifier +# from modules.Global import Global +# from modules.Keys import Keys +# from modules.Onion import Onion +# from modules.Telegram import Telegram + +from modules.Languages import Languages + +MODULES = { + 'Languages': Languages +} + +def reprocess_message_objects(object_type, module_name=None): + if module_name: + module = MODULES[module_name]() + for obj in ail_objects.obj_iterator(object_type, filters={}): + module.obj = obj + module.compute(None) + else: + queue = ail_queues.AILQueue('FeederModuleImporter', -1) + for obj in ail_objects.obj_iterator(object_type, filters={}): + queue.send_message(obj.get_global_id(), message='reprocess') + queue.end() if __name__ == "__main__": parser = argparse.ArgumentParser(description='Reprocess AIL Objects') parser.add_argument('-t', '--type', type=str, help='AIL Object Type', required=True) + parser.add_argument('-m', '--module', type=str, help='AIL Module Name') args = parser.parse_args() if not args.type: @@ -43,4 +65,7 @@ if __name__ == "__main__": if obj_type not in ['item', 'message']: # TODO image raise Exception(f'Currently not supported Object Type: {obj_type}') - reprocess_message_objects(obj_type) \ No newline at end of file + modulename = args.module + if modulename not in MODULES: + raise Exception(f'Currently not supported Module: {modulename}') + reprocess_message_objects(obj_type, module_name=modulename) diff --git a/update/v5.4/Update.py b/update/v5.4/Update.py index a10e4dc9..62e04cc6 100755 --- a/update/v5.4/Update.py +++ b/update/v5.4/Update.py @@ -10,6 +10,7 @@ sys.path.append(os.environ['AIL_HOME']) ################################## from update.bin.ail_updater import AIL_Updater from lib import ail_updates +from lib import chats_viewer class Updater(AIL_Updater): """default Updater.""" @@ -19,6 +20,7 @@ class Updater(AIL_Updater): if __name__ == '__main__': + chats_viewer.fix_correlations_subchannel_message() updater = Updater('v5.4') updater.run_update() From de43f350b25e69acb1d2365c8e2dae7a82935c08 Mon Sep 17 00:00:00 2001 From: terrtia Date: Mon, 25 Mar 2024 16:55:20 +0100 Subject: [PATCH 4/4] chg: [language] add thread languages stats --- bin/lib/Language.py | 2 +- bin/lib/chats_viewer.py | 1 + bin/lib/objects/Messages.py | 12 +++++++++--- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/bin/lib/Language.py b/bin/lib/Language.py index d7b8c1c8..03293bed 100755 --- a/bin/lib/Language.py +++ b/bin/lib/Language.py @@ -405,7 +405,7 @@ def _get_obj_translation(obj_global_id, language, source=None, content=None, fie translation = r_get_obj_translation(obj_global_id, language, field=field) if not translation: source, translation = LanguageTranslator().translate(content, source=source, target=language) - if source and translation: + if source: obj_type, subtype, obj_id = obj_global_id.split(':', 2) add_obj_language(source, obj_type, subtype, obj_id, objs_containers=objs_containers) if translation: diff --git a/bin/lib/chats_viewer.py b/bin/lib/chats_viewer.py index 6d660c9e..1f4a0084 100755 --- a/bin/lib/chats_viewer.py +++ b/bin/lib/chats_viewer.py @@ -468,6 +468,7 @@ def api_get_thread(thread_id, thread_instance_uuid, translation_target=None, nb= thread = ChatThreads.ChatThread(thread_id, thread_instance_uuid) if not thread.exists(): return {"status": "error", "reason": "Unknown thread"}, 404 + # print(thread.get_obj_language_stats()) meta = thread.get_meta({'chat', 'nb_messages', 'nb_participants'}) # if meta['chat']: # meta['chat'] = get_chat_meta_from_global_id(meta['chat']) diff --git a/bin/lib/objects/Messages.py b/bin/lib/objects/Messages.py index fbcad8f7..3340de2a 100755 --- a/bin/lib/objects/Messages.py +++ b/bin/lib/objects/Messages.py @@ -115,6 +115,12 @@ class Message(AbstractObject): if subchannel.get('chat-subchannel'): return f'chat-subchannel:{subchannel["chat-subchannel"].pop()}' + def get_current_thread(self): + subchannel = self.get_correlation('chat-thread') + if subchannel.get('chat-thread'): + return f'chat-thread:{subchannel["chat-thread"].pop()}' + + # children thread def get_thread(self): for child in self.get_childrens(): obj_type, obj_subtype, obj_id = child.split(':', 2) @@ -319,9 +325,9 @@ class Message(AbstractObject): subchannel = self.get_subchannel() if subchannel: objs_containers.add(subchannel) - # thread = self.get_thread() # TODO Get current thread - # if thread: - # objs_containers.add(thread) + thread = self.get_current_thread() + if thread: + objs_containers.add(thread) return objs_containers #- Language -#