From 197ff0222d6380f4f2a7a6f074d9411d7cd3e712 Mon Sep 17 00:00:00 2001 From: terrtia Date: Fri, 8 Mar 2024 15:26:06 +0100 Subject: [PATCH] chg: [lang] improve language detection + UI: manual translation and detection --- bin/lib/Language.py | 82 ++++++++++++++----- bin/lib/chats_viewer.py | 27 ++++-- bin/lib/objects/Messages.py | 15 +++- bin/lib/objects/abstract_chat_object.py | 2 +- bin/lib/objects/abstract_object.py | 12 ++- requirements.txt | 1 + update/v5.4/Update.sh | 1 + var/www/blueprints/chats_explorer.py | 15 +++- .../chats_explorer/block_message.html | 59 ++++++++----- .../chats_explorer/block_translation.html | 1 + 10 files changed, 164 insertions(+), 51 deletions(-) diff --git a/bin/lib/Language.py b/bin/lib/Language.py index 6e77d5ea..5d3d29c9 100755 --- a/bin/lib/Language.py +++ b/bin/lib/Language.py @@ -7,6 +7,7 @@ import sys import html2text import gcld3 +from lexilang.detector import detect as lexilang_detect from libretranslatepy import LibreTranslateAPI sys.path.append(os.environ['AIL_BIN']) @@ -342,18 +343,31 @@ def remove_obj_language(language, obj_type, obj_subtype, obj_id): obj_global_id = f'{obj_type}:{obj_subtype}:{obj_id}' r_lang.srem(f'obj:lang:{obj_global_id}', language) + delete_obj_translation(obj_global_id, language) + r_lang.srem(f'langs:{obj_type}:{obj_subtype}:{language}', obj_global_id) if not r_lang.exists(f'langs:{obj_type}:{obj_subtype}:{language}'): r_lang.srem(f'objs:lang:{obj_type}:{obj_subtype}', language) r_lang.srem(f'languages:{language}', f'{obj_type}:{obj_subtype}') if not r_lang.exists(f'objs:lang:{obj_type}:{obj_subtype}'): - if r_lang.scard(f'objs:langs:{obj_type}', language) <= 1: + if r_lang.scard(f'objs:langs:{obj_type}') <= 1: r_lang.srem(f'objs:langs:{obj_type}', language) -def edit_obj_language(language, obj_type, obj_subtype, obj_id): - remove_obj_language(language, obj_type, obj_subtype, obj_id) - add_obj_language(language, obj_type, obj_subtype, obj_id) - +# TODO handle fields +def detect_obj_language(obj_type, obj_subtype, obj_id, content): + detector = LanguagesDetector(nb_langs=1) + language = detector.detect(content) + if language: + language = language[0] + previous_lang = get_obj_languages(obj_type, obj_subtype, obj_id) + if previous_lang: + previous_lang = previous_lang[0] + if language != previous_lang: + remove_obj_language(language, obj_type, obj_subtype, obj_id) + add_obj_language(language, obj_type, obj_subtype, obj_id) + else: + add_obj_language(language, obj_type, obj_subtype, obj_id) + return language ## Translation def _get_obj_translation(obj_global_id, language, field=''): @@ -364,6 +378,7 @@ def get_obj_translation(obj_global_id, language, source=None, content=None, fiel Returns translated content """ translation = r_cache.get(f'translation:{language}:{obj_global_id}:{field}') + r_cache.expire(f'translation:{language}:{obj_global_id}:{field}', 0) if translation: # DEBUG # print('cache') @@ -372,7 +387,10 @@ def get_obj_translation(obj_global_id, language, source=None, content=None, fiel # TODO HANDLE FIELDS TRANSLATION translation = _get_obj_translation(obj_global_id, language, field=field) if not translation: - translation = LanguageTranslator().translate(content, source=source, target=language) + source, translation = LanguageTranslator().translate(content, source=source, target=language) + if source and translation: + obj_type, subtype, obj_id = obj_global_id.split(':', 2) + add_obj_language(source, obj_type, subtype, obj_id) if translation: r_cache.set(f'translation:{language}:{obj_global_id}:{field}', translation) r_cache.expire(f'translation:{language}:{obj_global_id}:{field}', 300) @@ -380,10 +398,14 @@ def get_obj_translation(obj_global_id, language, source=None, content=None, fiel # TODO Force to edit ???? + def set_obj_translation(obj_global_id, language, translation, field=''): r_cache.delete(f'translation:{language}:{obj_global_id}:') return r_lang.hset(f'tr:{obj_global_id}:{field}', language, translation) +def delete_obj_translation(obj_global_id, language, field=''): + r_cache.delete(f'translation:{language}:{obj_global_id}:') + r_lang.hdel(f'tr:{obj_global_id}:{field}', language) ## --LANGUAGE ENGINE-- ## @@ -410,11 +432,22 @@ class LanguagesDetector: if self.min_len > 0: if len(content) < self.min_len: return languages + # p = self.detector.FindTopNMostFreqLangs(content, num_langs=3) + # for lang in p: + # print(lang.language, lang.probability, lang.proportion, lang.is_reliable) + # print('------------------------------------------------') for lang in self.detector.FindTopNMostFreqLangs(content, num_langs=self.nb_langs): if lang.proportion >= self.min_proportion and lang.probability >= self.min_probability and lang.is_reliable: languages.append(lang.language) return languages + def detect_lexilang(self, content): # TODO clean text ??? - TODO REMOVE SEPARATOR + language, prob = lexilang_detect(content) + if prob > 0: + return [language] + else: + return [] + def detect_libretranslate(self, content): languages = [] try: @@ -431,19 +464,26 @@ class LanguagesDetector: languages.append(language) return languages - def detect(self, content, force_gcld3=False): + def detect(self, content, force_gcld3=False): # TODO detect length between 20-200 ???? + content = _clean_text_to_translate(content, html=True) + # print('cleaned content', content) # gcld3 - if len(content) >= 200 or not self.lt or force_gcld3: - language = self.detect_gcld3(content) - # libretranslate + if len(content) < 100: + languages = self.detect_lexilang(content) else: - language = self.detect_libretranslate(content) - return language + # if len(content) >= 200 or not self.lt or force_gcld3: + # print('gcld3') + languages = self.detect_gcld3(content) + # libretranslate + # else: + # languages = self.detect_libretranslate(content) + return languages class LanguageTranslator: def __init__(self): self.lt = LibreTranslateAPI(get_translator_instance()) + self.ld = LanguagesDetector(nb_langs=1) def languages(self): languages = [] @@ -473,13 +513,13 @@ class LanguageTranslator: return language[0].get('language') def detect(self, content): - # gcld3 - if len(content) >= 200: - language = self.detect_gcld3(content) - # libretranslate - else: - language = self.detect_libretranslate(content) - return language + # print('++++++++++++++++++++++++++++++++++++++++++++++++++++++') + # print(content) + language = self.ld.detect(content) + if language: + # print(language[0]) + # print('##############################################################') + return language[0] def translate(self, content, source=None, target="en"): # TODO source target if target not in get_translation_languages(): @@ -498,9 +538,9 @@ class LanguageTranslator: translation = None # TODO LOG and display error if translation == content: - print('EQUAL') + # print('EQUAL') translation = None - return translation + return source, translation LIST_LANGUAGES = {} diff --git a/bin/lib/chats_viewer.py b/bin/lib/chats_viewer.py index 0ec42772..b8d51975 100755 --- a/bin/lib/chats_viewer.py +++ b/bin/lib/chats_viewer.py @@ -404,18 +404,33 @@ def api_get_message(message_id, translation_target=None): message = Messages.Message(message_id) if not message.exists(): return {"status": "error", "reason": "Unknown uuid"}, 404 - meta = message.get_meta({'chat', 'content', 'files-names', 'icon', 'images', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'translation', 'user-account'}, translation_target=translation_target) + meta = message.get_meta({'chat', 'content', 'files-names', 'icon', 'images', 'language', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'translation', 'user-account'}, translation_target=translation_target) return meta, 200 -def api_manually_translate_message(message_id, translation_target, translation): +def api_message_detect_language(message_id): + message = Messages.Message(message_id) + if not message.exists(): + return {"status": "error", "reason": "Unknown uuid"}, 404 + lang = message.detect_language() + return {"language": lang}, 200 + +def api_manually_translate_message(message_id, source, translation_target, translation): message = Messages.Message(message_id) if not message.exists(): return {"status": "error", "reason": "Unknown uuid"}, 404 - if len(translation) > 200000: # TODO REVIEW LIMIT - return {"status": "error", "reason": "Max Size reached"}, 400 - if translation_target not in Language.get_translation_languages(): - return {"status": "error", "reason": "Unknown Language"}, 400 if translation: + if len(translation) > 200000: # TODO REVIEW LIMIT + return {"status": "error", "reason": "Max Size reached"}, 400 + all_languages = Language.get_translation_languages() + if source not in all_languages: + print(source) + return {"status": "error", "reason": "Unknown source Language"}, 400 + message_language = message.get_language() + if message_language != source: + message.edit_language(message_language, source) + if translation: + if translation_target not in all_languages: + return {"status": "error", "reason": "Unknown target Language"}, 400 message.set_translation(translation_target, translation) # TODO SANITYZE translation return None, 200 diff --git a/bin/lib/objects/Messages.py b/bin/lib/objects/Messages.py index 56281e69..395ec680 100755 --- a/bin/lib/objects/Messages.py +++ b/bin/lib/objects/Messages.py @@ -175,6 +175,13 @@ class Message(AbstractObject): # message media # flag is deleted -> event or missing from feeder pass ??? + def get_language(self): + languages = self.get_languages() + if languages: + return languages.pop() + else: + return None + def get_translation(self, content=None, source=None, target='fr'): """ Returns translated content @@ -289,8 +296,14 @@ class Message(AbstractObject): meta['files-names'] = self.get_files_names() if 'reactions' in options: meta['reactions'] = self.get_reactions() + if 'language' in options: + meta['language'] = self.get_language() if 'translation' in options and translation_target: - meta['translation'] = self.translate(content=meta.get('content'), target=translation_target) + if meta.get('language'): + source = meta['language'] + else: + source = None + meta['translation'] = self.translate(content=meta.get('content'), source=source, target=translation_target) # meta['encoding'] = None return meta diff --git a/bin/lib/objects/abstract_chat_object.py b/bin/lib/objects/abstract_chat_object.py index 9674d735..ef90b3db 100755 --- a/bin/lib/objects/abstract_chat_object.py +++ b/bin/lib/objects/abstract_chat_object.py @@ -226,7 +226,7 @@ class AbstractChatObject(AbstractSubtypeObject, ABC): def get_message_meta(self, message, timestamp=None, translation_target='', options=None): # TODO handle file message message = Messages.Message(message[9:]) if not options: - options = {'content', 'files-names', 'images', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'translation', 'user-account'} + options = {'content', 'files-names', 'images', 'language', 'link', 'parent', 'parent_meta', 'reactions', 'thread', 'translation', 'user-account'} meta = message.get_meta(options=options, timestamp=timestamp, translation_target=translation_target) return meta diff --git a/bin/lib/objects/abstract_object.py b/bin/lib/objects/abstract_object.py index 551ad155..40a114ef 100755 --- a/bin/lib/objects/abstract_object.py +++ b/bin/lib/objects/abstract_object.py @@ -25,7 +25,7 @@ from lib import Duplicate from lib.correlations_engine import get_nb_correlations, get_correlations, add_obj_correlation, delete_obj_correlation, delete_obj_correlations, exists_obj_correlation, is_obj_correlated, get_nb_correlation_by_correl_type, get_obj_inter_correlation from lib.Investigations import is_object_investigated, get_obj_investigations, delete_obj_investigations from lib.relationships_engine import get_obj_nb_relationships, add_obj_relationship -from lib.Language import get_obj_languages, add_obj_language, remove_obj_language, get_obj_translation, set_obj_translation +from lib.Language import get_obj_languages, add_obj_language, remove_obj_language, detect_obj_language, get_obj_translation, set_obj_translation, delete_obj_translation from lib.Tracker import is_obj_tracked, get_obj_trackers, delete_obj_trackers logging.config.dictConfig(ail_logger.get_config(name='ail')) @@ -313,12 +313,22 @@ class AbstractObject(ABC): def remove_language(self, language): return remove_obj_language(language, self.type, self.get_subtype(r_str=True), self.id) + def edit_language(self, old_language, new_language): + self.remove_language(old_language) + self.add_language(new_language) + + def detect_language(self, field=''): + return detect_obj_language(self.type, self.get_subtype(r_str=True), self.id, self.get_content()) + def get_translation(self, language, field=''): return get_obj_translation(self.get_global_id(), language, field=field) def set_translation(self, language, translation, field=''): return set_obj_translation(self.get_global_id(), language, translation, field=field) + def delete_translation(self, language, field=''): + return delete_obj_translation(self.get_global_id(), language, field=field) + def translate(self, content=None, field='', source=None, target='en'): global_id = self.get_global_id() if not content: diff --git a/requirements.txt b/requirements.txt index 6fc9e511..bf236ba4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -44,6 +44,7 @@ scrapy-splash>=0.7.2 # Languages gcld3 libretranslatepy +lexilang #Graph numpy>1.18.1 diff --git a/update/v5.4/Update.sh b/update/v5.4/Update.sh index 25e98c30..87f2ab7f 100755 --- a/update/v5.4/Update.sh +++ b/update/v5.4/Update.sh @@ -24,6 +24,7 @@ echo "" echo -e $GREEN"Updating python packages ..."$DEFAULT echo "" pip install -U pylacus +pip install -U lexilang bash ${AIL_BIN}/LAUNCH.sh -lrv diff --git a/var/www/blueprints/chats_explorer.py b/var/www/blueprints/chats_explorer.py index e24abe82..24c1bd84 100644 --- a/var/www/blueprints/chats_explorer.py +++ b/var/www/blueprints/chats_explorer.py @@ -240,11 +240,24 @@ def objects_message(): @login_read_only def objects_message_translate(): message_id = request.form.get('id') + source = request.form.get('language_target') target = request.form.get('target') translation = request.form.get('translation') if target == "Don't Translate": target = None - resp = chats_viewer.api_manually_translate_message(message_id, target, translation) + resp = chats_viewer.api_manually_translate_message(message_id, source, target, translation) + if resp[1] != 200: + return create_json_response(resp[0], resp[1]) + else: + return redirect(url_for('chats_explorer.objects_message', id=message_id, target=target)) + +@chats_explorer.route("/objects/message/detect/language", methods=['GET']) +@login_required +@login_read_only +def objects_message_detect_language(): + message_id = request.args.get('id') + target = request.args.get('target') + resp = chats_viewer.api_message_detect_language(message_id) if resp[1] != 200: return create_json_response(resp[0], resp[1]) else: diff --git a/var/www/templates/chats_explorer/block_message.html b/var/www/templates/chats_explorer/block_message.html index 059e973d..1a52dc67 100644 --- a/var/www/templates/chats_explorer/block_message.html +++ b/var/www/templates/chats_explorer/block_message.html @@ -81,24 +81,6 @@
{{ message['translation'] }}
- {% set mess_id_escape= message['id'] | replace("/", "_") %} - -
-
-
- - - {{translation_target}}: - - -
-
-
- {% endif %} {% for reaction in message['reactions'] %} {{ reaction }} {{ message['reactions'][reaction] }} @@ -113,10 +95,47 @@ {{ tag }} {% endfor %}
+ + {% set mess_id_escape= message['id'] | replace("/", "_") %} + +
+
+
+ + Source: + + + + {% if translation_target %} + +     Target:{{translation_target}} + + + {% else %} + + {% endif %} +
+
+ + Detect Language + +
+
+
+
- - diff --git a/var/www/templates/chats_explorer/block_translation.html b/var/www/templates/chats_explorer/block_translation.html index 6c027921..380a575b 100644 --- a/var/www/templates/chats_explorer/block_translation.html +++ b/var/www/templates/chats_explorer/block_translation.html @@ -10,6 +10,7 @@ {% else %} + {% endif %} {% for language in translation_languages %}