AIL-framework/bin/lib/Language.py

428 lines
11 KiB
Python
Executable File

#!/usr/bin/env python3
# -*-coding:UTF-8 -*
import os
import re
import sys
import html2text
import gcld3
from libretranslatepy import LibreTranslateAPI
sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from lib.ConfigLoader import ConfigLoader
config_loader = ConfigLoader()
TRANSLATOR_URL = config_loader.get_config_str('Translation', 'libretranslate')
config_loader = None
dict_iso_languages = {
'af': 'Afrikaans',
'am': 'Amharic',
'ar': 'Arabic',
'bg': 'Bulgarian',
'bn': 'Bangla',
'bs': 'Bosnian',
'ca': 'Catalan',
'ceb': 'Cebuano',
'co': 'Corsican',
'cs': 'Czech',
'cy': 'Welsh',
'da': 'Danish',
'de': 'German',
'el': 'Greek',
'en': 'English',
'eo': 'Esperanto',
'es': 'Spanish',
'et': 'Estonian',
'eu': 'Basque',
'fa': 'Persian',
'fi': 'Finnish',
'fil': 'Filipino',
'fr': 'French',
'fy': 'Western Frisian',
'ga': 'Irish',
'gd': 'Scottish Gaelic',
'gl': 'Galician',
'gu': 'Gujarati',
'ha': 'Hausa',
'haw': 'Hawaiian',
'hi': 'Hindi',
'hmn': 'Hmong',
'hr': 'Croatian',
'ht': 'Haitian Creole',
'hu': 'Hungarian',
'hy': 'Armenian',
'id': 'Indonesian',
'ig': 'Igbo',
'is': 'Icelandic',
'it': 'Italian',
'iw': 'Hebrew',
'ja': 'Japanese',
'jv': 'Javanese',
'ka': 'Georgian',
'kk': 'Kazakh',
'km': 'Khmer',
'kn': 'Kannada',
'ko': 'Korean',
'ku': 'Kurdish',
'ky': 'Kyrgyz',
'la': 'Latin',
'lb': 'Luxembourgish',
'lo': 'Lao',
'lt': 'Lithuanian',
'lv': 'Latvian',
'mg': 'Malagasy',
'mi': 'Maori',
'mk': 'Macedonian',
'ml': 'Malayalam',
'mn': 'Mongolian',
'mr': 'Marathi',
'ms': 'Malay',
'mt': 'Maltese',
'my': 'Burmese',
'ne': 'Nepali',
'nl': 'Dutch',
'no': 'Norwegian',
'ny': 'Nyanja',
'pa': 'Punjabi',
'pl': 'Polish',
'ps': 'Pashto',
'pt': 'Portuguese',
'ro': 'Romanian',
'ru': 'Russian',
'sd': 'Sindhi',
'si': 'Sinhala',
'sk': 'Slovak',
'sl': 'Slovenian',
'sm': 'Samoan',
'sn': 'Shona',
'so': 'Somali',
'sq': 'Albanian',
'sr': 'Serbian',
'st': 'Southern Sotho',
'su': 'Sundanese',
'sv': 'Swedish',
'sw': 'Swahili',
'ta': 'Tamil',
'te': 'Telugu',
'tg': 'Tajik',
'th': 'Thai',
'tr': 'Turkish',
'uk': 'Ukrainian',
'ur': 'Urdu',
'uz': 'Uzbek',
'vi': 'Vietnamese',
'xh': 'Xhosa',
'yi': 'Yiddish',
'yo': 'Yoruba',
'zh': 'Chinese',
'zu': 'Zulu'
}
dict_languages_iso = {
'Afrikaans': 'af',
'Amharic': 'am',
'Arabic': 'ar',
'Bulgarian': 'bg',
'Bangla': 'bn',
'Bosnian': 'bs',
'Catalan': 'ca',
'Cebuano': 'ceb',
'Corsican': 'co',
'Czech': 'cs',
'Welsh': 'cy',
'Danish': 'da',
'German': 'de',
'Greek': 'el',
'English': 'en',
'Esperanto': 'eo',
'Spanish': 'es',
'Estonian': 'et',
'Basque': 'eu',
'Persian': 'fa',
'Finnish': 'fi',
'Filipino': 'fil',
'French': 'fr',
'Western Frisian': 'fy',
'Irish': 'ga',
'Scottish Gaelic': 'gd',
'Galician': 'gl',
'Gujarati': 'gu',
'Hausa': 'ha',
'Hawaiian': 'haw',
'Hindi': 'hi',
'Hmong': 'hmn',
'Croatian': 'hr',
'Haitian Creole': 'ht',
'Hungarian': 'hu',
'Armenian': 'hy',
'Indonesian': 'id',
'Igbo': 'ig',
'Icelandic': 'is',
'Italian': 'it',
'Hebrew': 'iw',
'Japanese': 'ja',
'Javanese': 'jv',
'Georgian': 'ka',
'Kazakh': 'kk',
'Khmer': 'km',
'Kannada': 'kn',
'Korean': 'ko',
'Kurdish': 'ku',
'Kyrgyz': 'ky',
'Latin': 'la',
'Luxembourgish': 'lb',
'Lao': 'lo',
'Lithuanian': 'lt',
'Latvian': 'lv',
'Malagasy': 'mg',
'Maori': 'mi',
'Macedonian': 'mk',
'Malayalam': 'ml',
'Mongolian': 'mn',
'Marathi': 'mr',
'Malay': 'ms',
'Maltese': 'mt',
'Burmese': 'my',
'Nepali': 'ne',
'Dutch': 'nl',
'Norwegian': 'no',
'Nyanja': 'ny',
'Punjabi': 'pa',
'Polish': 'pl',
'Pashto': 'ps',
'Portuguese': 'pt',
'Romanian': 'ro',
'Russian': 'ru',
'Sindhi': 'sd',
'Sinhala': 'si',
'Slovak': 'sk',
'Slovenian': 'sl',
'Samoan': 'sm',
'Shona': 'sn',
'Somali': 'so',
'Albanian': 'sq',
'Serbian': 'sr',
'Southern Sotho': 'st',
'Sundanese': 'su',
'Swedish': 'sv',
'Swahili': 'sw',
'Tamil': 'ta',
'Telugu': 'te',
'Tajik': 'tg',
'Thai': 'th',
'Turkish': 'tr',
'Ukrainian': 'uk',
'Urdu': 'ur',
'Uzbek': 'uz',
'Vietnamese': 'vi',
'Xhosa': 'xh',
'Yiddish': 'yi',
'Yoruba': 'yo',
'Chinese': 'zh',
'Zulu': 'zu'
}
def get_language_from_iso(iso_language):
return dict_iso_languages.get(iso_language, None)
def get_languages_from_iso(l_iso_languages, sort=False):
l_languages = []
for iso_language in l_iso_languages:
language = get_language_from_iso(iso_language)
if language:
l_languages.append(language)
if sort:
l_languages = sorted(l_languages)
return l_languages
def get_iso_from_language(language):
return dict_languages_iso.get(language, None)
def get_iso_from_languages(l_languages, sort=False):
l_iso = []
for language in l_languages:
iso_lang = get_iso_from_language(language)
if iso_lang:
l_iso.append(iso_lang)
if sort:
l_iso = sorted(l_iso)
return l_iso
class LanguageDetector:
pass
def get_translator_instance():
return TRANSLATOR_URL
def _get_html2text(content, ignore_links=False):
h = html2text.HTML2Text()
h.ignore_links = ignore_links
h.ignore_images = ignore_links
return h.handle(content)
def _clean_text_to_translate(content, html=False, keys_blocks=True):
if html:
content = _get_html2text(content, ignore_links=True)
# REMOVE URLS
regex = r'\b(?:http://|https://)?(?:[a-zA-Z\d-]{,63}(?:\.[a-zA-Z\d-]{,63})+)(?:\:[0-9]+)*(?:/(?:$|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*\b'
url_regex = re.compile(regex)
urls = url_regex.findall(content)
urls = sorted(urls, key=len, reverse=True)
for url in urls:
content = content.replace(url, '')
# REMOVE PGP Blocks
if keys_blocks:
regex_pgp_public_blocs = r'-----BEGIN PGP PUBLIC KEY BLOCK-----[\s\S]+?-----END PGP PUBLIC KEY BLOCK-----'
regex_pgp_signature = r'-----BEGIN PGP SIGNATURE-----[\s\S]+?-----END PGP SIGNATURE-----'
regex_pgp_message = r'-----BEGIN PGP MESSAGE-----[\s\S]+?-----END PGP MESSAGE-----'
re.compile(regex_pgp_public_blocs)
re.compile(regex_pgp_signature)
re.compile(regex_pgp_message)
res = re.findall(regex_pgp_public_blocs, content)
for it in res:
content = content.replace(it, '')
res = re.findall(regex_pgp_signature, content)
for it in res:
content = content.replace(it, '')
res = re.findall(regex_pgp_message, content)
for it in res:
content = content.replace(it, '')
return content
class LanguagesDetector:
def __init__(self, nb_langs=3, min_proportion=0.2, min_probability=0.7, min_len=0):
self.lt = LibreTranslateAPI(get_translator_instance())
try:
self.lt.languages()
except Exception:
self.lt = None
self.detector = gcld3.NNetLanguageIdentifier(min_num_bytes=0, max_num_bytes=1000)
self.nb_langs = nb_langs
self.min_proportion = min_proportion
self.min_probability = min_probability
self.min_len = min_len
def detect_gcld3(self, content):
languages = []
content = _clean_text_to_translate(content, html=True)
if self.min_len > 0:
if len(content) < self.min_len:
return languages
for lang in self.detector.FindTopNMostFreqLangs(content, num_langs=self.nb_langs):
if lang.proportion >= self.min_proportion and lang.probability >= self.min_probability and lang.is_reliable:
languages.append(lang.language)
return languages
def detect_libretranslate(self, content):
languages = []
try:
# [{"confidence": 0.6, "language": "en"}]
resp = self.lt.detect(content)
except: # TODO ERROR MESSAGE
resp = []
if resp:
for language in resp:
if language.confidence >= self.min_probability:
languages.append(language)
return languages
def detect(self, content):
# gcld3
if len(content) >= 200 or not self.lt:
language = self.detect_gcld3(content)
# libretranslate
else:
language = self.detect_libretranslate(content)
return language
class LanguageTranslator:
def __init__(self):
self.lt = LibreTranslateAPI(get_translator_instance())
def languages(self):
languages = []
try:
for dict_lang in self.lt.languages():
languages.append({'iso': dict_lang['code'], 'language': dict_lang['name']})
except Exception as e:
print(e)
return languages
def detect_gcld3(self, content):
content = _clean_text_to_translate(content, html=True)
detector = gcld3.NNetLanguageIdentifier(min_num_bytes=0, max_num_bytes=1000)
lang = detector.FindLanguage(content)
# print(lang.language)
# print(lang.is_reliable)
# print(lang.proportion)
# print(lang.probability)
return lang.language
def detect_libretranslate(self, content):
try:
language = self.lt.detect(content)
except: # TODO ERROR MESSAGE
language = None
if language:
return language[0].get('language')
def detect(self, content):
# gcld3
if len(content) >= 200:
language = self.detect_gcld3(content)
# libretranslate
else:
language = self.detect_libretranslate(content)
return language
def translate(self, content, source=None, target="en"): # TODO source target
translation = None
if content:
if not source:
source = self.detect(content)
# print(source, content)
if source:
if source != target:
try:
# print(content, source, target)
translation = self.lt.translate(content, source, target)
except:
translation = None
# TODO LOG and display error
if translation == content:
print('EQUAL')
translation = None
return translation
LIST_LANGUAGES = []
def get_translation_languages():
global LIST_LANGUAGES
if not LIST_LANGUAGES:
try:
LIST_LANGUAGES = LanguageTranslator().languages()
except Exception as e:
print(e)
LIST_LANGUAGES = []
return LIST_LANGUAGES
if __name__ == '__main__':
# t_content = ''
langg = LanguageTranslator()
# langg = LanguagesDetector()
# lang.translate(t_content, source='ru')
langg.languages()