mirror of https://github.com/CIRCL/AIL-framework
chg: [Language] replace pycld3 by gcld3 + clean text before language detection
parent
d6d67f6a4c
commit
1c46bb4296
|
@ -2,9 +2,11 @@
|
||||||
# -*-coding:UTF-8 -*
|
# -*-coding:UTF-8 -*
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
import html2text
|
||||||
|
|
||||||
import cld3
|
import gcld3
|
||||||
from libretranslatepy import LibreTranslateAPI
|
from libretranslatepy import LibreTranslateAPI
|
||||||
|
|
||||||
sys.path.append(os.environ['AIL_BIN'])
|
sys.path.append(os.environ['AIL_BIN'])
|
||||||
|
@ -259,6 +261,91 @@ class LanguageDetector:
|
||||||
def get_translator_instance():
|
def get_translator_instance():
|
||||||
return TRANSLATOR_URL
|
return TRANSLATOR_URL
|
||||||
|
|
||||||
|
def _get_html2text(content, ignore_links=False):
    """Run ``content`` through ``html2text`` and return the converted text.

    :param content: markup to convert.
    :param ignore_links: value applied to both the ``ignore_links`` and the
        ``ignore_images`` options of the converter.
    :return: the output of ``HTML2Text.handle``.
    """
    converter = html2text.HTML2Text()
    # A single flag drives both options: links and images are kept or
    # dropped together.
    converter.ignore_images = converter.ignore_links = ignore_links
    return converter.handle(content)
|
||||||
|
|
||||||
|
def _clean_text_to_translate(content, html=False, keys_blocks=True):
|
||||||
|
if html:
|
||||||
|
content = _get_html2text(content, ignore_links=True)
|
||||||
|
|
||||||
|
# REMOVE URLS
|
||||||
|
regex = r'\b(?:http://|https://)?(?:[a-zA-Z\d-]{,63}(?:\.[a-zA-Z\d-]{,63})+)(?:\:[0-9]+)*(?:/(?:$|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*\b'
|
||||||
|
url_regex = re.compile(regex)
|
||||||
|
urls = url_regex.findall(content)
|
||||||
|
urls = sorted(urls, key=len, reverse=True)
|
||||||
|
for url in urls:
|
||||||
|
content = content.replace(url, '')
|
||||||
|
|
||||||
|
# REMOVE PGP Blocks
|
||||||
|
if keys_blocks:
|
||||||
|
regex_pgp_public_blocs = r'-----BEGIN PGP PUBLIC KEY BLOCK-----[\s\S]+?-----END PGP PUBLIC KEY BLOCK-----'
|
||||||
|
regex_pgp_signature = r'-----BEGIN PGP SIGNATURE-----[\s\S]+?-----END PGP SIGNATURE-----'
|
||||||
|
regex_pgp_message = r'-----BEGIN PGP MESSAGE-----[\s\S]+?-----END PGP MESSAGE-----'
|
||||||
|
re.compile(regex_pgp_public_blocs)
|
||||||
|
re.compile(regex_pgp_signature)
|
||||||
|
re.compile(regex_pgp_message)
|
||||||
|
res = re.findall(regex_pgp_public_blocs, content)
|
||||||
|
for it in res:
|
||||||
|
content = content.replace(it, '')
|
||||||
|
res = re.findall(regex_pgp_signature, content)
|
||||||
|
for it in res:
|
||||||
|
content = content.replace(it, '')
|
||||||
|
res = re.findall(regex_pgp_message, content)
|
||||||
|
for it in res:
|
||||||
|
content = content.replace(it, '')
|
||||||
|
return content
|
||||||
|
|
||||||
|
|
||||||
|
class LanguagesDetector:
    """Detect the languages of a text with gcld3 or LibreTranslate.

    ``detect`` uses gcld3 for contents of 200 characters or more, or when
    LibreTranslate is not usable; shorter contents go to LibreTranslate.
    Every detection method returns a list of language codes.
    """

    def __init__(self, nb_langs=3, min_proportion=0.2, min_probability=0.7, min_len=0):
        """
        :param nb_langs: ``num_langs`` passed to gcld3 ``FindTopNMostFreqLangs``.
        :param min_proportion: minimum ``proportion`` of a gcld3 result.
        :param min_probability: minimum gcld3 ``probability`` and minimum
            LibreTranslate ``confidence``.
        :param min_len: when > 0, cleaned contents shorter than this are not
            submitted to gcld3.
        """
        self.lt = LibreTranslateAPI(get_translator_instance())
        try:
            # Probe call: if it fails, the LibreTranslate backend is disabled.
            self.lt.languages()
        except Exception:
            self.lt = None
        self.detector = gcld3.NNetLanguageIdentifier(min_num_bytes=0, max_num_bytes=1000)
        self.nb_langs = nb_langs
        self.min_proportion = min_proportion
        self.min_probability = min_probability
        self.min_len = min_len

    def detect_gcld3(self, content):
        """Return the language codes gcld3 reliably finds in ``content``.

        The content is cleaned with ``_clean_text_to_translate(html=True)``
        first; ``min_len`` is checked against the cleaned text.
        """
        content = _clean_text_to_translate(content, html=True)
        if self.min_len > 0 and len(content) < self.min_len:
            return []
        return [
            lang.language
            for lang in self.detector.FindTopNMostFreqLangs(content, num_langs=self.nb_langs)
            if lang.proportion >= self.min_proportion
            and lang.probability >= self.min_probability
            and lang.is_reliable
        ]

    def detect_libretranslate(self, content):
        """Return the language codes LibreTranslate detects in ``content``.

        Any error raised by the client yields an empty list. ``min_len`` is
        not applied here and the content is not cleaned.
        """
        try:
            # [{"confidence": 0.6, "language": "en"}]
            resp = self.lt.detect(content)
        except Exception:  # TODO ERROR MESSAGE
            return []
        # Anything other than a list of results is ignored.
        if not isinstance(resp, list):
            return []
        languages = []
        for language in resp:
            # Entries are dicts, not objects: index them, and return the
            # language code so the result matches detect_gcld3().
            # NOTE(review): confidence is compared directly to min_probability;
            # confirm both use the same scale.
            if language['confidence'] >= self.min_probability:
                languages.append(language['language'])
        return languages

    def detect(self, content):
        """Detect languages, picking the backend from the content length."""
        # gcld3
        if len(content) >= 200 or not self.lt:
            return self.detect_gcld3(content)
        # libretranslate
        return self.detect_libretranslate(content)
|
||||||
|
|
||||||
class LanguageTranslator:
|
class LanguageTranslator:
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
@ -273,9 +360,15 @@ class LanguageTranslator:
|
||||||
print(e)
|
print(e)
|
||||||
return languages
|
return languages
|
||||||
|
|
||||||
def detect_cld3(self, content):
|
def detect_gcld3(self, content):
|
||||||
for lang in cld3.get_frequent_languages(content, num_langs=1):
|
content = _clean_text_to_translate(content, html=True)
|
||||||
return lang.language
|
detector = gcld3.NNetLanguageIdentifier(min_num_bytes=0, max_num_bytes=1000)
|
||||||
|
lang = detector.FindLanguage(content)
|
||||||
|
# print(lang.language)
|
||||||
|
# print(lang.is_reliable)
|
||||||
|
# print(lang.proportion)
|
||||||
|
# print(lang.probability)
|
||||||
|
return lang.language
|
||||||
|
|
||||||
def detect_libretranslate(self, content):
|
def detect_libretranslate(self, content):
|
||||||
try:
|
try:
|
||||||
|
@ -285,10 +378,10 @@ class LanguageTranslator:
|
||||||
if language:
|
if language:
|
||||||
return language[0].get('language')
|
return language[0].get('language')
|
||||||
|
|
||||||
def detect(self, content): # TODO replace by gcld3
|
def detect(self, content):
|
||||||
# cld3
|
# gcld3
|
||||||
if len(content) >= 200:
|
if len(content) >= 200:
|
||||||
language = self.detect_cld3(content)
|
language = self.detect_gcld3(content)
|
||||||
# libretranslate
|
# libretranslate
|
||||||
else:
|
else:
|
||||||
language = self.detect_libretranslate(content)
|
language = self.detect_libretranslate(content)
|
||||||
|
@ -313,18 +406,22 @@ class LanguageTranslator:
|
||||||
translation = None
|
translation = None
|
||||||
return translation
|
return translation
|
||||||
|
|
||||||
try:
|
|
||||||
LIST_LANGUAGES = LanguageTranslator().languages()
|
|
||||||
except Exception as e:
|
|
||||||
print(e)
|
|
||||||
LIST_LANGUAGES = []
|
|
||||||
|
|
||||||
|
LIST_LANGUAGES = []
|
||||||
def get_translation_languages():
|
def get_translation_languages():
|
||||||
|
global LIST_LANGUAGES
|
||||||
|
if not LIST_LANGUAGES:
|
||||||
|
try:
|
||||||
|
LIST_LANGUAGES = LanguageTranslator().languages()
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
|
LIST_LANGUAGES = []
|
||||||
return LIST_LANGUAGES
|
return LIST_LANGUAGES
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
t_content = ''
|
# t_content = ''
|
||||||
langg = LanguageTranslator()
|
langg = LanguageTranslator()
|
||||||
|
# langg = LanguagesDetector()
|
||||||
# lang.translate(t_content, source='ru')
|
# lang.translate(t_content, source='ru')
|
||||||
langg.languages()
|
langg.languages()
|
||||||
|
|
|
@ -7,7 +7,6 @@ import magic
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import cld3
|
|
||||||
import html2text
|
import html2text
|
||||||
|
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
@ -23,6 +22,7 @@ from lib.ail_core import get_ail_uuid, rreplace
|
||||||
from lib.objects.abstract_object import AbstractObject
|
from lib.objects.abstract_object import AbstractObject
|
||||||
from lib.ConfigLoader import ConfigLoader
|
from lib.ConfigLoader import ConfigLoader
|
||||||
from lib import item_basic
|
from lib import item_basic
|
||||||
|
from lib.Language import LanguagesDetector
|
||||||
from lib.data_retention_engine import update_obj_date, get_obj_date_first
|
from lib.data_retention_engine import update_obj_date, get_obj_date_first
|
||||||
from packages import Date
|
from packages import Date
|
||||||
|
|
||||||
|
@ -338,21 +338,10 @@ class Item(AbstractObject):
|
||||||
nb_line += 1
|
nb_line += 1
|
||||||
return {'nb': nb_line, 'max_length': max_length}
|
return {'nb': nb_line, 'max_length': max_length}
|
||||||
|
|
||||||
|
# TODO RENAME ME
|
||||||
def get_languages(self, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7):
|
def get_languages(self, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7):
|
||||||
all_languages = []
|
ld = LanguagesDetector(nb_langs=num_langs, min_proportion=min_proportion, min_probability=min_probability, min_len=min_len)
|
||||||
## CLEAN CONTENT ##
|
return ld.detect(self.get_content())
|
||||||
content = self.get_html2text_content(ignore_links=True)
|
|
||||||
content = remove_all_urls_from_content(self.id, item_content=content) ##########################################
|
|
||||||
# REMOVE USELESS SPACE
|
|
||||||
content = ' '.join(content.split())
|
|
||||||
#- CLEAN CONTENT -#
|
|
||||||
#print(content)
|
|
||||||
#print(len(content))
|
|
||||||
if len(content) >= min_len: # # TODO: # FIXME: check num langs limit
|
|
||||||
for lang in cld3.get_frequent_languages(content, num_langs=num_langs):
|
|
||||||
if lang.proportion >= min_proportion and lang.probability >= min_probability and lang.is_reliable:
|
|
||||||
all_languages.append(lang)
|
|
||||||
return all_languages
|
|
||||||
|
|
||||||
def get_mimetype(self, content=None):
|
def get_mimetype(self, content=None):
|
||||||
if not content:
|
if not content:
|
||||||
|
@ -677,24 +666,6 @@ def remove_all_urls_from_content(item_id, item_content=None):
|
||||||
|
|
||||||
return item_content
|
return item_content
|
||||||
|
|
||||||
def get_item_languages(item_id, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7):
|
|
||||||
all_languages = []
|
|
||||||
|
|
||||||
## CLEAN CONTENT ##
|
|
||||||
content = get_item_content_html2text(item_id, ignore_links=True)
|
|
||||||
content = remove_all_urls_from_content(item_id, item_content=content)
|
|
||||||
|
|
||||||
# REMOVE USELESS SPACE
|
|
||||||
content = ' '.join(content.split())
|
|
||||||
#- CLEAN CONTENT -#
|
|
||||||
|
|
||||||
#print(content)
|
|
||||||
#print(len(content))
|
|
||||||
if len(content) >= min_len:
|
|
||||||
for lang in cld3.get_frequent_languages(content, num_langs=num_langs):
|
|
||||||
if lang.proportion >= min_proportion and lang.probability >= min_probability and lang.is_reliable:
|
|
||||||
all_languages.append(lang)
|
|
||||||
return all_languages
|
|
||||||
|
|
||||||
# API
|
# API
|
||||||
# def get_item(request_dict):
|
# def get_item(request_dict):
|
||||||
|
@ -945,13 +916,13 @@ def create_item(obj_id, obj_metadata, io_content):
|
||||||
# delete_item(child_id)
|
# delete_item(child_id)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
# if __name__ == '__main__':
|
||||||
# content = 'test file content'
|
# content = 'test file content'
|
||||||
# duplicates = {'tests/2020/01/02/test.gz': [{'algo':'ssdeep', 'similarity':75}, {'algo':'tlsh', 'similarity':45}]}
|
# duplicates = {'tests/2020/01/02/test.gz': [{'algo':'ssdeep', 'similarity':75}, {'algo':'tlsh', 'similarity':45}]}
|
||||||
#
|
#
|
||||||
# item = Item('tests/2020/01/02/test_save.gz')
|
# item = Item('tests/2020/01/02/test_save.gz')
|
||||||
# item.create(content, _save=False)
|
# item.create(content, _save=False)
|
||||||
filters = {'date_from': '20230101', 'date_to': '20230501', 'sources': ['crawled', 'submitted'], 'start': ':submitted/2023/04/28/submitted_2b3dd861-a75d-48e4-8cec-6108d41450da.gz'}
|
# filters = {'date_from': '20230101', 'date_to': '20230501', 'sources': ['crawled', 'submitted'], 'start': ':submitted/2023/04/28/submitted_2b3dd861-a75d-48e4-8cec-6108d41450da.gz'}
|
||||||
gen = get_all_items_objects(filters=filters)
|
# gen = get_all_items_objects(filters=filters)
|
||||||
for obj_id in gen:
|
# for obj_id in gen:
|
||||||
print(obj_id.id)
|
# print(obj_id.id)
|
||||||
|
|
|
@ -4,8 +4,6 @@
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import cld3
|
|
||||||
import html2text
|
|
||||||
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
|
@ -184,14 +182,6 @@ class Message(AbstractObject):
|
||||||
"""
|
"""
|
||||||
return self._set_field('translated', translation) # translation by hash ??? -> avoid translating multiple time
|
return self._set_field('translated', translation) # translation by hash ??? -> avoid translating multiple time
|
||||||
|
|
||||||
def get_html2text_content(self, content=None, ignore_links=False):
|
|
||||||
if not content:
|
|
||||||
content = self.get_content()
|
|
||||||
h = html2text.HTML2Text()
|
|
||||||
h.ignore_links = ignore_links
|
|
||||||
h.ignore_images = ignore_links
|
|
||||||
return h.handle(content)
|
|
||||||
|
|
||||||
# def get_ail_2_ail_payload(self):
|
# def get_ail_2_ail_payload(self):
|
||||||
# payload = {'raw': self.get_gzip_content(b64=True)}
|
# payload = {'raw': self.get_gzip_content(b64=True)}
|
||||||
# return payload
|
# return payload
|
||||||
|
@ -287,48 +277,6 @@ class Message(AbstractObject):
|
||||||
# meta['encoding'] = None
|
# meta['encoding'] = None
|
||||||
return meta
|
return meta
|
||||||
|
|
||||||
def _languages_cleaner(self, content=None):
|
|
||||||
if not content:
|
|
||||||
content = self.get_content()
|
|
||||||
# REMOVE URLS
|
|
||||||
regex = r'\b(?:http://|https://)?(?:[a-zA-Z\d-]{,63}(?:\.[a-zA-Z\d-]{,63})+)(?:\:[0-9]+)*(?:/(?:$|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*\b'
|
|
||||||
url_regex = re.compile(regex)
|
|
||||||
urls = url_regex.findall(content)
|
|
||||||
urls = sorted(urls, key=len, reverse=True)
|
|
||||||
for url in urls:
|
|
||||||
content = content.replace(url, '')
|
|
||||||
# REMOVE PGP Blocks
|
|
||||||
regex_pgp_public_blocs = r'-----BEGIN PGP PUBLIC KEY BLOCK-----[\s\S]+?-----END PGP PUBLIC KEY BLOCK-----'
|
|
||||||
regex_pgp_signature = r'-----BEGIN PGP SIGNATURE-----[\s\S]+?-----END PGP SIGNATURE-----'
|
|
||||||
regex_pgp_message = r'-----BEGIN PGP MESSAGE-----[\s\S]+?-----END PGP MESSAGE-----'
|
|
||||||
re.compile(regex_pgp_public_blocs)
|
|
||||||
re.compile(regex_pgp_signature)
|
|
||||||
re.compile(regex_pgp_message)
|
|
||||||
res = re.findall(regex_pgp_public_blocs, content)
|
|
||||||
for it in res:
|
|
||||||
content = content.replace(it, '')
|
|
||||||
res = re.findall(regex_pgp_signature, content)
|
|
||||||
for it in res:
|
|
||||||
content = content.replace(it, '')
|
|
||||||
res = re.findall(regex_pgp_message, content)
|
|
||||||
for it in res:
|
|
||||||
content = content.replace(it, '')
|
|
||||||
return content
|
|
||||||
|
|
||||||
def detect_languages(self, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7):
|
|
||||||
languages = []
|
|
||||||
## CLEAN CONTENT ##
|
|
||||||
content = self.get_html2text_content(ignore_links=True)
|
|
||||||
content = self._languages_cleaner(content=content)
|
|
||||||
# REMOVE USELESS SPACE
|
|
||||||
content = ' '.join(content.split())
|
|
||||||
# - CLEAN CONTENT - #
|
|
||||||
if len(content) >= min_len:
|
|
||||||
for lang in cld3.get_frequent_languages(content, num_langs=num_langs):
|
|
||||||
if lang.proportion >= min_proportion and lang.probability >= min_probability and lang.is_reliable:
|
|
||||||
languages.append(lang)
|
|
||||||
return languages
|
|
||||||
|
|
||||||
# def translate(self, content=None): # TODO translation plugin
|
# def translate(self, content=None): # TODO translation plugin
|
||||||
# # TODO get text language
|
# # TODO get text language
|
||||||
# if not content:
|
# if not content:
|
||||||
|
|
|
@ -31,7 +31,8 @@ class Languages(AbstractModule):
|
||||||
if obj.is_crawled():
|
if obj.is_crawled():
|
||||||
domain = Domain(obj.get_domain())
|
domain = Domain(obj.get_domain())
|
||||||
for lang in obj.get_languages(min_probability=0.8):
|
for lang in obj.get_languages(min_probability=0.8):
|
||||||
domain.add_language(lang.language)
|
print(lang)
|
||||||
|
domain.add_language(lang)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
|
@ -42,7 +42,7 @@ scrapy>2.0.0
|
||||||
scrapy-splash>=0.7.2
|
scrapy-splash>=0.7.2
|
||||||
|
|
||||||
# Languages
|
# Languages
|
||||||
pycld3>0.20
|
gcld3
|
||||||
libretranslatepy
|
libretranslatepy
|
||||||
|
|
||||||
#Graph
|
#Graph
|
||||||
|
|
Loading…
Reference in New Issue