fix: [language] crawled items, force gcld3 detection

dev
terrtia 2024-02-05 14:10:19 +01:00
parent 99fedf9855
commit aa56e71631
No known key found for this signature in database
GPG Key ID: 1E1B1F50D84613D0
3 changed files with 5 additions and 5 deletions

View File

@ -357,9 +357,9 @@ class LanguagesDetector:
languages.append(language)
return languages
def detect(self, content):
def detect(self, content, force_gcld3=False):
# gcld3
if len(content) >= 200 or not self.lt:
if len(content) >= 200 or not self.lt or force_gcld3:
language = self.detect_gcld3(content)
# libretranslate
else:

View File

@ -339,9 +339,9 @@ class Item(AbstractObject):
return {'nb': nb_line, 'max_length': max_length}
# TODO RENAME ME
def get_languages(self, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7):
def get_languages(self, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7, force_gcld3=False):
ld = LanguagesDetector(nb_langs=num_langs, min_proportion=min_proportion, min_probability=min_probability, min_len=min_len)
return ld.detect(self.get_content())
return ld.detect(self.get_content(), force_gcld3=force_gcld3)
def get_mimetype(self, content=None):
if not content:

View File

@ -30,7 +30,7 @@ class Languages(AbstractModule):
if obj.type == 'item':
if obj.is_crawled():
domain = Domain(obj.get_domain())
for lang in obj.get_languages(min_probability=0.8):
for lang in obj.get_languages(min_probability=0.8, force_gcld3=True):
print(lang)
domain.add_language(lang)