From aa56e716314a8b78a28489ed63adbdcab0e913cc Mon Sep 17 00:00:00 2001 From: terrtia Date: Mon, 5 Feb 2024 14:10:19 +0100 Subject: [PATCH] fix: [language] crawled items, force gcld3 detection --- bin/lib/Language.py | 4 ++-- bin/lib/objects/Items.py | 4 ++-- bin/modules/Languages.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/bin/lib/Language.py b/bin/lib/Language.py index 1fee96f7..041b0169 100755 --- a/bin/lib/Language.py +++ b/bin/lib/Language.py @@ -357,9 +357,9 @@ class LanguagesDetector: languages.append(language) return languages - def detect(self, content): + def detect(self, content, force_gcld3=False): # gcld3 - if len(content) >= 200 or not self.lt: + if len(content) >= 200 or not self.lt or force_gcld3: language = self.detect_gcld3(content) # libretranslate else: diff --git a/bin/lib/objects/Items.py b/bin/lib/objects/Items.py index 7b79749a..8204017d 100755 --- a/bin/lib/objects/Items.py +++ b/bin/lib/objects/Items.py @@ -339,9 +339,9 @@ class Item(AbstractObject): return {'nb': nb_line, 'max_length': max_length} # TODO RENAME ME - def get_languages(self, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7): + def get_languages(self, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7, force_gcld3=False): ld = LanguagesDetector(nb_langs=num_langs, min_proportion=min_proportion, min_probability=min_probability, min_len=min_len) - return ld.detect(self.get_content()) + return ld.detect(self.get_content(), force_gcld3=force_gcld3) def get_mimetype(self, content=None): if not content: diff --git a/bin/modules/Languages.py b/bin/modules/Languages.py index e1ce560a..bff7b0ba 100755 --- a/bin/modules/Languages.py +++ b/bin/modules/Languages.py @@ -30,7 +30,7 @@ class Languages(AbstractModule): if obj.type == 'item': if obj.is_crawled(): domain = Domain(obj.get_domain()) - for lang in obj.get_languages(min_probability=0.8): + for lang in obj.get_languages(min_probability=0.8, force_gcld3=True): print(lang) domain.add_language(lang)