mirror of https://github.com/CIRCL/AIL-framework
fix: [language] crawled items, force gcld3 detection
parent
99fedf9855
commit
aa56e71631
|
@@ -357,9 +357,9 @@ class LanguagesDetector:
|
||||||
languages.append(language)
|
languages.append(language)
|
||||||
return languages
|
return languages
|
||||||
|
|
||||||
def detect(self, content):
|
def detect(self, content, force_gcld3=False):
|
||||||
# gcld3
|
# gcld3
|
||||||
if len(content) >= 200 or not self.lt:
|
if len(content) >= 200 or not self.lt or force_gcld3:
|
||||||
language = self.detect_gcld3(content)
|
language = self.detect_gcld3(content)
|
||||||
# libretranslate
|
# libretranslate
|
||||||
else:
|
else:
|
||||||
|
|
|
@@ -339,9 +339,9 @@ class Item(AbstractObject):
|
||||||
return {'nb': nb_line, 'max_length': max_length}
|
return {'nb': nb_line, 'max_length': max_length}
|
||||||
|
|
||||||
# TODO RENAME ME
|
# TODO RENAME ME
|
||||||
def get_languages(self, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7):
|
def get_languages(self, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7, force_gcld3=False):
|
||||||
ld = LanguagesDetector(nb_langs=num_langs, min_proportion=min_proportion, min_probability=min_probability, min_len=min_len)
|
ld = LanguagesDetector(nb_langs=num_langs, min_proportion=min_proportion, min_probability=min_probability, min_len=min_len)
|
||||||
return ld.detect(self.get_content())
|
return ld.detect(self.get_content(), force_gcld3=force_gcld3)
|
||||||
|
|
||||||
def get_mimetype(self, content=None):
|
def get_mimetype(self, content=None):
|
||||||
if not content:
|
if not content:
|
||||||
|
|
|
@@ -30,7 +30,7 @@ class Languages(AbstractModule):
|
||||||
if obj.type == 'item':
|
if obj.type == 'item':
|
||||||
if obj.is_crawled():
|
if obj.is_crawled():
|
||||||
domain = Domain(obj.get_domain())
|
domain = Domain(obj.get_domain())
|
||||||
for lang in obj.get_languages(min_probability=0.8):
|
for lang in obj.get_languages(min_probability=0.8, force_gcld3=True):
|
||||||
print(lang)
|
print(lang)
|
||||||
domain.add_language(lang)
|
domain.add_language(lang)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue