mirror of https://github.com/CIRCL/AIL-framework
fix: [cld3] enable cld3
parent
4d39b2c813
commit
a32928643b
|
@@ -273,20 +273,20 @@ def remove_all_urls_from_content(item_id, item_content=None):
|
||||||
def get_item_languages(item_id, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7):
|
def get_item_languages(item_id, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7):
|
||||||
all_languages = []
|
all_languages = []
|
||||||
|
|
||||||
# ## CLEAN CONTENT ##
|
## CLEAN CONTENT ##
|
||||||
# content = get_item_content_html2text(item_id, ignore_links=True)
|
content = get_item_content_html2text(item_id, ignore_links=True)
|
||||||
# content = remove_all_urls_from_content(item_id, item_content=content)
|
content = remove_all_urls_from_content(item_id, item_content=content)
|
||||||
#
|
|
||||||
# # REMOVE USELESS SPACE
|
# REMOVE USELESS SPACE
|
||||||
# content = ' '.join(content.split())
|
content = ' '.join(content.split())
|
||||||
# #- CLEAN CONTENT -#
|
#- CLEAN CONTENT -#
|
||||||
#
|
|
||||||
# #print(content)
|
#print(content)
|
||||||
# #print(len(content))
|
#print(len(content))
|
||||||
# if len(content) >= min_len:
|
if len(content) >= min_len:
|
||||||
# for lang in cld3.get_frequent_languages(content, num_langs=num_langs):
|
for lang in cld3.get_frequent_languages(content, num_langs=num_langs):
|
||||||
# if lang.proportion >= min_proportion and lang.probability >= min_probability and lang.is_reliable:
|
if lang.proportion >= min_proportion and lang.probability >= min_probability and lang.is_reliable:
|
||||||
# all_languages.append(lang)
|
all_languages.append(lang)
|
||||||
return all_languages
|
return all_languages
|
||||||
|
|
||||||
# API
|
# API
|
||||||
|
|
|
@@ -148,20 +148,20 @@ def remove_all_urls_from_content(item_id, item_content=None):
|
||||||
def get_item_languages(item_id, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7):
|
def get_item_languages(item_id, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7):
|
||||||
all_languages = []
|
all_languages = []
|
||||||
|
|
||||||
# ## CLEAN CONTENT ##
|
## CLEAN CONTENT ##
|
||||||
# content = get_item_content_html2text(item_id, ignore_links=True)
|
content = get_item_content_html2text(item_id, ignore_links=True)
|
||||||
# content = remove_all_urls_from_content(item_id, item_content=content)
|
content = remove_all_urls_from_content(item_id, item_content=content)
|
||||||
#
|
|
||||||
# # REMOVE USELESS SPACE
|
# REMOVE USELESS SPACE
|
||||||
# content = ' '.join(content.split())
|
content = ' '.join(content.split())
|
||||||
# #- CLEAN CONTENT -#
|
#- CLEAN CONTENT -#
|
||||||
#
|
|
||||||
# #print(content)
|
#print(content)
|
||||||
# #print(len(content))
|
#print(len(content))
|
||||||
# if len(content) >= min_len:
|
if len(content) >= min_len:
|
||||||
# for lang in cld3.get_frequent_languages(content, num_langs=num_langs):
|
for lang in cld3.get_frequent_languages(content, num_langs=num_langs):
|
||||||
# if lang.proportion >= min_proportion and lang.probability >= min_probability and lang.is_reliable:
|
if lang.proportion >= min_proportion and lang.probability >= min_probability and lang.is_reliable:
|
||||||
# all_languages.append(lang)
|
all_languages.append(lang)
|
||||||
return all_languages
|
return all_languages
|
||||||
|
|
||||||
# API
|
# API
|
||||||
|
|
Loading…
Reference in New Issue