chg: [Languages]detect + search domains languages

pull/534/head
Terrtia 2020-12-11 21:02:07 +01:00
parent 28f6963ff4
commit 6bc54baf74
No known key found for this signature in database
GPG Key ID: 1E1B1F50D84613D0
17 changed files with 990 additions and 21 deletions

View File

@ -216,6 +216,8 @@ function launching_scripts {
sleep 0.1 sleep 0.1
screen -S "Script_AIL" -X screen -t "Tags" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Tags.py; read x" screen -S "Script_AIL" -X screen -t "Tags" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Tags.py; read x"
sleep 0.1 sleep 0.1
screen -S "Script_AIL" -X screen -t "Languages" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Languages.py; read x"
sleep 0.1
screen -S "Script_AIL" -X screen -t "SentimentAnalysis" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./SentimentAnalysis.py; read x" screen -S "Script_AIL" -X screen -t "SentimentAnalysis" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./SentimentAnalysis.py; read x"
sleep 0.1 sleep 0.1
screen -S "Script_AIL" -X screen -t "DbCleaner" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./DbCleaner.py; read x" screen -S "Script_AIL" -X screen -t "DbCleaner" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./DbCleaner.py; read x"

33
bin/Languages.py Executable file
View File

@ -0,0 +1,33 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
import os
import sys
import cld3
from packages import Item
from lib import Domain
from pubsublogger import publisher
from Helper import Process
if __name__ == '__main__':
    # Local import: the module's import list does not bring ``time`` into
    # scope, yet the idle branch below calls ``time.sleep``. Without it the
    # first empty-queue poll raises NameError and the module dies.
    import time

    publisher.port = 6380
    publisher.channel = 'Script'

    # Section name in bin/packages/modules.cfg
    config_section = 'Languages'

    # Setup the I/O queues
    p = Process(config_section)

    # Endless worker loop: take one message at a time from the module queue.
    while True:
        message = p.get_from_set()
        if message is None:
            # Nothing queued: log and back off for a second before polling again.
            publisher.debug("{} queue is empty, waiting".format(config_section))
            time.sleep(1)
            continue

        item_id = Item.get_item_id(message)
        # Languages are only recorded for crawled items, against their domain.
        if Item.is_crawled(item_id):
            domain = Item.get_item_domain(item_id)
            Domain.add_domain_languages_by_item_id(domain, item_id)

View File

@ -9,6 +9,7 @@ The ``Domain``
import os import os
import sys import sys
import itertools
import time import time
import redis import redis
import random import random
@ -24,6 +25,7 @@ import Tag
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/')) sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
import ConfigLoader import ConfigLoader
import Correlate_object import Correlate_object
import Language
import Screenshot import Screenshot
import Username import Username
@ -66,6 +68,15 @@ def sanitize_domain_type(domain_type):
else: else:
return 'regular' return 'regular'
def sanitize_domain_types(l_domain_type):
    '''
    Validate a list of domain types.

    :param l_domain_type: candidate domain types (may be empty or None)
    :return: ``l_domain_type`` unchanged when it is non-empty and every entry
             is a known domain type, otherwise the list of all domain types
    '''
    known_types = get_all_domains_type()
    if l_domain_type and all(candidate in known_types for candidate in l_domain_type):
        return l_domain_type
    # Empty input or at least one unknown type: fall back to every type.
    return known_types
######## DOMAINS ######## ######## DOMAINS ########
def get_all_domains_type(): def get_all_domains_type():
return ['onion', 'regular'] return ['onion', 'regular']
@ -210,6 +221,15 @@ def get_domains_up_by_filers(domain_type, date_from=None, date_to=None, tags=[],
else: else:
return None return None
## TODO: filters:
#    - tags
#    - languages
#    - daterange UP
def get_domains_by_filters():
    '''
    Placeholder for a combined domain search; not implemented yet.

    Currently takes no arguments and returns None. The filters it is meant
    to support are listed in the TODO above.
    '''
    pass
def create_domains_metadata_list(list_domains, domain_type): def create_domains_metadata_list(list_domains, domain_type):
l_domains = [] l_domains = []
for domain in list_domains: for domain in list_domains:
@ -218,9 +238,98 @@ def create_domains_metadata_list(list_domains, domain_type):
else: else:
dom_type = domain_type dom_type = domain_type
l_domains.append(get_domain_metadata(domain, dom_type, first_seen=True, last_ckeck=True, status=True, l_domains.append(get_domain_metadata(domain, dom_type, first_seen=True, last_ckeck=True, status=True,
ports=True, tags=True, screenshot=True, tags_safe=True)) ports=True, tags=True, languages=True, screenshot=True, tags_safe=True))
return l_domains return l_domains
######## LANGUAGES ########
def get_all_domains_languages():
    '''
    Return every member of the ``all_domains_languages`` set, i.e. the
    language codes registered through ``add_domain_language``.
    '''
    all_languages = r_serv_onion.smembers('all_domains_languages')
    return all_languages
def get_domains_by_languages(languages, l_domain_type=[]):
    '''
    Return the domains that are tagged with ALL the given languages.

    :param languages: list of language codes
    :param l_domain_type: domain types to search; sanitized with
                          ``sanitize_domain_types`` before use
    :return: an empty list when no language is given, the result of
             ``get_all_domains_by_language`` for a single language, otherwise
             a list concatenating the per-type set intersections
    '''
    l_domain_type = sanitize_domain_types(l_domain_type)
    if not languages:
        return []
    if len(languages) == 1:
        return get_all_domains_by_language(languages[0], l_domain_type=l_domain_type)

    matching_domains = []
    for domain_type in l_domain_type:
        # One set per (type, language); a domain must be in every one of them.
        keys = ['language:domains:{}:{}'.format(domain_type, language) for language in languages]
        shared = r_serv_onion.sinter(keys[0], *keys[1:])
        if shared:
            matching_domains.extend(shared)
    return matching_domains
def get_all_domains_by_language(language, l_domain_type=[]):
    '''
    Return the domains tagged with one language, across the given types.

    :param language: language code
    :param l_domain_type: domain types to search; sanitized with
                          ``sanitize_domain_types`` before use
    :return: members of the single per-type set, or the union of the
             per-type sets when several types are requested
    '''
    domain_types = sanitize_domain_types(l_domain_type)
    keys = ['language:domains:{}:{}'.format(domain_type, language) for domain_type in domain_types]
    if len(keys) == 1:
        return r_serv_onion.smembers(keys[0])
    return r_serv_onion.sunion(keys[0], *keys[1:])
def get_domain_languages(domain, r_list=False):
    '''
    Return the language codes stored for a domain.

    :param domain: domain name
    :param r_list: when True, convert the result to a ``list``
    :return: members of ``domain:language:<domain>``, as returned by the
             client or as a list
    '''
    languages = r_serv_onion.smembers('domain:language:{}'.format(domain))
    return list(languages) if r_list else languages
def add_domain_language(domain, language):
    '''
    Register a language for a domain in every language index.

    :param domain: domain name
    :param language: language code; only the part before the first ``-`` is kept
    '''
    base_language = language.split('-')[0]
    domain_type = get_domain_type(domain)
    # (set key, member) pairs, written in the same order as before.
    index_updates = (
        ('all_domains_languages', base_language),
        ('all_domains_languages:{}'.format(domain_type), base_language),
        ('language:domains:{}:{}'.format(domain_type, base_language), domain),
        ('domain:language:{}'.format(domain), base_language),
    )
    for key, member in index_updates:
        r_serv_onion.sadd(key, member)
def add_domain_languages_by_item_id(domain, item_id):
    '''
    Detect the languages of an item and register each of them for the domain.

    :param domain: domain name
    :param item_id: id of the item whose content is analysed
    '''
    detected = Item.get_item_languages(item_id, min_proportion=0.2, min_probability=0.8)
    for detection in detected:
        add_domain_language(domain, detection.language)
def delete_domain_languages(domain):
    '''
    Remove a domain from every language index and drop its own language set.

    For each language of the domain: the domain is removed from the
    per-type ``language:domains:<type>:<language>`` set; if that set no
    longer exists, the language is removed from the per-type language list
    and, when no domain type lists it any more, from the global list.

    :param domain: domain name
    '''
    domain_type = get_domain_type(domain)
    for language in get_domain_languages(domain):
        language_key = 'language:domains:{}:{}'.format(domain_type, language)
        r_serv_onion.srem(language_key, domain)
        if not r_serv_onion.exists(language_key):
            r_serv_onion.srem('all_domains_languages:{}'.format(domain_type), language)
            # Use a distinct loop variable: reusing ``domain_type`` here would
            # overwrite the domain's own type for the remaining languages.
            # ``any`` stops at the first type that still lists the language.
            still_used = any(
                r_serv_onion.sismember('all_domains_languages:{}'.format(other_type), language)
                for other_type in get_all_domains_type()
            )
            if not still_used:
                r_serv_onion.srem('all_domains_languages', language)
    r_serv_onion.delete('domain:language:{}'.format(domain))
def _delete_all_domains_languages():
    '''
    Wipe the language indexes: for every known language, delete the language
    data of each domain (of any type) tagged with it.
    '''
    for language in get_all_domains_languages():
        tagged_domains = get_all_domains_by_language(language)
        for domain in tagged_domains:
            delete_domain_languages(domain)
## API ##
## TODO: verify domains type + languages list
## TODO: add pagination
def api_get_domains_by_languages(domains_types, languages, domains_metadata=False, page=1):
    '''
    Return one page (28 entries) of the sorted domains matching all languages.

    :param domains_types: domain types to search
    :param languages: list of language codes
    :param domains_metadata: when True, replace each domain name in
                             ``list_elem`` by its metadata dict
    :param page: page number passed to ``paginate_iterator``
    :return: the structure built by ``paginate_iterator``
    '''
    matching = get_domains_by_languages(languages, l_domain_type=domains_types)
    page_data = paginate_iterator(sorted(matching), nb_obj=28, page=page)
    if domains_metadata:
        page_data['list_elem'] = [
            get_domain_metadata(domain, get_domain_type(domain), first_seen=True, last_ckeck=True,
                                status=True, ports=True, tags=True, tags_safe=True,
                                languages=True, screenshot=True)
            for domain in page_data['list_elem']
        ]
    return page_data
####---- ----####
######## DOMAIN ######## ######## DOMAIN ########
def get_domain_type(domain): def get_domain_type(domain):
@ -498,7 +607,7 @@ def get_domain_random_screenshot(domain):
''' '''
return Screenshot.get_randon_domain_screenshot(domain) return Screenshot.get_randon_domain_screenshot(domain)
def get_domain_metadata(domain, domain_type, first_seen=True, last_ckeck=True, status=True, ports=True, tags=False, tags_safe=False, screenshot=False): def get_domain_metadata(domain, domain_type, first_seen=True, last_ckeck=True, status=True, ports=True, tags=False, tags_safe=False, languages=False, screenshot=False):
''' '''
Get Domain basic metadata Get Domain basic metadata
@ -516,6 +625,7 @@ def get_domain_metadata(domain, domain_type, first_seen=True, last_ckeck=True, s
''' '''
dict_metadata = {} dict_metadata = {}
dict_metadata['id'] = domain dict_metadata['id'] = domain
dict_metadata['type'] = domain_type
if first_seen: if first_seen:
res = get_domain_first_seen(domain, domain_type=domain_type) res = get_domain_first_seen(domain, domain_type=domain_type)
if res is not None: if res is not None:
@ -535,6 +645,8 @@ def get_domain_metadata(domain, domain_type, first_seen=True, last_ckeck=True, s
dict_metadata['is_tags_safe'] = Tag.is_tags_safe(dict_metadata['tags']) dict_metadata['is_tags_safe'] = Tag.is_tags_safe(dict_metadata['tags'])
else: else:
dict_metadata['is_tags_safe'] = Tag.is_tags_safe(get_domain_tags(domain)) dict_metadata['is_tags_safe'] = Tag.is_tags_safe(get_domain_tags(domain))
if languages:
dict_metadata['languages'] = Language.get_languages_from_iso(get_domain_languages(domain, r_list=True), sort=True)
if screenshot: if screenshot:
dict_metadata['screenshot'] = get_domain_random_screenshot(domain) dict_metadata['screenshot'] = get_domain_random_screenshot(domain)
return dict_metadata return dict_metadata
@ -796,6 +908,14 @@ class Domain(object):
''' '''
return get_domain_tags(self.domain) return get_domain_tags(self.domain)
def get_domain_languages(self):
    '''
    Return all languages of this domain.

    Delegates to the module-level ``get_domain_languages`` helper with
    ``self.domain``.
    '''
    return get_domain_languages(self.domain)
def get_domain_correlation(self): def get_domain_correlation(self):
''' '''
Retun all correlation of a given domain. Retun all correlation of a given domain.

240
bin/lib/Language.py Executable file
View File

@ -0,0 +1,240 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
import os
import sys
import redis
# Language code -> English language name.
dict_iso_languages = {
    'af': 'Afrikaans',
    'am': 'Amharic',
    'ar': 'Arabic',
    'bg': 'Bulgarian',
    'bn': 'Bangla',
    'bs': 'Bosnian',
    'ca': 'Catalan',
    'ceb': 'Cebuano',
    'co': 'Corsican',
    'cs': 'Czech',
    'cy': 'Welsh',
    'da': 'Danish',
    'de': 'German',
    'el': 'Greek',
    'en': 'English',
    'eo': 'Esperanto',
    'es': 'Spanish',
    'et': 'Estonian',
    'eu': 'Basque',
    'fa': 'Persian',
    'fi': 'Finnish',
    'fil': 'Filipino',
    'fr': 'French',
    'fy': 'Western Frisian',
    'ga': 'Irish',
    'gd': 'Scottish Gaelic',
    'gl': 'Galician',
    'gu': 'Gujarati',
    'ha': 'Hausa',
    'haw': 'Hawaiian',
    'hi': 'Hindi',
    'hmn': 'Hmong',
    'hr': 'Croatian',
    'ht': 'Haitian Creole',
    'hu': 'Hungarian',
    'hy': 'Armenian',
    'id': 'Indonesian',
    'ig': 'Igbo',
    'is': 'Icelandic',
    'it': 'Italian',
    'iw': 'Hebrew',
    'ja': 'Japanese',
    'jv': 'Javanese',
    'ka': 'Georgian',
    'kk': 'Kazakh',
    'km': 'Khmer',
    'kn': 'Kannada',
    'ko': 'Korean',
    'ku': 'Kurdish',
    'ky': 'Kyrgyz',
    'la': 'Latin',
    'lb': 'Luxembourgish',
    'lo': 'Lao',
    'lt': 'Lithuanian',
    'lv': 'Latvian',
    'mg': 'Malagasy',
    'mi': 'Maori',
    'mk': 'Macedonian',
    'ml': 'Malayalam',
    'mn': 'Mongolian',
    'mr': 'Marathi',
    'ms': 'Malay',
    'mt': 'Maltese',
    'my': 'Burmese',
    'ne': 'Nepali',
    'nl': 'Dutch',
    'no': 'Norwegian',
    'ny': 'Nyanja',
    'pa': 'Punjabi',
    'pl': 'Polish',
    'ps': 'Pashto',
    'pt': 'Portuguese',
    'ro': 'Romanian',
    'ru': 'Russian',
    'sd': 'Sindhi',
    'si': 'Sinhala',
    'sk': 'Slovak',
    'sl': 'Slovenian',
    'sm': 'Samoan',
    'sn': 'Shona',
    'so': 'Somali',
    'sq': 'Albanian',
    'sr': 'Serbian',
    'st': 'Southern Sotho',
    'su': 'Sundanese',
    'sv': 'Swedish',
    'sw': 'Swahili',
    'ta': 'Tamil',
    'te': 'Telugu',
    'tg': 'Tajik',
    'th': 'Thai',
    'tr': 'Turkish',
    'uk': 'Ukrainian',
    'ur': 'Urdu',
    'uz': 'Uzbek',
    'vi': 'Vietnamese',
    'xh': 'Xhosa',
    'yi': 'Yiddish',
    'yo': 'Yoruba',
    'zh': 'Chinese',
    'zu': 'Zulu'
}

# English language name -> language code.
# Derived from the table above (every name is unique) instead of being
# maintained by hand, so the two mappings can never drift apart.
dict_languages_iso = {name: iso for iso, name in dict_iso_languages.items()}
def get_language_from_iso(iso_language):
    """Return the language name for a code in ``dict_iso_languages``, or None if unknown."""
    language_name = dict_iso_languages.get(iso_language)
    return language_name
def get_languages_from_iso(l_iso_languages, sort=False):
    """Translate language codes to language names, skipping unknown codes.

    :param l_iso_languages: iterable of language codes
    :param sort: when True, return the names sorted
    :return: list of language names
    """
    candidates = (get_language_from_iso(iso_language) for iso_language in l_iso_languages)
    l_languages = [language for language in candidates if language]
    return sorted(l_languages) if sort else l_languages
def get_iso_from_language(language):
    """Return the code for a language name in ``dict_languages_iso``, or None if unknown."""
    iso_code = dict_languages_iso.get(language)
    return iso_code
def get_iso_from_languages(l_languages, sort=False):
    """Translate language names to language codes, skipping unknown names.

    :param l_languages: iterable of language names
    :param sort: when True, return the codes sorted
    :return: list of language codes
    """
    candidates = (get_iso_from_language(language) for language in l_languages)
    l_iso = [iso_lang for iso_lang in candidates if iso_lang]
    return sorted(l_iso) if sort else l_iso

View File

@ -2,8 +2,10 @@
# -*-coding:UTF-8 -* # -*-coding:UTF-8 -*
import os import os
import re
import sys import sys
import redis import redis
import cld3
import html2text import html2text
from io import BytesIO from io import BytesIO
@ -101,13 +103,62 @@ def add_item_parent(item_parent, item_id):
def get_item_content(item_id): def get_item_content(item_id):
return item_basic.get_item_content(item_id) return item_basic.get_item_content(item_id)
def get_item_content_html2text(item_id, item_content=None): def get_item_content_html2text(item_id, item_content=None, ignore_links=False):
if not item_content: if not item_content:
item_content = get_item_content(item_id) item_content = get_item_content(item_id)
h = html2text.HTML2Text() h = html2text.HTML2Text()
h.ignore_links = False h.ignore_links = ignore_links
h.ignore_images = ignore_links
return h.handle(item_content) return h.handle(item_content)
def remove_all_urls_from_content(item_id, item_content=None):
    """Strip URL-like strings and PGP armored blocks from an item's text.

    :param item_id: item id, only used to load the content when
                    ``item_content`` is empty or None
    :param item_content: text to clean (optional)
    :return: the text with every matched URL, PGP public key block,
             PGP signature and PGP message removed
    """
    if not item_content:
        item_content = get_item_content(item_id)

    regex = r'\b(?:http://|https://)?(?:[a-zA-Z\d-]{,63}(?:\.[a-zA-Z\d-]{,63})+)(?:\:[0-9]+)*(?:/(?:$|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*\b'
    urls = re.findall(regex, item_content)
    # Remove the longest matches first, so a short match that is a substring
    # of a longer one cannot leave a partial URL behind.
    for url in sorted(urls, key=len, reverse=True):
        item_content = item_content.replace(url, '')

    # The three armored block kinds are handled identically, in this order.
    pgp_block_regexes = (
        r'-----BEGIN PGP PUBLIC KEY BLOCK-----[\s\S]+?-----END PGP PUBLIC KEY BLOCK-----',
        r'-----BEGIN PGP SIGNATURE-----[\s\S]+?-----END PGP SIGNATURE-----',
        r'-----BEGIN PGP MESSAGE-----[\s\S]+?-----END PGP MESSAGE-----',
    )
    for pgp_regex in pgp_block_regexes:
        for pgp_block in re.findall(pgp_regex, item_content):
            item_content = item_content.replace(pgp_block, '')

    return item_content
def get_item_languages(item_id, min_len=600, num_langs=3, min_proportion=0.2, min_probability=0.7):
    """Detect the languages of an item with cld3.

    The content is converted with ``get_item_content_html2text`` (links
    ignored), stripped of URLs/PGP blocks and whitespace-normalised before
    detection.

    :param item_id: item id
    :param min_len: minimum length of the cleaned text; shorter texts yield []
    :param num_langs: ``num_langs`` passed to ``cld3.get_frequent_languages``
    :param min_proportion: minimum ``proportion`` of a kept prediction
    :param min_probability: minimum ``probability`` of a kept prediction
    :return: list of the cld3 predictions that pass both thresholds and are
             flagged ``is_reliable``
    """
    text = get_item_content_html2text(item_id, ignore_links=True)
    text = remove_all_urls_from_content(item_id, item_content=text)
    # Collapse every run of whitespace into a single space.
    text = ' '.join(text.split())

    if len(text) < min_len:
        return []
    return [
        prediction
        for prediction in cld3.get_frequent_languages(text, num_langs=num_langs)
        if prediction.proportion >= min_proportion
        and prediction.probability >= min_probability
        and prediction.is_reliable
    ]
# API # API
def get_item(request_dict): def get_item(request_dict):
if not request_dict: if not request_dict:
@ -496,3 +547,17 @@ def delete_domain_node(item_id):
domain_basic.delete_domain_item_core(item_id, domain, port) domain_basic.delete_domain_item_core(item_id, domain, port)
for child_id in get_all_domain_node_by_item_id(item_id): for child_id in get_all_domain_node_by_item_id(item_id):
delete_item(child_id) delete_item(child_id)
# if __name__ == '__main__':
# import Domain
# domain = Domain.Domain('domain.onion')
# for domain_history in domain.get_domain_history():
# domain_item = domain.get_domain_items_crawled(epoch=domain_history[1]) # item_tag
# if "items" in domain_item:
# for item_dict in domain_item['items']:
# item_id = item_dict['id']
# print(item_id)
# for lang in get_item_languages(item_id, min_proportion=0.2, min_probability=0.8):
# print(lang)
# print()
# print(get_item_languages(item_id, min_proportion=0.2, min_probability=0.6)) # 0.7 ?

View File

@ -46,6 +46,9 @@ publish = Redis_Tags
subscribe = Redis_Global subscribe = Redis_Global
publish = Redis_Tags publish = Redis_Tags
[Languages]
subscribe = Redis_Global
[Categ] [Categ]
subscribe = Redis_Global subscribe = Redis_Global
publish = Redis_CreditCards,Redis_Mail,Redis_Onion,Redis_Web,Redis_Credential,Redis_SourceCode,Redis_Cve,Redis_ApiKey publish = Redis_CreditCards,Redis_Mail,Redis_Onion,Redis_Web,Redis_Credential,Redis_SourceCode,Redis_Cve,Redis_ApiKey

View File

@ -17,6 +17,25 @@ import subprocess
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/')) sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
import ConfigLoader import ConfigLoader
def launch_background_upgrade(version, script_name):
    """Run one pending background update script and clear its state on success.

    Does nothing unless ``version`` is a member of ``ail:to_update``.

    :param version: update version, also the sub-directory of
                    ``$AIL_HOME/update`` holding the script
    :param script_name: file name of the update script to run
    """
    if not r_serv.sismember('ail:to_update', version):
        return

    r_serv.delete('ail:update_error')
    r_serv.set('ail:update_in_progress', version)
    r_serv.set('ail:current_background_update', version)
    # NOTE(review): this label is the same for every script launched through
    # this helper; it looks copied from the domain-tags update - confirm.
    r_serv.set('ail:current_background_script', 'domain tags update')

    update_file = os.path.join(os.environ['AIL_HOME'], 'update', version, script_name)
    subprocess.run(['python', update_file])

    update_progress = r_serv.get('ail:current_background_script_stat')
    if update_progress and int(update_progress) == 100:
        r_serv.delete('ail:update_in_progress')
        r_serv.delete('ail:current_background_script')
        r_serv.delete('ail:current_background_script_stat')
        r_serv.delete('ail:current_background_update')
        # Remove the version that was just run, not the unrelated
        # ``new_version`` global, so this update is not launched again.
        r_serv.srem('ail:to_update', version)
if __name__ == "__main__": if __name__ == "__main__":
config_loader = ConfigLoader.ConfigLoader() config_loader = ConfigLoader.ConfigLoader()
@ -114,3 +133,8 @@ if __name__ == "__main__":
r_serv.delete('ail:current_background_script_stat') r_serv.delete('ail:current_background_script_stat')
r_serv.delete('ail:current_background_update') r_serv.delete('ail:current_background_update')
r_serv.srem('ail:to_update', new_version) r_serv.srem('ail:to_update', new_version)
launch_background_upgrade('v2.6', 'Update_screenshots.py')
launch_background_upgrade('v2.7', 'Update_domain_tags.py')
launch_background_upgrade('v3.4', 'Update_domain.py')

37
update/v3.4/Update.py Executable file
View File

@ -0,0 +1,37 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
import os
import re
import sys
import time
import redis
import datetime
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
import ConfigLoader
new_version = 'v3.4'
if __name__ == '__main__':
    # Prepare the v3.4 background update: flag it as in progress, queue the
    # domains to process and record the new version.

    start_deb = time.time()

    config_loader = ConfigLoader.ConfigLoader()
    r_serv = config_loader.get_redis_conn("ARDB_DB")
    r_serv_onion = config_loader.get_redis_conn("ARDB_Onion")
    config_loader = None

    # Flag this version as the update currently in progress
    r_serv.set('ail:update_in_progress', new_version)
    r_serv.set('ail:current_background_update', new_version)

    # Build the work set from the 'full_onion_up' and 'full_regular_up' sets
    # and initialise the progress counters from its size
    r_serv_onion.sunionstore('domain_update_v3.4', 'full_onion_up', 'full_regular_up')
    r_serv.set('update:nb_elem_to_convert', r_serv_onion.scard('domain_update_v3.4'))
    r_serv.set('update:nb_elem_converted',0)

    # Set current ail version
    r_serv.set('ail:version', new_version)

    # Record the date (YYYYMMDD) on which this version was applied
    r_serv.hset('ail:update_date', new_version, datetime.datetime.now().strftime("%Y%m%d"))

54
update/v3.4/Update.sh Executable file
View File

@ -0,0 +1,54 @@
#!/bin/bash
# Abort early when the AIL environment is not loaded.
[ -z "$AIL_HOME" ] && echo "Needs the env var AIL_HOME. Run the script from the virtual environment." && exit 1;
[ -z "$AIL_REDIS" ] && echo "Needs the env var AIL_REDIS. Run the script from the virtual environment." && exit 1;
[ -z "$AIL_ARDB" ] && echo "Needs the env var AIL_ARDB. Run the script from the virtual environment." && exit 1;
[ -z "$AIL_BIN" ] && echo "Needs the env var AIL_BIN. Run the script from the virtual environment." && exit 1;
[ -z "$AIL_FLASK" ] && echo "Needs the env var AIL_FLASK. Run the script from the virtual environment." && exit 1;

export PATH=$AIL_HOME:$PATH
export PATH=$AIL_REDIS:$PATH
export PATH=$AIL_ARDB:$PATH
export PATH=$AIL_BIN:$PATH
export PATH=$AIL_FLASK:$PATH

GREEN="\\033[1;32m"
DEFAULT="\\033[0;39m"

echo -e $GREEN"Shutting down AIL ..."$DEFAULT
bash ${AIL_BIN}/LAUNCH.sh -ks
wait

bash ${AIL_BIN}/LAUNCH.sh -ldbv &
wait
echo ""

# SUBMODULES #
git submodule update

# echo ""
# echo -e $GREEN"installing KVORCKS ..."$DEFAULT
# cd ${AIL_HOME}
# test ! -d kvrocks/ && git clone https://github.com/bitleak/kvrocks.git
# pushd kvrocks/
# make -j4
# popd

# pycld3 provides the cld3 module used for language detection.
echo -e $GREEN"Installing pycld3 ..."$DEFAULT
pip3 install pycld3

echo ""
echo -e $GREEN"Updating AIL VERSION ..."$DEFAULT
echo ""
python ${AIL_HOME}/update/v3.4/Update.py
wait
echo ""
echo ""

echo ""
echo -e $GREEN"Shutting down ARDB ..."$DEFAULT
bash ${AIL_BIN}/LAUNCH.sh -ks
wait

exit 0

57
update/v3.4/Update_domain.py Executable file
View File

@ -0,0 +1,57 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
import os
import re
import sys
import time
import redis
import datetime
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
import ConfigLoader
import Domain
def update_update_stats():
    """Print the update progress and store the percentage done.

    Reads ``update:nb_elem_converted`` and the module-level
    ``nb_elem_to_update``; writes ``ail:current_background_script_stat``.
    """
    converted = int(r_serv_db.get('update:nb_elem_converted'))
    percent_done = int((converted * 100) / nb_elem_to_update)
    summary = '{}/{} updated {}%'.format(converted, nb_elem_to_update, percent_done)
    print(summary)
    r_serv_db.set('ail:current_background_script_stat', percent_done)
def update_domain_language(domain_obj, item_id):
    """Register the languages detected in ``item_id`` for the given domain object."""
    Domain.add_domain_languages_by_item_id(domain_obj.get_domain_name(), item_id)
if __name__ == '__main__':
    # Background update: pop every queued domain from 'domain_update_v3.4'
    # and record the languages of all its crawled items.

    start_deb = time.time()

    # These names are module globals read by update_update_stats()
    config_loader = ConfigLoader.ConfigLoader()
    r_serv_db = config_loader.get_redis_conn("ARDB_DB")
    r_serv_onion = config_loader.get_redis_conn("ARDB_Onion")
    config_loader = None

    # Total number of domains to process; defaults to 1 when the counter is
    # missing so the percentage computation never divides by zero
    nb_elem_to_update = r_serv_db.get('update:nb_elem_to_convert')
    if not nb_elem_to_update:
        nb_elem_to_update = 1
    else:
        nb_elem_to_update = int(nb_elem_to_update)

    #Domain._delete_all_domains_languages()

    while True:
        # spop removes the domain from the work set as it returns it
        domain = r_serv_onion.spop('domain_update_v3.4')
        if domain is not None:
            print(domain)
            domain = Domain.Domain(domain)
            for domain_history in domain.get_domain_history():
                domain_item = domain.get_domain_items_crawled(epoch=domain_history[1]) # item_tag
                if "items" in domain_item:
                    for item_dict in domain_item['items']:
                        update_domain_language(domain, item_dict['id'])

            # One more domain done: bump the counter and publish the progress
            r_serv_db.incr('update:nb_elem_converted')
            update_update_stats()

        else:
            # Work set exhausted: the update is complete
            sys.exit(0)

View File

@ -26,6 +26,7 @@ import Tag
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib')) sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib'))
import Domain import Domain
import crawlers import crawlers
import Language
r_cache = Flask_config.r_cache r_cache = Flask_config.r_cache
r_serv_db = Flask_config.r_serv_db r_serv_db = Flask_config.r_serv_db
@ -85,6 +86,9 @@ def send_to_spider():
return create_json_response(res[0], res[1]) return create_json_response(res[0], res[1])
return redirect(url_for('crawler_splash.manual')) return redirect(url_for('crawler_splash.manual'))
#### Domains ####
# add route : /crawlers/show_domain # add route : /crawlers/show_domain
@crawler_splash.route('/crawlers/showDomain', methods=['GET', 'POST']) @crawler_splash.route('/crawlers/showDomain', methods=['GET', 'POST'])
@login_required @login_required
@ -111,6 +115,7 @@ def showDomain():
dict_domain = {**dict_domain, **domain.get_domain_correlation()} dict_domain = {**dict_domain, **domain.get_domain_correlation()}
dict_domain['correlation_nb'] = Domain.get_domain_total_nb_correlation(dict_domain) dict_domain['correlation_nb'] = Domain.get_domain_total_nb_correlation(dict_domain)
dict_domain['father'] = domain.get_domain_father() dict_domain['father'] = domain.get_domain_father()
dict_domain['languages'] = Language.get_languages_from_iso(domain.get_domain_languages(), sort=True)
dict_domain['tags'] = domain.get_domain_tags() dict_domain['tags'] = domain.get_domain_tags()
dict_domain['tags_safe'] = Tag.is_tags_safe(dict_domain['tags']) dict_domain['tags_safe'] = Tag.is_tags_safe(dict_domain['tags'])
dict_domain['history'] = domain.get_domain_history_with_status() dict_domain['history'] = domain.get_domain_history_with_status()
@ -198,6 +203,38 @@ def domains_explorer_web():
dict_data = Domain.get_domains_up_by_filers('regular', page=page, date_from=date_from, date_to=date_to) dict_data = Domain.get_domains_up_by_filers('regular', page=page, date_from=date_from, date_to=date_to)
return render_template("domain_explorer.html", dict_data=dict_data, bootstrap_label=bootstrap_label, domain_type='regular') return render_template("domain_explorer.html", dict_data=dict_data, bootstrap_label=bootstrap_label, domain_type='regular')
@crawler_splash.route('/domains/languages/all/json', methods=['GET'])
@login_required
@login_read_only
def domains_all_languages_json():
    """Return, as JSON, the sorted names of every language recorded for a domain."""
    # TODO: get domain type (the ``iso`` and ``domain_types`` query
    #       parameters are not used yet)
    return jsonify(Language.get_languages_from_iso(Domain.get_all_domains_languages(), sort=True))
@crawler_splash.route('/domains/languages/search_get', methods=['GET'])
@login_required
@login_read_only
def domains_search_languages_get():
    """Render the domains matching the requested languages and domain types.

    Query parameters: ``page`` (int, defaults to 1), ``domain_types`` and
    ``languages`` (comma-separated lists; only the first occurrence of each
    parameter is read).
    """
    page = request.args.get('page')
    try:
        page = int(page)
    except (TypeError, ValueError):
        # Missing (None) or non-numeric page: fall back to the first page.
        page = 1

    domains_types = request.args.getlist('domain_types')
    if domains_types:
        domains_types = domains_types[0].split(',')
    languages = request.args.getlist('languages')
    if languages:
        languages = languages[0].split(',')

    l_dict_domains = Domain.api_get_domains_by_languages(domains_types, Language.get_iso_from_languages(languages), domains_metadata=True, page=page)
    return render_template("domains/domains_filter_languages.html", template_folder='../../',
                                l_dict_domains=l_dict_domains, bootstrap_label=bootstrap_label,
                                current_languages=languages, domains_types=domains_types)
##-- --##
## Cookiejar ## ## Cookiejar ##
@crawler_splash.route('/crawler/cookiejar/add', methods=['GET']) @crawler_splash.route('/crawler/cookiejar/add', methods=['GET'])
@login_required @login_required

View File

@ -148,6 +148,10 @@
{% include 'tags/block_obj_tags_search.html' %} {% include 'tags/block_obj_tags_search.html' %}
{% endwith %} {% endwith %}
{% with object_type='domain' %}
{% include 'domains/block_languages_search.html' %}
{% endwith %}
</div> </div>
</div> </div>
</div> </div>

View File

@ -68,7 +68,7 @@
</div> </div>
{% with dict_data=dict_data, bootstrap_label=bootstrap_label %} {% with l_dict_domains=dict_data['list_elem'], bootstrap_label=bootstrap_label %}
{% include 'domains/card_img_domain.html' %} {% include 'domains/card_img_domain.html' %}
{% endwith %} {% endwith %}

View File

@ -67,6 +67,7 @@
<th>First Seen</th> <th>First Seen</th>
<th>Last Check</th> <th>Last Check</th>
<th>Ports</th> <th>Ports</th>
<th>Languages</th>
</tr> </tr>
</thead> </thead>
<tbody> <tbody>
@ -74,6 +75,11 @@
<td class="panelText">{%if "first_seen" in dict_domain%}{{ dict_domain['first_seen'] }}{%endif%}</td> <td class="panelText">{%if "first_seen" in dict_domain%}{{ dict_domain['first_seen'] }}{%endif%}</td>
<td class="panelText">{%if "last_check" in dict_domain%}{{ dict_domain['last_check'] }}{%endif%}</td> <td class="panelText">{%if "last_check" in dict_domain%}{{ dict_domain['last_check'] }}{%endif%}</td>
<td class="panelText">{%if dict_domain["ports"]%}{{ dict_domain["ports"] }}{%endif%}</td> <td class="panelText">{%if dict_domain["ports"]%}{{ dict_domain["ports"] }}{%endif%}</td>
<td class="panelText">
{% for languages in dict_domain['languages'] %}
{{languages}}
{% endfor %}
</td>
</tr> </tr>
</tbody> </tbody>
</table> </table>

View File

@ -0,0 +1,73 @@
<div class="card mb-3 mt-1">
<div class="card-header text-white bg-dark">
<h5 class="card-title mb-0">
<i class="fas fa-language" style="font-size: 1.8rem;"></i> Domains by Languages :
</h5>
</div>
<div class="card-body">
<div class="input-group">
<div class="input-group-prepend">
<button class="btn btn-outline-danger" type="button" id="button-clear" style="z-index: 1;" onclick="emptySearch()">
<i class="fas fa-eraser"></i>
</button>
</div>
<input id="llanguages" name="llanguages" type="text" class="form-control" aria-describedby="button-clear" autocomplete="off">
</div>
<div class="mb-3">
<div class="custom-control custom-switch">
<input class="custom-control-input" type="checkbox" name="domain_onion_switch" value="" id="domain_onion_switch" {%if 'onion' in domains_types%}checked{%endif%}>
<label class="custom-control-label" for="domain_onion_switch">
<span class="badge badge-danger"><i class="fas fa-user-secret"></i> Onion Domains</span>
</label>
</div>
<div class="custom-control custom-switch">
<input class="custom-control-input" type="checkbox" name="domain_regular_switch" value="True" id="domain_regular_switch" {%if 'regular' in domains_types%}checked{%endif%}>
<label class="custom-control-label" for="domain_regular_switch">
<span class="badge badge-warning"><i class="fab fa-html5"></i> Web Domains</span>
</label>
</div>
</div>
<button class="btn btn-primary" type="button" id="button-search" onclick="searchLanguages()">
<i class="fas fa-search"></i> Search
</button>
</div>
</div>
<link href="{{ url_for('static', filename='css/tags.css') }}" rel="stylesheet" type="text/css" />
<script src="{{ url_for('static', filename='js/tags.js') }}"></script>
<script>
var llanguages;
$.getJSON("{{ url_for('crawler_splash.domains_all_languages_json') }}", //?object_type={{ object_type }}"
function(data) {
llanguages = $('#llanguages').tagSuggest({
data: data,
value: [{%if current_languages%}{% for language in current_languages %}'{{language|safe}}',{%endfor%}{%endif%}],
sortOrder: 'name',
maxDropHeight: 200,
name: 'llanguages'
});
});
// Redirect to the language search page with the selected languages and the
// domain types whose switch is checked.
function searchLanguages() {
  var all_domain_types = ['onion', 'regular']; // TODO: load from flask
  var l_domains_types = [];
  var data = llanguages.getValue();
  for (var i = 0; i < all_domain_types.length; i++) {
    if (document.getElementById('domain_'+ all_domain_types[i] +'_switch').checked) {
      l_domains_types.push(all_domain_types[i]);
    }
  }
  // Arrays are joined with ',' when converted to a string; encode the result
  // so characters such as '&' or '#' cannot break the query string.
  var parameter = "?languages=" + encodeURIComponent(data) + "&domain_types=" + encodeURIComponent(l_domains_types) +"{%if page%}&page={{ page }}{%endif%}";
  window.location.href = "{{ url_for('crawler_splash.domains_search_languages_get') }}" + parameter;
}
// Clear the language selection (handler of the eraser button).
function emptySearch() {
  llanguages.clear();
}
</script>

View File

@ -1,10 +1,10 @@
{% for dict_domain in dict_data['list_elem'] %} {% for dict_domain in l_dict_domains %}
{% if loop.index0 % 4 == 0 %} {% if loop.index0 % 4 == 0 %}
<div class="card-deck mt-3"> <div class="card-deck mt-3">
{% endif %} {% endif %}
<div class="card"> <div class="card {% if dict_domain["status"] %}border-success{% else %}border-danger{% endif %}">
<div class="text-center"> <div class="text-center">
<canvas id="canvas_{{loop.index0}}" style="max-height: 400px; max-width: 100%;"></canvas> <canvas id="canvas_{{loop.index0}}" style="max-height: 400px; max-width: 100%;"></canvas>
</div> </div>
@ -13,24 +13,46 @@
<a target="_blank" href="{{ url_for('crawler_splash.showDomain') }}?domain={{dict_domain["id"]}}"> <a target="_blank" href="{{ url_for('crawler_splash.showDomain') }}?domain={{dict_domain["id"]}}">
{{dict_domain["id"]}} {{dict_domain["id"]}}
</a> </a>
{% if dict_domain["status"] %}
<span style="color:Green;">
<i class="fas fa-check-circle"></i> UP
</span>
{% else %}
<span style="color:Red;">
<i class="fas fa-times-circle"></i> DOWN
</span>
{% endif %}
</h5> </h5>
{# Date badge of the domain card: start icon + date, calendar icon, date + end icon.
   NOTE(review): both date slots render dict_domain["first_seen"]; the second one
   (placed before the hourglass-end icon) presumably should show a last-seen /
   last-check value. Confirm which key the view provides.
   NOTE(review): the tooltip title "Tooltip on top" looks like placeholder text. #}
<div>
<span class="badge badge-dark">
<span data-toggle="tooltip" data-placement="top" title="Tooltip on top">
<span class="badge badge-info" style="font-size: 0.8rem;">
<i class="fas fa-hourglass-start"></i>
</span>
{{dict_domain["first_seen"]}}
</span>
<span class="badge badge-light mx-1" style="font-size: 1rem;">
<i class="far fa-calendar-alt"></i>
</span>
{{dict_domain["first_seen"]}}
<span class="badge badge-secondary" style="font-size: 0.8rem;">
<i class="fas fa-hourglass-end"></i>
</span>
</span>
</div>
<p class="card-text"> <p class="card-text">
<small class="text-muted"> <small class="text-muted">
First seen: {{dict_domain["first_seen"]}}<br> Ports: {{dict_domain["ports"]}}<br>
Last_seen: {{dict_domain["first_seen"]}}<br> {% if dict_domain['languages'] %}
Ports: {{dict_domain["ports"]}} Languages:
{% for language in dict_domain['languages'] %}
<span class="badge badge-secondary" style="font-size: 0.75rem;">{{ language }}</span>
{% endfor %}
{% endif %}
</small> </small>
</p> </p>
<small class="text-muted">Status: </small>
{% if dict_domain["status"] %}
<span style="color:Green;">
<i class="fas fa-check-circle"></i> UP
</span>
{% else %}
<span style="color:Red;">
<i class="fas fa-times-circle"></i> DOWN
</span>
{% endif %}
<div> <div>
{% for tag in dict_domain['tags'] %} {% for tag in dict_domain['tags'] %}
<a href="{{ url_for('tags_ui.get_obj_by_tags') }}?object_type=domain&ltags={{ tag }}"> <a href="{{ url_for('tags_ui.get_obj_by_tags') }}?object_type=domain&ltags={{ tag }}">
@ -50,6 +72,6 @@
{% endfor %} {% endfor %}
{% if dict_data['list_elem']|length % 4 != 0 %} {% if l_dict_domains|length % 4 != 0 %}
</div> </div>
{% endif %} {% endif %}

View File

@ -0,0 +1,192 @@
<!DOCTYPE html>
<html>
<head>
<title>Show Domain - AIL</title>
<link rel="icon" href="{{ url_for('static', filename='image/ail-icon.png') }}">
<!-- Core CSS -->
<link href="{{ url_for('static', filename='css/bootstrap4.min.css') }}" rel="stylesheet">
<link href="{{ url_for('static', filename='css/font-awesome.min.css') }}" rel="stylesheet">
<!-- JS -->
<script src="{{ url_for('static', filename='js/jquery.js')}}"></script>
<script src="{{ url_for('static', filename='js/popper.min.js')}}"></script>
<script src="{{ url_for('static', filename='js/bootstrap4.min.js') }}"></script>
<style>
.card-columns {
column-count: 4;
}
</style>
</head>
<body>
{% include 'nav_bar.html' %}
<div class="container-fluid">
<div class="row">
{% include 'crawler/menu_sidebar.html' %}
<div class="col-12 col-lg-10" id="core_content">
<div class="row">
<div class="col-12 col-lg-6">
{% include 'domains/block_languages_search.html' %}
</div>
<div class="col-12 col-xl-6">
<div class="card my-2 border-secondary" >
<div class="card-body py-2">
<div class="row">
<div class="col-md-3 text-center">
<button class="btn btn-primary" onclick="blocks.value=0;pixelate_all();">
<i class="fas fa-eye-slash"></i>
<span class="label-icon">Hide</span>
</button>
</div>
<div class="col-md-6">
<input class="custom-range mt-2" id="blocks" type="range" min="1" max="50" value="5">
</div>
<div class="col-md-3 text-center">
<button class="btn btn-primary" onclick="blocks.value=50;pixelate_all();">
<i class="fas fa-plus-square"></i>
<span class="label-icon">Full resolution</span>
</button>
</div>
</div>
</div>
</div>
</div>
</div>
{% with l_dict_domains=l_dict_domains['list_elem'], bootstrap_label=bootstrap_label %}
{% include 'domains/card_img_domain.html' %}
{% endwith %}
<br>
<br>
{%if l_dict_domains['list_elem']%}
{% with page=l_dict_domains['page'], nb_page_max=l_dict_domains['nb_pages'], nb_first_elem=l_dict_domains['nb_first_elem'], nb_last_elem=l_dict_domains['nb_last_elem'], nb_all_elem=l_dict_domains['nb_all_elem'] %}
{% set target_url=url_for('crawler_splash.domains_search_languages_get') + "?languages=" + ','.join(current_languages)%}
{%if domains_types %}
{% set target_url = target_url + '&domain_types=' + ','.join(domains_types)%}
{%endif%}
{% include 'pagination.html' %}
{% endwith %}
{%endif%}
</div>
</div>
</div>
</body>
<script>
// Once the DOM is ready, un-mute the "domains explorer" entry of the nav bar.
$(function () {
  var nav_title = $('#nav_title_domains_explorer');
  nav_title.removeClass("text-muted");
});
/**
 * Show or hide the side menu and add/remove the layout classes on the side
 * menu and on the main content column accordingly.
 */
function toggle_sidebar() {
  var nav_menu = $('#nav_menu');
  var side_menu = $('#side_menu');
  var core_content = $('#core_content');

  if (!nav_menu.is(':visible')) {
    // Currently hidden: bring the menu back and restore the column layout.
    nav_menu.show();
    side_menu.addClass('border-right col-lg-2');
    core_content.addClass('col-lg-10');
    return;
  }

  // Currently visible: hide the menu and drop the column layout classes.
  nav_menu.hide();
  side_menu.removeClass('border-right col-lg-2');
  core_content.removeClass('col-lg-10');
}
</script>
<script>
// Registry of the screenshot canvases, keyed by canvas element id (canevas_id).
// Each entry is filled by init_canevas_blurr_img() with:
//   "ctx": the 2D rendering context of the canvas
//   "img": the Image object loaded from img_url
var dict_canevas_blurr_img = {}
/**
 * Register a canvas in dict_canevas_blurr_img and start loading its image.
 *
 * @param canevas_id  id of the canvas element to draw on
 * @param img_url     URL of the image to load; pixelate_img() runs on load,
 *                    img_error() runs if loading fails
 */
function init_canevas_blurr_img(canevas_id, img_url) {
  // Register the (still empty) entry first, then fill it in.
  var entry = dict_canevas_blurr_img[canevas_id] = {};

  // 2D context with image smoothing turned off
  var context = document.getElementById(canevas_id).getContext('2d');
  context.webkitImageSmoothingEnabled = false;
  context.imageSmoothingEnabled = false;
  entry["ctx"] = context;

  // Image: handlers are attached before src is set so no event is missed.
  var image = new Image();
  entry["img"] = image;
  image.onload = function () {
    pixelate_img(canevas_id);
  };
  image.addEventListener("error", function () {
    img_error(canevas_id);
  });
  image.src = img_url;
}
/**
 * Redraw every canvas registered in dict_canevas_blurr_img.
 */
function pixelate_all() {
  var canvas_ids = Object.keys(dict_canevas_blurr_img);
  for (var idx = 0; idx < canvas_ids.length; idx++) {
    pixelate_img(canvas_ids[idx]);
  }
}
/**
 * Redraw one registered canvas at the resolution selected by the "blocks"
 * range input: the image is first drawn scaled down, then that small area is
 * stretched back over the whole canvas.
 *
 * @param canevas_id  id of a canvas registered by init_canevas_blurr_img();
 *                    the call is a no-op when it is undefined
 */
function pixelate_img(canevas_id) {
  if (typeof canevas_id === 'undefined') {
    return;
  }
  var canevas_to_blurr = document.getElementById(canevas_id);
  var ctx = dict_canevas_blurr_img[canevas_id]["ctx"];
  var img = dict_canevas_blurr_img[canevas_id]["img"];

  // Slider value: 50 means full resolution, any other value is used as a
  // percentage of the original size. Loose comparison on purpose: an input's
  // value is a string.
  var size;
  if (blocks.value == 50) {
    size = 1;
  } else {
    size = (blocks.value) * 0.01;
  }

  canevas_to_blurr.width = img.width;
  canevas_to_blurr.height = img.height;

  // Assigning width/height resets the 2D context to its default state, which
  // turns image smoothing back on; disable it again so the upscaled image is
  // pixelated rather than smoothed.
  ctx.webkitImageSmoothingEnabled = false;
  ctx.imageSmoothingEnabled = false;

  // Scaled width and height (kept local instead of leaking as globals)
  var w = canevas_to_blurr.width * size;
  var h = canevas_to_blurr.height * size;

  // Draw the original image at the scaled size ...
  ctx.drawImage(img, 0, 0, w, h);
  // ... then stretch that scaled area over the full canvas.
  ctx.drawImage(canevas_to_blurr, 0, 0, w, h, 0, 0, canevas_to_blurr.width, canevas_to_blurr.height);
}
/**
 * Image load-error handler: replace the failed image with the static AIL
 * placeholder.
 *
 * @param canevas_id  id of the registered canvas whose image failed to load
 */
function img_error(canevas_id) {
  var entry = dict_canevas_blurr_img[canevas_id];
  // The error handler is attached with addEventListener, so resetting the
  // onerror property does not detach it. Fall back only once: if the
  // placeholder itself fails to load, stop instead of re-assigning the same
  // src on every error event.
  if (entry["fallback_used"]) {
    return;
  }
  entry["fallback_used"] = true;
  entry["img"].src = "{{ url_for('static', filename='image/AIL.png') }}";
}
// Redraw every canvas whenever the resolution slider changes. No variable
// named "blocks" is declared in this script; presumably it resolves to the
// range input with id="blocks" through the browser's named-element globals.
blocks.addEventListener('change', pixelate_all, false);
// Server-side loop: one init call is emitted per domain that has a screenshot.
// Domains whose is_tags_safe flag is false get the static AIL.png placeholder
// instead of their screenshot. Canvas ids follow the loop index (canvas_<index0>).
{% for dict_domain in l_dict_domains['list_elem'] %}
{% if 'screenshot' in dict_domain %}
{% if dict_domain['is_tags_safe'] %}
var screenshot_url = "{{ url_for('showsavedpastes.screenshot', filename="") }}{{dict_domain['screenshot']}}";
{% else %}
var screenshot_url = "{{ url_for('static', filename='image/AIL.png') }}";
{% endif %}
init_canevas_blurr_img("canvas_{{loop.index0}}", screenshot_url);
{% endif %}
{% endfor %}
</script>
</html>