mirror of https://github.com/CIRCL/AIL-framework
chg: [launcher + modules] add module tests (Onion module)
parent
869be4a493
commit
4896db98a3
|
@ -78,10 +78,10 @@ function helptext {
|
||||||
[-k | --killAll] Kill DB + Scripts
|
[-k | --killAll] Kill DB + Scripts
|
||||||
[-ks | --killscript] Scripts
|
[-ks | --killscript] Scripts
|
||||||
[-u | --update] Update AIL
|
[-u | --update] Update AIL
|
||||||
[-c | --crawler] LAUNCH Crawlers
|
[-ut | --thirdpartyUpdate] Update Web
|
||||||
[-f | --launchFeeder] LAUNCH Pystemon feeder
|
[-t | --test] Launch Tests
|
||||||
[-t | --thirdpartyUpdate] Update Web
|
|
||||||
[-rp | --resetPassword] Reset Password
|
[-rp | --resetPassword] Reset Password
|
||||||
|
[-f | --launchFeeder] LAUNCH Pystemon feeder
|
||||||
[-m | --menu] Display Advanced Menu
|
[-m | --menu] Display Advanced Menu
|
||||||
[-h | --help] Help
|
[-h | --help] Help
|
||||||
"
|
"
|
||||||
|
@ -234,34 +234,34 @@ function launching_scripts {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function launching_crawler {
|
# function launching_crawler {
|
||||||
if [[ ! $iscrawler ]]; then
|
# if [[ ! $iscrawler ]]; then
|
||||||
CONFIG=$AIL_HOME/configs/core.cfg
|
# CONFIG=$AIL_HOME/configs/core.cfg
|
||||||
lport=$(awk '/^\[Crawler\]/{f=1} f==1&&/^splash_port/{print $3;exit}' "${CONFIG}")
|
# lport=$(awk '/^\[Crawler\]/{f=1} f==1&&/^splash_port/{print $3;exit}' "${CONFIG}")
|
||||||
|
#
|
||||||
IFS='-' read -ra PORTS <<< "$lport"
|
# IFS='-' read -ra PORTS <<< "$lport"
|
||||||
if [ ${#PORTS[@]} -eq 1 ]
|
# if [ ${#PORTS[@]} -eq 1 ]
|
||||||
then
|
# then
|
||||||
first_port=${PORTS[0]}
|
# first_port=${PORTS[0]}
|
||||||
last_port=${PORTS[0]}
|
# last_port=${PORTS[0]}
|
||||||
else
|
# else
|
||||||
first_port=${PORTS[0]}
|
# first_port=${PORTS[0]}
|
||||||
last_port=${PORTS[1]}
|
# last_port=${PORTS[1]}
|
||||||
fi
|
# fi
|
||||||
|
#
|
||||||
screen -dmS "Crawler_AIL"
|
# screen -dmS "Crawler_AIL"
|
||||||
sleep 0.1
|
# sleep 0.1
|
||||||
|
#
|
||||||
for ((i=first_port;i<=last_port;i++)); do
|
# for ((i=first_port;i<=last_port;i++)); do
|
||||||
screen -S "Crawler_AIL" -X screen -t "onion_crawler:$i" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Crawler.py $i; read x"
|
# screen -S "Crawler_AIL" -X screen -t "onion_crawler:$i" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Crawler.py $i; read x"
|
||||||
sleep 0.1
|
# sleep 0.1
|
||||||
done
|
# done
|
||||||
|
#
|
||||||
echo -e $GREEN"\t* Launching Crawler_AIL scripts"$DEFAULT
|
# echo -e $GREEN"\t* Launching Crawler_AIL scripts"$DEFAULT
|
||||||
else
|
# else
|
||||||
echo -e $RED"\t* A screen is already launched"$DEFAULT
|
# echo -e $RED"\t* A screen is already launched"$DEFAULT
|
||||||
fi
|
# fi
|
||||||
}
|
# }
|
||||||
|
|
||||||
function shutting_down_redis {
|
function shutting_down_redis {
|
||||||
redis_dir=${AIL_HOME}/redis/src/
|
redis_dir=${AIL_HOME}/redis/src/
|
||||||
|
@ -490,6 +490,12 @@ function update_thirdparty {
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Run the test suite under ${AIL_HOME}/tests with nose and report coverage
# for ${AIL_BIN}.
# Returns the exit status of the test run, or 1 when nosetests is missing.
function launch_tests() {
    tests_dir="${AIL_HOME}/tests"
    bin_dir="${AIL_BIN}"
    local nosetests_bin

    # Resolve the runner up front: with no nosetests on PATH the previous
    # form expanded to `python3 -w ...` and died with an unrelated
    # interpreter option error.
    nosetests_bin=$(command -v nosetests)
    if [[ -z "$nosetests_bin" ]]; then
        echo -e $RED"\t* nosetests not found, cannot launch tests"$DEFAULT
        return 1
    fi

    # Quote every expansion so a path containing spaces stays one argument.
    python3 "$nosetests_bin" -w "$tests_dir" --with-coverage --cover-package="$bin_dir" -d
}
|
||||||
|
|
||||||
function reset_password() {
|
function reset_password() {
|
||||||
echo -e "\t* Reseting UI admin password..."
|
echo -e "\t* Reseting UI admin password..."
|
||||||
if checking_ardb && checking_redis; then
|
if checking_ardb && checking_redis; then
|
||||||
|
@ -557,9 +563,6 @@ function menu_display {
|
||||||
Flask)
|
Flask)
|
||||||
launch_flask;
|
launch_flask;
|
||||||
;;
|
;;
|
||||||
Crawler)
|
|
||||||
launching_crawler;
|
|
||||||
;;
|
|
||||||
Killall)
|
Killall)
|
||||||
killall;
|
killall;
|
||||||
;;
|
;;
|
||||||
|
@ -614,12 +617,12 @@ while [ "$1" != "" ]; do
|
||||||
;;
|
;;
|
||||||
-u | --update ) update "--manual";
|
-u | --update ) update "--manual";
|
||||||
;;
|
;;
|
||||||
-t | --thirdpartyUpdate ) update_thirdparty;
|
-t | --test ) launch_tests;
|
||||||
|
;;
|
||||||
|
-ut | --thirdpartyUpdate ) update_thirdparty;
|
||||||
;;
|
;;
|
||||||
-rp | --resetPassword ) reset_password;
|
-rp | --resetPassword ) reset_password;
|
||||||
;;
|
;;
|
||||||
-c | --crawler ) launching_crawler;
|
|
||||||
;;
|
|
||||||
-f | --launchFeeder ) launch_feeder;
|
-f | --launchFeeder ) launch_feeder;
|
||||||
;;
|
;;
|
||||||
-h | --help ) helptext;
|
-h | --help ) helptext;
|
||||||
|
|
16
bin/Onion.py
16
bin/Onion.py
|
@ -126,11 +126,9 @@ class Onion(AbstractModule):
|
||||||
# list of tuples: (url, subdomains, domain)
|
# list of tuples: (url, subdomains, domain)
|
||||||
urls_to_crawl = []
|
urls_to_crawl = []
|
||||||
|
|
||||||
print(message)
|
|
||||||
id, score = message.split()
|
id, score = message.split()
|
||||||
item = Item(id)
|
item = Item(id)
|
||||||
item_content = item.get_content()
|
item_content = item.get_content()
|
||||||
item_content = 'http://33333333.kingdom7rv6wkfzn.onion?sdsd=ooooo http://2222222.kingdom7rv6wkfzn.onion'
|
|
||||||
|
|
||||||
# max execution time on regex
|
# max execution time on regex
|
||||||
res = regex_helper.regex_findall(self.module_name, self.redis_cache_key, self.url_regex, item.get_id(), item_content)
|
res = regex_helper.regex_findall(self.module_name, self.redis_cache_key, self.url_regex, item.get_id(), item_content)
|
||||||
|
@ -145,10 +143,6 @@ class Onion(AbstractModule):
|
||||||
domain = url_unpack['domain'].decode().lower()
|
domain = url_unpack['domain'].decode().lower()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
domain = url_unpack['domain'].lower()
|
domain = url_unpack['domain'].lower()
|
||||||
print('----')
|
|
||||||
print(url)
|
|
||||||
print(subdomain)
|
|
||||||
print(domain)
|
|
||||||
|
|
||||||
if crawlers.is_valid_onion_domain(domain):
|
if crawlers.is_valid_onion_domain(domain):
|
||||||
urls_to_crawl.append((url, subdomain, domain))
|
urls_to_crawl.append((url, subdomain, domain))
|
||||||
|
@ -164,8 +158,10 @@ class Onion(AbstractModule):
|
||||||
|
|
||||||
if crawlers.is_crawler_activated():
|
if crawlers.is_crawler_activated():
|
||||||
for to_crawl in urls_to_crawl:
|
for to_crawl in urls_to_crawl:
|
||||||
|
print(f'{to_crawl[2]} added to crawler queue: {to_crawl[0]}')
|
||||||
crawlers.add_item_to_discovery_queue('onion', to_crawl[2], to_crawl[1], to_crawl[0], item.get_id())
|
crawlers.add_item_to_discovery_queue('onion', to_crawl[2], to_crawl[1], to_crawl[0], item.get_id())
|
||||||
else:
|
else:
|
||||||
|
print(f'{to_print}Detected {len(urls_to_crawl)} .onion(s);{item.get_id()}')
|
||||||
self.redis_logger.warning(f'{to_print}Detected {len(urls_to_crawl)} .onion(s);{item.get_id()}')
|
self.redis_logger.warning(f'{to_print}Detected {len(urls_to_crawl)} .onion(s);{item.get_id()}')
|
||||||
# keep manual fetcher ????
|
# keep manual fetcher ????
|
||||||
## Manually fetch first page if crawler is disabled
|
## Manually fetch first page if crawler is disabled
|
||||||
|
@ -176,11 +172,3 @@ if __name__ == "__main__":
|
||||||
|
|
||||||
module = Onion()
|
module = Onion()
|
||||||
module.run()
|
module.run()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
##########################
|
|
||||||
|
|
|
@ -0,0 +1,373 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*-coding:UTF-8 -*
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import uuid
|
||||||
|
import redis
|
||||||
|
|
||||||
|
from abc import ABC
|
||||||
|
from flask import url_for
|
||||||
|
|
||||||
|
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
|
||||||
|
import ConfigLoader
|
||||||
|
|
||||||
|
class AbstractObject(ABC):
    """
    Abstract Object

    Base class holding the two attributes every AIL object carries:
    its type and its ID.
    """

    # first seen last/seen ??
    # # TODO: - tags
    #         - handle + refactor correlations
    #         - creates others objects

    def __init__(self, obj_type, id):
        """ Abstract for all the AIL object

        :param obj_type: object type (item, ...)
        :param id: Object ID
        """
        # NOTE: `id` shadows the builtin; the name is kept because it is part
        # of the constructor signature.
        self.type = obj_type
        self.id = id

    def get_id(self):
        """Return the object ID given at construction."""
        return self.id

    def get_type(self):
        """Return the object type given at construction."""
        return self.type
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level setup: fetch the "ARDB_Metadata" connection from the config
# loader, then drop the loader reference since only the connection is kept.
config_loader = ConfigLoader.ConfigLoader()
r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
config_loader = None
|
||||||
|
|
||||||
|
def is_valid_object_type(object_type):
    """Tell whether *object_type* is one of the accepted object types.

    :param object_type: object type name to check
    :return: True for 'domain', 'item', 'image' or 'decoded', else False
    :rtype: bool
    """
    return object_type in ('domain', 'item', 'image', 'decoded')
|
||||||
|
|
||||||
|
def get_all_objects():
    """Return a fresh list with the name of every object type."""
    object_names = ['domain', 'paste']
    object_names.extend(['pgp', 'cryptocurrency', 'decoded', 'screenshot'])
    return object_names
|
||||||
|
|
||||||
|
def get_all_correlation_names():
    '''
    List the names of all available correlations.

    :return: a new list on every call
    :rtype: list
    '''
    correlation_names = ('pgp', 'cryptocurrency', 'decoded', 'screenshot')
    return list(correlation_names)
|
||||||
|
|
||||||
|
def get_all_correlation_objects():
    '''
    List all correlated objects.

    :return: a new list on every call
    :rtype: list
    '''
    correlated_objects = ('domain', 'paste')
    return list(correlated_objects)
|
||||||
|
|
||||||
|
def exist_object(object_type, correlation_id, type_id=None):
    """Check whether an object exists by delegating to its type's module.

    :param object_type: object type name
    :param correlation_id: ID of the object to look up
    :param type_id: sub type, only passed on for 'pgp' and 'cryptocurrency'
    :return: the delegate's answer, or False for an unknown object type
    """
    if object_type == 'domain':
        return Domain.verify_if_domain_exist(correlation_id)
    if object_type in ('paste', 'item'):
        return Item.exist_item(correlation_id)
    if object_type == 'decoded':
        return Decoded.exist_decoded(correlation_id)
    if object_type == 'pgp':
        return Pgp.pgp._exist_corelation_field(type_id, correlation_id)
    if object_type == 'cryptocurrency':
        return Cryptocurrency.cryptocurrency._exist_corelation_field(type_id, correlation_id)
    if object_type in ('screenshot', 'image'):
        return Screenshot.exist_screenshot(correlation_id)
    # unknown object type
    return False
|
||||||
|
|
||||||
|
def get_obj_date(object_type, object_id):
    """Return the date of an item as an int; None for any other object type.

    :param object_type: object type name; only "item" is handled
    :param object_id: ID of the object
    """
    if object_type != "item":
        return None
    return int(Item.get_item_date(object_id))
|
||||||
|
|
||||||
|
# request_type => api or ui
|
||||||
|
def get_object_metadata(object_type, correlation_id, type_id=None):
    """Fetch an object's metadata by delegating to its type's module.

    :param object_type: object type name
    :param correlation_id: ID of the object
    :param type_id: sub type, only passed on for 'pgp' and 'cryptocurrency'
    :return: the delegate's metadata, or None for an unknown object type
    """
    if object_type == 'domain':
        return Domain.Domain(correlation_id).get_domain_metadata(tags=True)
    if object_type in ('paste', 'item'):
        item_options = {"id": correlation_id, "date": True, "date_separator": True, "tags": True}
        return Item.get_item(item_options)[0]
    if object_type == 'decoded':
        return Decoded.get_decoded_metadata(correlation_id, nb_seen=True, size=True, file_type=True, tag=True)
    if object_type == 'pgp':
        return Pgp.pgp.get_metadata(type_id, correlation_id)
    if object_type == 'cryptocurrency':
        return Cryptocurrency.cryptocurrency.get_metadata(type_id, correlation_id)
    if object_type in ('screenshot', 'image'):
        return Screenshot.get_metadata(correlation_id)
    # unknown object type: nothing to return
    return None
|
||||||
|
|
||||||
|
def get_object_correlation(object_type, value, correlation_names=None, correlation_objects=None, requested_correl_type=None):
    """Fetch the correlations of an object by delegating to its type's module.

    :param object_type: object type name
    :param value: ID/value of the object
    :param correlation_names: passed on for 'domain' and 'paste'/'item'
    :param correlation_objects: passed on for every other handled type
    :param requested_correl_type: sub type, used for 'pgp' and 'cryptocurrency'
    :return: the delegate's result, or an empty dict for an unknown type
    """
    if object_type == 'domain':
        return Domain.get_domain_all_correlation(value, correlation_names=correlation_names)
    if object_type in ('paste', 'item'):
        return Item.get_item_all_correlation(value, correlation_names=correlation_names)
    if object_type == 'decoded':
        return Decoded.get_decoded_correlated_object(value, correlation_objects=correlation_objects)
    if object_type == 'pgp':
        return Pgp.pgp.get_correlation_all_object(requested_correl_type, value, correlation_objects=correlation_objects)
    if object_type == 'cryptocurrency':
        return Cryptocurrency.cryptocurrency.get_correlation_all_object(requested_correl_type, value, correlation_objects=correlation_objects)
    if object_type in ('screenshot', 'image'):
        return Screenshot.get_screenshot_correlated_object(value, correlation_objects=correlation_objects)
    return {}
|
||||||
|
|
||||||
|
def get_correlation_node_icon(correlation_name, correlation_type=None, value=None):
    '''
    Used in UI Graph.
    Return a font awesome icon for a given correlation_name.

    :param correlation_name: correlation name
    :type correlation_name: str
    :param correlation_type: correlation type
    :type correlation_type: str, optional
    :param value: object value/ID; read for 'decoded', 'domain' and 'paste'

    :return: a dictionary {icon_class, icon_text, node_color, node_radius}
    :rtype: dict
    '''
    # defaults, also returned unchanged for an unknown correlation_name
    icon_class = 'fas'
    icon_text = ''
    node_color = "#332288"
    node_radius = 6

    if correlation_name == "pgp":
        node_color = '#44AA99'
        pgp_icons = {'key': '\uf084', 'name': '\uf507', 'mail': '\uf1fa'}
        # NOTE(review): the fallback is the literal text 'times', not a glyph
        # code point like the other entries - confirm this is intended.
        icon_text = pgp_icons.get(correlation_type, 'times')

    elif correlation_name == 'cryptocurrency':
        node_color = '#DDCC77'
        currency_icons = {'bitcoin': '\uf15a', 'monero': '\uf3d0', 'ethereum': '\uf42e'}
        if correlation_type in currency_icons:
            # the known currencies use the 'fab' icon class
            icon_class = 'fab'
            icon_text = currency_icons[correlation_type]
        else:
            icon_text = '\uf51e'

    elif correlation_name == 'decoded':
        node_color = '#88CCEE'
        # the icon depends on the part of the decoded item type before '/'
        main_type = Decoded.get_decoded_item_type(value).split('/')[0]
        decoded_icons = {
            'application': '\uf15b',
            'audio': '\uf1c7',
            'image': '\uf1c5',
            'text': '\uf15c',
        }
        icon_text = decoded_icons.get(main_type, '\uf249')

    elif correlation_name in ('screenshot', 'image'):
        node_color = '#E1F5DF'
        icon_text = '\uf03e'

    elif correlation_name == 'domain':
        node_radius = 5
        node_color = '#3DA760'
        if Domain.get_domain_type(value) == 'onion':
            icon_text = '\uf06e'
        else:
            icon_class = 'fab'
            icon_text = '\uf13b'

    elif correlation_name == 'paste':
        node_radius = 5
        node_color = 'red' if Item.is_crawled(value) else '#332288'

    return {"icon_class": icon_class, "icon_text": icon_text, "node_color": node_color, "node_radius": node_radius}
|
||||||
|
|
||||||
|
def get_item_url(correlation_name, value, correlation_type=None):
    '''
    Warning: use only in flask

    Build the UI URL of an object with flask's url_for.

    :param correlation_name: object/correlation name selecting the endpoint
    :param value: object ID/value
    :param correlation_type: sub type, used for 'pgp' and 'cryptocurrency'
    :return: the URL, or '#' when correlation_name is not handled
    '''
    correlation_endpoint = 'correlation.show_correlation'
    item_endpoint = 'showsavedpastes.showsavedpaste'

    if correlation_name == "pgp":
        return url_for(correlation_endpoint, object_type="pgp", type_id=correlation_type, correlation_id=value)
    if correlation_name == 'cryptocurrency':
        return url_for(correlation_endpoint, object_type="cryptocurrency", type_id=correlation_type, correlation_id=value)
    if correlation_name == 'decoded':
        return url_for(correlation_endpoint, object_type="decoded", correlation_id=value)
    if correlation_name in ('screenshot', 'image'):  ### # TODO: rename me
        return url_for(correlation_endpoint, object_type="screenshot", correlation_id=value)
    if correlation_name == 'domain':
        return url_for('crawler_splash.showDomain', domain=value)
    if correlation_name == 'item':
        return url_for(item_endpoint, paste=value)
    if correlation_name == 'paste':  ### # TODO: remove me
        return url_for(item_endpoint, paste=value)
    # no page for this correlation name
    return '#'
|
||||||
|
|
||||||
|
def get_obj_tag_table_keys(object_type):
    '''
    Warning: use only in flask (dynamic templates)

    :return: the table keys for "domain"; None for any other object type
    '''
    if object_type != "domain":
        return None
    # # TODO: add root screenshot
    return ['id', 'first_seen', 'last_check', 'status']
|
||||||
|
|
||||||
|
|
||||||
|
def create_graph_links(links_set):
    """Turn (source, target) pairs into a list of link dicts.

    :param links_set: iterable of indexable pairs (source id, target id)
    :return: list of {"source": ..., "target": ...}, in iteration order
    """
    return [{"source": link[0], "target": link[1]} for link in links_set]
|
||||||
|
|
||||||
|
def create_graph_nodes(nodes_set, root_node_id):
    """Build the list of node dicts for the graph.

    :param nodes_set: iterable of node ids shaped
        '<correlation_name>;<correlation_type>;<value>'
    :param root_node_id: id of the root node, which gets its own color/radius
    :return: list of dicts with the keys 'id', 'style', 'text' and 'url'
    """
    graph_nodes_list = []
    for node_id in nodes_set:
        # maxsplit=2 yields exactly three fields even when the value itself
        # contains ';' (maxsplit=3 produced four parts and broke the unpacking).
        correlation_name, correlation_type, value = node_id.split(';', 2)
        dict_node = {"id": node_id}
        dict_node['style'] = get_correlation_node_icon(correlation_name, correlation_type, value)
        dict_node['text'] = value
        if node_id == root_node_id:
            # highlight the root node
            dict_node["style"]["node_color"] = 'orange'
            dict_node["style"]["node_radius"] = 7
        dict_node['url'] = get_item_url(correlation_name, value, correlation_type)
        graph_nodes_list.append(dict_node)
    return graph_nodes_list
|
||||||
|
|
||||||
|
def create_node_id(correlation_name, value, correlation_type=''):
    """Build a graph node id '<correlation_name>;<correlation_type>;<value>'.

    :param correlation_name: object/correlation name
    :param value: object value/ID
    :param correlation_type: sub type; None is written as an empty field
    """
    type_field = '' if correlation_type is None else correlation_type
    return f'{correlation_name};{type_field};{value}'
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# # TODO: filter by correlation type => bitcoin, mail, ...
|
||||||
|
def get_graph_node_object_correlation(object_type, root_value, mode, correlation_names, correlation_objects, max_nodes=300, requested_correl_type=None):
    """Build the correlation graph (nodes + links) around one root object.

    Collects the root's correlations, then the correlations of each of those,
    as node ids and (node id, node id) links, and converts both sets with
    create_graph_nodes / create_graph_links.

    NOTE(review): the indentation of this function was reconstructed; confirm
    the loop nesting (in particular which loop each `break` leaves and where
    the `mode=="inter"` blocks sit) against the repository version.

    :param object_type: type of the root object
    :param root_value: value/ID of the root object
    :param mode: "union" links every first-level correlation to the root;
        "inter" links it only once a second-level correlation other than the
        root value was found
    :param correlation_names: passed on to get_object_correlation
    :param correlation_objects: passed on to get_object_correlation
    :param max_nodes: threshold tested before adding nodes; the test is
        `len(nodes) > max_nodes`, so the final count can exceed it slightly
    :param requested_correl_type: sub type of the root object (pgp/cryptocurrency)
    :return: {"nodes": [...], "links": [...]}
    """
    links = set()
    nodes = set()

    root_node_id = create_node_id(object_type, root_value, requested_correl_type)
    nodes.add(root_node_id)

    root_correlation = get_object_correlation(object_type, root_value, correlation_names, correlation_objects, requested_correl_type=requested_correl_type)
    for correl in root_correlation:
        # correlations grouped by sub type: {correl: {correl_type: [values]}}
        if correl in ('pgp', 'cryptocurrency'):
            for correl_type in root_correlation[correl]:
                for correl_val in root_correlation[correl][correl_type]:

                    # add correlation
                    correl_node_id = create_node_id(correl, correl_val, correl_type)

                    if mode=="union":
                        if len(nodes) > max_nodes:
                            break
                        nodes.add(correl_node_id)
                        links.add((root_node_id, correl_node_id))

                    # get second correlation
                    res = get_object_correlation(correl, correl_val, correlation_names, correlation_objects, requested_correl_type=correl_type)
                    if res:
                        for corr_obj in res:
                            for correl_key_val in res[corr_obj]:
                                #filter root value
                                if correl_key_val == root_value:
                                    continue

                                if len(nodes) > max_nodes:
                                    break
                                new_corel_1 = create_node_id(corr_obj, correl_key_val)
                                new_corel_2 = create_node_id(correl, correl_val, correl_type)
                                nodes.add(new_corel_1)
                                nodes.add(new_corel_2)
                                links.add((new_corel_1, new_corel_2))

                                if mode=="inter":
                                    nodes.add(correl_node_id)
                                    links.add((root_node_id, correl_node_id))
        # correlations without sub type: {correl: [values]}
        if correl in ('decoded', 'screenshot', 'domain', 'paste'):
            for correl_val in root_correlation[correl]:

                correl_node_id = create_node_id(correl, correl_val)
                if mode=="union":
                    if len(nodes) > max_nodes:
                        break
                    nodes.add(correl_node_id)
                    links.add((root_node_id, correl_node_id))

                res = get_object_correlation(correl, correl_val, correlation_names, correlation_objects)
                if res:
                    for corr_obj in res:
                        if corr_obj in ('decoded', 'domain', 'paste', 'screenshot'):
                            for correl_key_val in res[corr_obj]:
                                #filter root value
                                if correl_key_val == root_value:
                                    continue

                                if len(nodes) > max_nodes:
                                    break
                                new_corel_1 = create_node_id(corr_obj, correl_key_val)
                                new_corel_2 = create_node_id(correl, correl_val)
                                nodes.add(new_corel_1)
                                nodes.add(new_corel_2)
                                links.add((new_corel_1, new_corel_2))

                                if mode=="inter":
                                    nodes.add(correl_node_id)
                                    links.add((root_node_id, correl_node_id))

                        if corr_obj in ('pgp', 'cryptocurrency'):
                            for correl_key_type in res[corr_obj]:
                                for correl_key_val in res[corr_obj][correl_key_type]:
                                    #filter root value
                                    if correl_key_val == root_value:
                                        continue

                                    if len(nodes) > max_nodes:
                                        break
                                    new_corel_1 = create_node_id(corr_obj, correl_key_val, correl_key_type)
                                    new_corel_2 = create_node_id(correl, correl_val)
                                    nodes.add(new_corel_1)
                                    nodes.add(new_corel_2)
                                    links.add((new_corel_1, new_corel_2))

                                    if mode=="inter":
                                        nodes.add(correl_node_id)
                                        links.add((root_node_id, correl_node_id))


    return {"nodes": create_graph_nodes(nodes, root_node_id), "links": create_graph_links(links)}
|
||||||
|
|
||||||
|
|
||||||
|
def get_obj_global_id(obj_type, obj_id, obj_sub_type=None):
    """Build the global id of an object.

    :param obj_type: object type name
    :param obj_id: object ID
    :param obj_sub_type: optional sub type
    :return: '<type>:<sub_type>:<id>' when a sub type is given, otherwise
        '<type>:<id>' with 'paste' written as 'item' and 'screenshot' as 'image'
    """
    if obj_sub_type:
        return f'{obj_type}:{obj_sub_type}:{obj_id}'

    # # TODO: remove me (legacy type names)
    if obj_type == 'paste':
        obj_type = 'item'
    elif obj_type == 'screenshot':
        obj_type = 'image'
    return f'{obj_type}:{obj_id}'
|
||||||
|
|
||||||
|
######## API EXPOSED ########
|
||||||
|
def sanitize_object_type(object_type):
    """Validate an object type for the API.

    :return: None when the type is valid, otherwise an
        (error dict, 400) tuple
    """
    if is_valid_object_type(object_type):
        return None
    return ({'status': 'error', 'reason': 'Incorrect object_type'}, 400)
|
||||||
|
######## ########
|
|
@ -843,6 +843,21 @@ def get_all_queues_stats():
|
||||||
dict_stats[queue_type] = get_stats_elem_to_crawl_by_queue_type(queue_type)
|
dict_stats[queue_type] = get_stats_elem_to_crawl_by_queue_type(queue_type)
|
||||||
return dict_stats
|
return dict_stats
|
||||||
|
|
||||||
|
def is_domain_in_queue(queue_type, domain):
    """Return whether *domain* is a member of the
    '<queue_type>_domain_crawler_queue' set."""
    queue_key = f'{queue_type}_domain_crawler_queue'
    return r_serv_onion.sismember(queue_key, domain)
|
||||||
|
|
||||||
|
def is_item_in_queue(queue_type, url, item_id, queue_name=None):
    """Tell whether the '<url>;<item_id>' entry is in a crawler queue.

    :param queue_type: value formatted into each queue key template
    :param url: url part of the queue entry
    :param item_id: item ID part of the queue entry
    :param queue_name: restrict the lookup to the queue with this name;
        None checks every queue
    :return: True if the entry is a member of one of the queues
    :rtype: bool
    """
    if queue_name is None:
        queues = get_all_queues_keys()
    else:
        queues = get_queue_key_by_name(queue_name)
    # NOTE(review): if the by-name lookup returns a single key template
    # (a str), iterating it would walk its characters - wrap it so the
    # loop always sees whole key templates.
    if isinstance(queues, str):
        queues = [queues]

    key = f'{url};{item_id}'
    return any(r_serv_onion.sismember(queue.format(queue_type), key) for queue in queues)
|
||||||
|
|
||||||
def add_item_to_discovery_queue(queue_type, domain, subdomain, url, item_id):
|
def add_item_to_discovery_queue(queue_type, domain, subdomain, url, item_id):
|
||||||
date_month = datetime.now().strftime("%Y%m")
|
date_month = datetime.now().strftime("%Y%m")
|
||||||
date = datetime.now().strftime("%Y%m%d")
|
date = datetime.now().strftime("%Y%m%d")
|
||||||
|
@ -868,6 +883,17 @@ def add_item_to_discovery_queue(queue_type, domain, subdomain, url, item_id):
|
||||||
r_serv_onion.sadd(f'{queue_type}_crawler_queue', msg)
|
r_serv_onion.sadd(f'{queue_type}_crawler_queue', msg)
|
||||||
print(f'sent to queue: {subdomain}')
|
print(f'sent to queue: {subdomain}')
|
||||||
|
|
||||||
|
def queue_test_clean_up(queue_type, domain, item_id, url=None):
    """Remove the traces a test left in the crawler queues.

    :param queue_type: queue type used to build the set names
    :param domain: domain to remove from the monthly "up" set and from the
        domain queue
    :param item_id: item ID part of the queued entries
    :param url: optional url of the queued entry. is_item_in_queue looks
        entries up as '<url>;<item_id>', so without it only the
        '<domain>;<item_id>' entry is removed, as before.
    """
    date_month = datetime.now().strftime("%Y%m")
    r_serv_onion.srem(f'month_{queue_type}_up:{date_month}', domain)

    # Clean up
    r_serv_onion.srem(f'{queue_type}_domain_crawler_queue', domain)
    queued_entries = [f'{domain};{item_id}']
    if url is not None:
        queued_entries.append(f'{url};{item_id}')
    for msg in queued_entries:
        r_serv_onion.srem(f'{queue_type}_crawler_discovery_queue', msg)
        r_serv_onion.srem(f'{queue_type}_crawler_queue', msg)
|
||||||
|
|
||||||
|
|
||||||
def remove_task_from_crawler_queue(queue_name, queue_type, key_to_remove):
|
def remove_task_from_crawler_queue(queue_name, queue_type, key_to_remove):
|
||||||
r_serv_onion.srem(queue_name.format(queue_type), key_to_remove)
|
r_serv_onion.srem(queue_name.format(queue_type), key_to_remove)
|
||||||
|
|
||||||
|
@ -1417,7 +1443,7 @@ def test_ail_crawlers():
|
||||||
|
|
||||||
#### ---- ####
|
#### ---- ####
|
||||||
|
|
||||||
if __name__ == '__main__':
|
#if __name__ == '__main__':
|
||||||
# res = get_splash_manager_version()
|
# res = get_splash_manager_version()
|
||||||
# res = test_ail_crawlers()
|
# res = test_ail_crawlers()
|
||||||
# res = is_test_ail_crawlers_successful()
|
# res = is_test_ail_crawlers_successful()
|
||||||
|
|
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,43 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
sys.path.append(os.environ['AIL_BIN'])
|
||||||
|
|
||||||
|
# Modules Classes
|
||||||
|
from Onion import Onion
|
||||||
|
|
||||||
|
# projects packages
|
||||||
|
import lib.crawlers as crawlers
|
||||||
|
|
||||||
|
class Test_Module_Onion(unittest.TestCase):
    """Tests for the Onion module: feeds one test item to compute() and
    checks what ends up in the crawler queues."""

    def setUp(self):
        self.module_obj = Onion()

    def test_module(self):
        item_id = 'tests/2021/01/01/onion.gz'
        domain_1 = 'eswpccgr5xyovsahffkehgleqthrasfpfdblwbs4lstd345dwq5qumqd.onion'
        domain_2 = 'www.facebookcorewwwi.onion'

        # start from a clean state ...
        crawlers.queue_test_clean_up('onion', domain_1, item_id)
        # ... and clean the DB afterwards even when an assertion fails
        self.addCleanup(crawlers.queue_test_clean_up, 'onion', domain_1, item_id)

        self.module_obj.compute(f'{item_id} 3')

        if not crawlers.is_crawler_activated():
            # # TODO: check warning logs
            return

        ## check domain queues
        # all domains queue
        self.assertTrue(crawlers.is_domain_in_queue('onion', domain_1))
        # all url/item queue
        self.assertTrue(crawlers.is_item_in_queue('onion', f'http://{domain_1}', item_id))
        # domain blacklist
        self.assertFalse(crawlers.is_domain_in_queue('onion', domain_2))
        # invalid onion
        self.assertFalse(crawlers.is_domain_in_queue('onion', 'invalid.onion'))
|
Loading…
Reference in New Issue