mirror of https://github.com/CIRCL/AIL-framework
fix: [Crawler] fix onion blacklist + add crawler info
parent
1a1fda4c47
commit
bb301a870c
|
@ -10,6 +10,8 @@ import time
|
|||
import subprocess
|
||||
import requests
|
||||
|
||||
from pyfaup.faup import Faup
|
||||
|
||||
sys.path.append(os.environ['AIL_BIN'])
|
||||
from Helper import Process
|
||||
from pubsublogger import publisher
|
||||
|
@ -22,6 +24,9 @@ def on_error_send_message_back_in_queue(type_hidden_service, domain, message):
|
|||
|
||||
def crawl_onion(url, domain, date, date_month, message):
|
||||
|
||||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'crawling_domain', domain)
|
||||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
|
||||
|
||||
#if not r_onion.sismember('full_onion_up', domain) and not r_onion.sismember('onion_down:'+date , domain):
|
||||
super_father = r_serv_metadata.hget('paste_metadata:'+paste, 'super_father')
|
||||
if super_father is None:
|
||||
|
@ -43,13 +48,15 @@ def crawl_onion(url, domain, date, date_month, message):
|
|||
print('--------------------------------------')
|
||||
print(' \033[91m DOCKER SPLASH DOWN\033[0m')
|
||||
print(' {} DOWN'.format(splash_url))
|
||||
exit(1)
|
||||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'SPLASH DOWN')
|
||||
nb_retry == 0
|
||||
|
||||
print(' \033[91m DOCKER SPLASH NOT AVAILABLE\033[0m')
|
||||
print(' Retry({}) in 10 seconds'.format(nb_retry))
|
||||
time.sleep(10)
|
||||
|
||||
if r.status_code == 200:
|
||||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Crawling')
|
||||
process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', splash_url, type_hidden_service, url, domain, paste, super_father],
|
||||
stdout=subprocess.PIPE)
|
||||
while process.poll() is None:
|
||||
|
@ -67,6 +74,7 @@ def crawl_onion(url, domain, date, date_month, message):
|
|||
print('')
|
||||
print(' PROXY DOWN OR BAD CONFIGURATION\033[0m'.format(splash_url))
|
||||
print('------------------------------------------------------------------------')
|
||||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Error')
|
||||
exit(-2)
|
||||
else:
|
||||
print(process.stdout.read())
|
||||
|
@ -76,6 +84,7 @@ def crawl_onion(url, domain, date, date_month, message):
|
|||
print('--------------------------------------')
|
||||
print(' \033[91m DOCKER SPLASH DOWN\033[0m')
|
||||
print(' {} DOWN'.format(splash_url))
|
||||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Crawling')
|
||||
exit(1)
|
||||
|
||||
|
||||
|
@ -119,6 +128,7 @@ if __name__ == '__main__':
|
|||
print('splash url: {}'.format(splash_url))
|
||||
|
||||
crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit")
|
||||
faup = Faup()
|
||||
|
||||
PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes"))
|
||||
|
||||
|
@ -140,6 +150,10 @@ if __name__ == '__main__':
|
|||
db=p.config.getint("ARDB_Onion", "db"),
|
||||
decode_responses=True)
|
||||
|
||||
r_cache.sadd('all_crawler:{}'.format(type_hidden_service), splash_port)
|
||||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
|
||||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
|
||||
|
||||
# load domains blacklist
|
||||
try:
|
||||
with open(os.environ['AIL_BIN']+'/torcrawler/blacklist_onion.txt', 'r') as f:
|
||||
|
@ -180,7 +194,10 @@ if __name__ == '__main__':
|
|||
print('domain: {}'.format(domain))
|
||||
print('domain_url: {}'.format(domain_url))
|
||||
|
||||
if not r_onion.sismember('blacklist_{}'.format(type_hidden_service), domain):
|
||||
faup.decode(domain)
|
||||
onion_domain=faup.get()['domain'].decode()
|
||||
|
||||
if not r_onion.sismember('blacklist_{}'.format(type_hidden_service), domain) and not r_onion.sismember('blacklist_{}'.format(type_hidden_service), onion_domain):
|
||||
|
||||
date = datetime.datetime.now().strftime("%Y%m%d")
|
||||
date_month = datetime.datetime.now().strftime("%Y%m")
|
||||
|
@ -243,6 +260,10 @@ if __name__ == '__main__':
|
|||
r_onion.lpush('last_{}'.format(type_hidden_service), domain)
|
||||
r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15)
|
||||
|
||||
#update crawler status
|
||||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
|
||||
r_cache.hrem('metadata_crawler:{}'.format(splash_port), 'crawling_domain')
|
||||
|
||||
else:
|
||||
continue
|
||||
else:
|
||||
|
|
|
@ -30,6 +30,12 @@ r_serv = redis.StrictRedis(
|
|||
db=cfg.getint("Redis_Queues", "db"),
|
||||
decode_responses=True)
|
||||
|
||||
r_cache = redis.StrictRedis(
|
||||
host=cfg.get("Redis_Cache", "host"),
|
||||
port=cfg.getint("Redis_Cache", "port"),
|
||||
db=cfg.getint("Redis_Cache", "db"),
|
||||
decode_responses=True)
|
||||
|
||||
r_serv_log = redis.StrictRedis(
|
||||
host=cfg.get("Redis_Log", "host"),
|
||||
port=cfg.getint("Redis_Log", "port"),
|
||||
|
|
|
@ -19,6 +19,7 @@ import Flask_config
|
|||
app = Flask_config.app
|
||||
cfg = Flask_config.cfg
|
||||
baseUrl = Flask_config.baseUrl
|
||||
r_cache = Flask_config.r_cache
|
||||
r_serv_onion = Flask_config.r_serv_onion
|
||||
r_serv_metadata = Flask_config.r_serv_metadata
|
||||
bootstrap_label = Flask_config.bootstrap_label
|
||||
|
@ -102,8 +103,22 @@ def hiddenServices_page():
|
|||
metadata_onion['status_icon'] = 'fa-times-circle'
|
||||
list_onion.append(metadata_onion)
|
||||
|
||||
crawler_metadata=[]
|
||||
all_onion_crawler = r_cache.smembers('all_crawler:onion')
|
||||
for crawler in all_onion_crawler:
|
||||
crawling_domain = r_cache.hget('metadata_crawler:{}'.format(splash_port), 'crawling_domain')
|
||||
started_time = r_cache.hget('metadata_crawler:{}'.format(splash_port), 'started_time')
|
||||
status_info = r_cache.hget('metadata_crawler:{}'.format(splash_port), 'status')
|
||||
crawler_info = '{} - {}'.format(crawler, started_time)
|
||||
if status_info=='Waiting' or status_info=='Crawling':
|
||||
status=True
|
||||
else:
|
||||
status=False
|
||||
crawler_metadata.append({'crawler_info': crawler, 'crawling_domain': crawling_domain, 'status_info': status_info, 'status': status})
|
||||
|
||||
date_string = '{}-{}-{}'.format(date[0:4], date[4:6], date[6:8])
|
||||
return render_template("hiddenServices.html", last_onions=list_onion, statDomains=statDomains, date_from=date_string, date_to=date_string)
|
||||
return render_template("hiddenServices.html", last_onions=list_onion, statDomains=statDomains,
|
||||
crawler_metadata=crawler_metadata, date_from=date_string, date_to=date_string)
|
||||
|
||||
@hiddenServices.route("/hiddenServices/last_crawled_domains_with_stats_json", methods=['GET'])
|
||||
def last_crawled_domains_with_stats_json():
|
||||
|
|
|
@ -142,7 +142,6 @@
|
|||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
<div class="panel panel-info">
|
||||
<div class="panel-heading">
|
||||
<i class="fa fa-eye-slash"></i> Domains Crawled Today
|
||||
|
@ -203,6 +202,33 @@
|
|||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
|
||||
{%if crawler_metadata%}
|
||||
<div class="panel panel-info">
|
||||
<div class="panel-heading">
|
||||
Crawlers Status
|
||||
</div>
|
||||
|
||||
<table class="table table-hover table-striped">
|
||||
<tbody>
|
||||
{% for crawler in crawler_metadata %}
|
||||
<tr>
|
||||
<td>
|
||||
<i class="fa fa-{%if crawler['status']%}check{%else%}times{%endif%}-circle fa-2x" style="color:{%if crawler['status']%}Green{%else%}Red{%endif%};"></i> {{crawler['crawler_info']}}
|
||||
</td>
|
||||
<td>
|
||||
{{crawler['crawling_domain']}}
|
||||
</td>
|
||||
<td style="color:{%if crawler['status']%}Green{%else%}Red{%endif%};">
|
||||
{{crawler['status_info']}}
|
||||
</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
{%endif%}
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
|
Loading…
Reference in New Issue