From 6328cc22b7524a0291567eed1a76d191f13b1dd1 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Fri, 28 Sep 2018 16:29:09 +0200 Subject: [PATCH] chg: [Crawler] add domains blacklist --- bin/Crawler.py | 12 +++++++++++- bin/torcrawler/blacklist_onion.txt | 1 + .../modules/hiddenServices/Flask_hiddenServices.py | 1 + .../hiddenServices/templates/hiddenServices.html | 4 ++++ 4 files changed, 17 insertions(+), 1 deletion(-) create mode 100644 bin/torcrawler/blacklist_onion.txt diff --git a/bin/Crawler.py b/bin/Crawler.py index 9ebff043..99917c49 100755 --- a/bin/Crawler.py +++ b/bin/Crawler.py @@ -130,6 +130,16 @@ if __name__ == '__main__': db=p.config.getint("ARDB_Onion", "db"), decode_responses=True) + # load domains blacklist + try: + with open(os.environ['AIL_BIN']+'/torcrawler/blacklist_onion.txt', 'r') as f: + r_onion.delete('blacklist_{}'.format(type_hidden_service)) + lines = f.read().splitlines() + for line in lines: + r_onion.sadd('blacklist_{}'.format(type_hidden_service), line) + except Exception: + pass + while True: # Recovering the streamed message informations. @@ -160,7 +170,7 @@ if __name__ == '__main__': print('domain: {}'.format(domain)) print('domain_url: {}'.format(domain_url)) - if not r_onion.sismember('banned_{}'.format(type_hidden_service), domain): + if not r_onion.sismember('blacklist_{}'.format(type_hidden_service), domain): date = datetime.datetime.now().strftime("%Y%m%d") date_month = datetime.datetime.now().strftime("%Y%m") diff --git a/bin/torcrawler/blacklist_onion.txt b/bin/torcrawler/blacklist_onion.txt new file mode 100644 index 00000000..a96b0bb8 --- /dev/null +++ b/bin/torcrawler/blacklist_onion.txt @@ -0,0 +1 @@ +www.facebookcorewwwi.onion diff --git a/var/www/modules/hiddenServices/Flask_hiddenServices.py b/var/www/modules/hiddenServices/Flask_hiddenServices.py index c504d1d3..47ea56f1 100644 --- a/var/www/modules/hiddenServices/Flask_hiddenServices.py +++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py @@ -75,6 +75,7 @@ def hiddenServices_page(): statDomains['domains_up'] = r_serv_onion.scard('onion_up:{}'.format(date)) statDomains['domains_down'] = r_serv_onion.scard('onion_down:{}'.format(date)) statDomains['total'] = statDomains['domains_up'] + statDomains['domains_down'] + statDomains['domains_queue'] = r_serv_onion.scard('onion_domain_crawler_queue') for onion in last_onions: metadata_onion = {} diff --git a/var/www/modules/hiddenServices/templates/hiddenServices.html b/var/www/modules/hiddenServices/templates/hiddenServices.html index 1839c5ef..59aeb2ae 100644 --- a/var/www/modules/hiddenServices/templates/hiddenServices.html +++ b/var/www/modules/hiddenServices/templates/hiddenServices.html @@ -120,6 +120,10 @@ Crawled Domains {{ statDomains['total'] }} + + + Domains in Queue + {{ statDomains['domains_queue'] }}