chg: [Onion] change onion regex, fix crawler

pull/260/head
Terrtia 2018-08-13 09:23:14 +02:00
parent 8b1c10b38c
commit 7652089433
No known key found for this signature in database
GPG Key ID: 1E1B1F50D84613D0
4 changed files with 82 additions and 34 deletions

View File

@ -3,6 +3,7 @@
import os import os
import sys import sys
import re
import redis import redis
import datetime import datetime
import time import time
@ -16,6 +17,33 @@ from pubsublogger import publisher
def signal_handler(sig, frame): def signal_handler(sig, frame):
sys.exit(0) sys.exit(0)
def crawl_onion(url, domain):
    """Crawl one onion URL with the Tor crawler and record the outcome.

    Nothing is done when ``domain`` is already a member of today's
    ``onion_up:<YYYYMMDD>`` set. Otherwise ``./torcrawler/tor_crawler.py`` is
    run as a child process and, depending on its exit code, the domain and
    URL are added to the ``onion_up``/``onion_up_link`` sets (exit code 0) or
    to the ``onion_down``/``onion_down_link`` sets (any other exit code).

    Relies on module-level names defined outside this function: ``paste``,
    ``p``, ``r_onion`` and ``r_serv_metadata``.

    :param url: URL handed to the crawler.
    :param domain: onion domain of ``url``; used as the member of the
        per-day up/down sets.
    """
    date = datetime.datetime.now().strftime("%Y%m%d")

    # Domain already crawled successfully today: nothing to do.
    if r_onion.sismember('onion_up:'+date , domain):
        return

    super_father = r_serv_metadata.hget('paste_metadata:'+paste, 'super_father')
    if super_father is None:
        super_father = paste

    process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', url, domain, paste, super_father],
                               stdout=subprocess.PIPE)
    # communicate() drains stdout while waiting for the child to exit.
    # Polling the exit status without reading the pipe can hang forever once
    # the child has written more than the OS pipe buffer holds.
    output, _ = process.communicate()

    if process.returncode == 0:
        # Tag the source paste only when the crawl stored at least one child.
        if r_serv_metadata.exists('paste_children:'+paste):
            msg = 'infoleak:automatic-detection="onion";{}'.format(paste)
            p.populate_set_out(msg, 'Tags')
        print(output)

        r_onion.sadd('onion_up:'+date , domain)
        r_onion.sadd('onion_up_link:'+date , url)
    else:
        r_onion.sadd('onion_down:'+date , domain)
        r_onion.sadd('onion_down_link:'+date , url)
        print(output)
if __name__ == '__main__': if __name__ == '__main__':
publisher.port = 6380 publisher.port = 6380
@ -52,6 +80,9 @@ if __name__ == '__main__':
db=p.config.getint("ARDB_Onion", "db"), db=p.config.getint("ARDB_Onion", "db"),
decode_responses=True) decode_responses=True)
url_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
re.compile(url_regex)
while True: while True:
message = p.get_from_set() message = p.get_from_set()
@ -61,30 +92,24 @@ if __name__ == '__main__':
if len(splitted) == 2: if len(splitted) == 2:
url, paste = splitted url, paste = splitted
print(url) url_list = re.findall(url_regex, url)[0]
if url_list[1] == '':
url= 'http://{}'.format(url)
if not r_cache.exists(url): link, s, credential, subdomain, domain, host, port, \
super_father = r_serv_metadata.hget('paste_metadata:'+paste, 'super_father') resource_path, query_string, f1, f2, f3, f4 = url_list
if super_father is None: domain = url_list[4]
super_father=paste
process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', url, paste, super_father], domain_url = 'http://{}'.format(domain)
stdout=subprocess.PIPE)
while process.poll() is None:
time.sleep(1)
date = datetime.datetime.now().strftime("%Y%m%d") print('------------------START ONIOM CRAWLER------------------')
print(date) print('url: {}'.format(url))
url_domain = url.replace('http://', '') print('domain: {}'.format(domain))
if process.returncode == 0: print('domain_url: {}'.format(domain_url))
if r_serv_metadata.exists('paste_children:'+paste):
msg = 'infoleak:automatic-detection="onion";{}'.format(paste)
p.populate_set_out(msg, 'Tags')
r_onion.sadd('onion_up:'+date , url_domain) crawl_onion(url, domain)
else: if url != domain_url:
r_onion.sadd('onion_down:'+date , url_domain) crawl_onion(domain_url, domain)
print(process.stdout.read())
else: else:
continue continue

View File

@ -108,7 +108,7 @@ if __name__ == "__main__":
# Thanks to Faup project for this regex # Thanks to Faup project for this regex
# https://github.com/stricaud/faup # https://github.com/stricaud/faup
url_regex = "((http|https|ftp)\://([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)" url_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
while True: while True:
if message is not None: if message is not None:

View File

@ -9,14 +9,11 @@ import uuid
import datetime import datetime
import base64 import base64
import redis import redis
from urllib.parse import urlparse
from scrapy import Spider from scrapy import Spider
from scrapy.linkextractors import LinkExtractor from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess, Crawler from scrapy.crawler import CrawlerProcess, Crawler
from twisted.internet import reactor
from scrapy_splash import SplashRequest from scrapy_splash import SplashRequest
sys.path.append(os.environ['AIL_BIN']) sys.path.append(os.environ['AIL_BIN'])
@ -40,19 +37,20 @@ class TorSplashCrawler():
'DEPTH_LIMIT': crawler_depth_limit 'DEPTH_LIMIT': crawler_depth_limit
}) })
def crawl(self, url, original_paste, super_father): def crawl(self, url, domain, original_paste, super_father):
self.process.crawl(self.crawler, url=url, original_paste=original_paste, super_father=super_father) self.process.crawl(self.crawler, url=url, domain=domain,original_paste=original_paste, super_father=super_father)
self.process.start() self.process.start()
class TorSplashSpider(Spider): class TorSplashSpider(Spider):
name = 'TorSplashSpider' name = 'TorSplashSpider'
def __init__(self, url, original_paste, super_father, *args, **kwargs): def __init__(self, url, domain,original_paste, super_father, *args, **kwargs):
self.original_paste = original_paste self.original_paste = original_paste
self.super_father = super_father self.super_father = super_father
self.start_urls = url self.start_urls = url
self.domains = [urlparse(url).netloc] self.domains = [domain]
date = datetime.datetime.now().strftime("%Y/%m/%d") date = datetime.datetime.now().strftime("%Y/%m/%d")
self.full_date = datetime.datetime.now().strftime("%Y%m%d")
config_section = 'Crawler' config_section = 'Crawler'
self.p = Process(config_section) self.p = Process(config_section)
@ -75,6 +73,12 @@ class TorSplashCrawler():
db=self.p.config.getint("ARDB_Metadata", "db"), db=self.p.config.getint("ARDB_Metadata", "db"),
decode_responses=True) decode_responses=True)
self.r_serv_onion = redis.StrictRedis(
host=self.p.config.get("ARDB_Onion", "host"),
port=self.p.config.getint("ARDB_Onion", "port"),
db=self.p.config.getint("ARDB_Onion", "db"),
decode_responses=True)
self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"), self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"),
self.p.config.get("Directories", "crawled"), date ) self.p.config.get("Directories", "crawled"), date )
@ -96,6 +100,7 @@ class TorSplashCrawler():
print(response.headers) print(response.headers)
print(response.status) print(response.status)
# # TODO: # FIXME:
self.r_cache.setbit(response.url, 0, 1) self.r_cache.setbit(response.url, 0, 1)
self.r_cache.expire(response.url, 360000) self.r_cache.expire(response.url, 360000)
@ -105,8 +110,19 @@ class TorSplashCrawler():
# save new paste on disk # save new paste on disk
if self.save_crawled_paste(filename_paste, response.data['html']): if self.save_crawled_paste(filename_paste, response.data['html']):
# create onion metadata
# The spider keeps its domain in the list attribute `self.domains` (set in
# __init__); `self.domain` is never assigned and would raise AttributeError.
onion_metadata_key = 'onion_metadata:{}'.format(self.domains[0])
# first_seen is written only once, when the domain has no metadata hash yet.
if not self.r_serv_onion.exists(onion_metadata_key):
    self.r_serv_onion.hset(onion_metadata_key, 'first_seen', self.full_date)
# last_seen is refreshed on every saved page.
# NOTE(review): indentation is lost in this view; this assumes the last_seen
# update sits outside the `if` above -- confirm against the real file.
self.r_serv_onion.hset(onion_metadata_key, 'last_seen', self.full_date)

# add onion screenshot history
self.r_serv_onion.sadd('onion_history:{}'.format(self.domains[0]), self.full_date)
#create paste metadata
self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father) self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father)
self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['parent']) self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['parent'])
self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'domain', self.domains[0])
self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'real_link', response.url) self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'real_link', response.url)
self.r_serv_metadata.sadd('paste_children:'+response.meta['parent'], filename_paste) self.r_serv_metadata.sadd('paste_children:'+response.meta['parent'], filename_paste)
@ -114,6 +130,13 @@ class TorSplashCrawler():
dirname = os.path.dirname(filename_screenshot) dirname = os.path.dirname(filename_screenshot)
if not os.path.exists(dirname): if not os.path.exists(dirname):
os.makedirs(dirname) os.makedirs(dirname)
print(sys.getsizeof(response.data['png']))
print(sys.getsizeof(response.data['html']))
print(self.domains[0])
with open(filename_screenshot, 'wb') as f: with open(filename_screenshot, 'wb') as f:
f.write(base64.standard_b64decode(response.data['png'].encode())) f.write(base64.standard_b64decode(response.data['png'].encode()))
@ -140,7 +163,6 @@ class TorSplashCrawler():
def save_crawled_paste(self, filename, content): def save_crawled_paste(self, filename, content):
print(filename)
if os.path.isfile(filename): if os.path.isfile(filename):
print('File: {} already exist in submitted pastes'.format(filename)) print('File: {} already exist in submitted pastes'.format(filename))
return False return False

View File

@ -8,8 +8,8 @@ from TorSplashCrawler import TorSplashCrawler
if __name__ == '__main__': if __name__ == '__main__':
if len(sys.argv) != 4: if len(sys.argv) != 5:
print('usage:', 'tor_crawler.py', 'url', 'paste', 'super_father') print('usage:', 'tor_crawler.py', 'url', 'domain', 'paste', 'super_father')
exit(1) exit(1)
configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg') configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
@ -26,8 +26,9 @@ if __name__ == '__main__':
crawler_depth_limit = cfg.getint("Crawler", "crawler_depth_limit") crawler_depth_limit = cfg.getint("Crawler", "crawler_depth_limit")
url = sys.argv[1] url = sys.argv[1]
paste = sys.argv[2] domain = sys.argv[2]
super_father = sys.argv[3] paste = sys.argv[3]
super_father = sys.argv[4]
crawler = TorSplashCrawler(splash_url, http_proxy, crawler_depth_limit) crawler = TorSplashCrawler(splash_url, http_proxy, crawler_depth_limit)
crawler.crawl(url, paste, super_father) crawler.crawl(url, domain, paste, super_father)