chg: [Onion] add onion splash crawler

pull/260/head
Terrtia 2018-08-09 17:42:21 +02:00
parent 54cc4f3723
commit 8b1c10b38c
7 changed files with 319 additions and 2 deletions

bin/Crawler.py Executable file

@@ -0,0 +1,92 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*

import os
import sys
import redis
import datetime
import time
import subprocess

sys.path.append(os.environ['AIL_BIN'])
from Helper import Process
from pubsublogger import publisher

def signal_handler(sig, frame):
    sys.exit(0)

if __name__ == '__main__':

    publisher.port = 6380
    publisher.channel = "Script"
    publisher.info("Script Crawler started")

    config_section = 'Crawler'

    # Setup the I/O queues
    p = Process(config_section)

    splash_url = p.config.get("Crawler", "splash_url")
    http_proxy = p.config.get("Crawler", "http_proxy")
    crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit")

    #signal.signal(signal.SIGINT, signal_handler)

    r_serv_metadata = redis.StrictRedis(
        host=p.config.get("ARDB_Metadata", "host"),
        port=p.config.getint("ARDB_Metadata", "port"),
        db=p.config.getint("ARDB_Metadata", "db"),
        decode_responses=True)

    r_cache = redis.StrictRedis(
        host=p.config.get("Redis_Cache", "host"),
        port=p.config.getint("Redis_Cache", "port"),
        db=p.config.getint("Redis_Cache", "db"),
        decode_responses=True)

    r_onion = redis.StrictRedis(
        host=p.config.get("ARDB_Onion", "host"),
        port=p.config.getint("ARDB_Onion", "port"),
        db=p.config.getint("ARDB_Onion", "db"),
        decode_responses=True)

    while True:

        message = p.get_from_set()
        # Recovering the streamed message information: "url;paste_path"
        if message is not None:
            splitted = message.split(';')
            if len(splitted) == 2:
                url, paste = splitted
                print(url)

                if not r_cache.exists(url):
                    super_father = r_serv_metadata.hget('paste_metadata:'+paste, 'super_father')
                    if super_father is None:
                        super_father = paste

                    # crawl the url in a subprocess and wait for it to finish
                    process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', url, paste, super_father],
                                               stdout=subprocess.PIPE)
                    while process.poll() is None:
                        time.sleep(1)

                    date = datetime.datetime.now().strftime("%Y%m%d")
                    print(date)
                    url_domain = url.replace('http://', '')
                    if process.returncode == 0:
                        # children pastes were saved: tag the parent paste and mark the domain as up
                        if r_serv_metadata.exists('paste_children:'+paste):
                            msg = 'infoleak:automatic-detection="onion";{}'.format(paste)
                            p.populate_set_out(msg, 'Tags')
                        r_onion.sadd('onion_up:'+date, url_domain)
                    else:
                        r_onion.sadd('onion_down:'+date, url_domain)
                        print(process.stdout.read())
            else:
                continue
        else:
            time.sleep(1)


@@ -21,7 +21,6 @@ Requirements
*Need the ZMQ_Sub_Onion_Q Module running to be able to work properly.
"""
import pprint
import time
from packages import Paste
from pubsublogger import publisher
@@ -123,6 +122,7 @@ if __name__ == "__main__":
            PST = Paste.Paste(filename)

            for x in PST.get_regex(url_regex):
                print(x)
                # Extracting url with regex
                url, s, credential, subdomain, domain, host, port, \
                    resource_path, query_string, f1, f2, f3, f4 = x
@@ -149,12 +149,18 @@ if __name__ == "__main__":
                to_print = 'Onion;{};{};{};'.format(PST.p_source,
                                                    PST.p_date,
                                                    PST.p_name)

                '''
                for url in fetch(p, r_cache, urls, domains_list, path):
                    publisher.info('{}Checked {};{}'.format(to_print, url, PST.p_path))
                    p.populate_set_out('onion;{}'.format(PST.p_path), 'alertHandler')
                    msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_path)
                    p.populate_set_out(msg, 'Tags')
                '''
                for url in urls:
                    msg = '{};{}'.format(url, PST.p_path)
                    print('send to crawler')
                    p.populate_set_out(msg, 'Crawler')

            else:
                publisher.info('{}Onion related;{}'.format(to_print, PST.p_path))


@@ -3,6 +3,8 @@ bloomfilters = Blooms
dicofilters = Dicos
pastes = PASTES
base64 = BASE64
crawled = crawled
crawled_screenshot = CRAWLED_SCREENSHOT
wordtrending_csv = var/www/static/csv/wordstrendingdata
wordsfile = files/wordfile
@@ -171,6 +173,11 @@ host = localhost
port = 6382
db = 8

[ARDB_Onion]
host = localhost
port = 6382
db = 9

[Url]
cc_critical = DE
@@ -215,3 +222,8 @@ channel = FetchedOnion
host = localhost
port = 6381
db = 0

[Crawler]
crawler_depth_limit = 1
splash_url = http://127.0.0.1:8050
http_proxy = http://127.0.0.1:9050
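These [Crawler] defaults assume a Splash instance listening on 127.0.0.1:8050 and a local Tor proxy on port 9050; the commit itself ships no deployment script. A minimal, hypothetical sanity check (not part of this commit; the URL is taken from the splash_url value above) that Splash answers before bin/Crawler.py is started:

    #!/usr/bin/env python3
    # check_splash.py -- hypothetical helper, not part of this commit
    from urllib.request import urlopen

    SPLASH_URL = 'http://127.0.0.1:8050'  # must match splash_url in config.cfg

    # Splash serves its web UI on the same port, so a plain GET is enough here
    with urlopen(SPLASH_URL, timeout=10) as response:
        print('Splash reachable, HTTP status:', response.status)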


@@ -61,7 +61,7 @@ publish = Redis_Duplicate,Redis_ModuleStats,Redis_alertHandler,Redis_Tags

[Onion]
subscribe = Redis_Onion
-publish = Redis_ValidOnion,ZMQ_FetchedOnion,Redis_alertHandler,Redis_Tags
+publish = Redis_ValidOnion,ZMQ_FetchedOnion,Redis_alertHandler,Redis_Tags,Redis_Crawler
#publish = Redis_Global,Redis_ValidOnion,ZMQ_FetchedOnion,Redis_alertHandler

[DumpValidOnion]
@@ -136,3 +136,8 @@ publish = Redis_Duplicate,Redis_alertHandler,Redis_Tags

[submit_paste]
subscribe = Redis
publish = Redis_Mixer

[Crawler]
subscribe = Redis_Crawler
publish = Redis_Mixer,Redis_Tags
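With this wiring, the Onion module publishes to the Redis_Crawler queue and the new Crawler module consumes it; as the code above shows, the payload is a single 'url;paste_path' string. A small illustration of that round trip (the URL and paste path values here are hypothetical):

    # hypothetical values, for illustration only
    url = 'http://example.onion'
    paste_path = 'PASTES/2018/08/09/example.gz'

    msg = '{};{}'.format(url, paste_path)  # built by Onion.py before populate_set_out(msg, 'Crawler')
    crawl_url, paste = msg.split(';')      # parsed back in bin/Crawler.py
    assert (crawl_url, paste) == (url, paste_path)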


@@ -0,0 +1,165 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*

import os
import sys
import gzip
import base64
import uuid
import datetime
import redis

from urllib.parse import urlparse
from scrapy import Spider
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess, Crawler

from scrapy_splash import SplashRequest

sys.path.append(os.environ['AIL_BIN'])
from Helper import Process

class TorSplashCrawler():

    def __init__(self, splash_url, http_proxy, crawler_depth_limit):
        self.process = CrawlerProcess({'LOG_ENABLED': False})
        self.crawler = Crawler(self.TorSplashSpider, {
            'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0',
            'SPLASH_URL': splash_url,
            'HTTP_PROXY': http_proxy,
            'ROBOTSTXT_OBEY': False,
            'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
                                       'scrapy_splash.SplashMiddleware': 725,
                                       'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
                                       },
            'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,},
            'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
            'DEPTH_LIMIT': crawler_depth_limit
            })

    def crawl(self, url, original_paste, super_father):
        self.process.crawl(self.crawler, url=url, original_paste=original_paste, super_father=super_father)
        self.process.start()

    class TorSplashSpider(Spider):
        name = 'TorSplashSpider'

        def __init__(self, url, original_paste, super_father, *args, **kwargs):
            self.original_paste = original_paste
            self.super_father = super_father
            self.start_urls = url
            self.domains = [urlparse(url).netloc]
            date = datetime.datetime.now().strftime("%Y/%m/%d")

            config_section = 'Crawler'
            self.p = Process(config_section)

            self.r_cache = redis.StrictRedis(
                host=self.p.config.get("Redis_Cache", "host"),
                port=self.p.config.getint("Redis_Cache", "port"),
                db=self.p.config.getint("Redis_Cache", "db"),
                decode_responses=True)

            self.r_serv_log_submit = redis.StrictRedis(
                host=self.p.config.get("Redis_Log_submit", "host"),
                port=self.p.config.getint("Redis_Log_submit", "port"),
                db=self.p.config.getint("Redis_Log_submit", "db"),
                decode_responses=True)

            self.r_serv_metadata = redis.StrictRedis(
                host=self.p.config.get("ARDB_Metadata", "host"),
                port=self.p.config.getint("ARDB_Metadata", "port"),
                db=self.p.config.getint("ARDB_Metadata", "db"),
                decode_responses=True)

            self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"),
                                                       self.p.config.get("Directories", "crawled"), date )

            self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date )

        def start_requests(self):
            yield SplashRequest(
                self.start_urls,
                self.parse,
                endpoint='render.json',
                meta={'parent': self.original_paste},
                args={ 'html': 1,
                       'wait': 10,
                       'render_all': 1,
                       'png': 1}
                )

        def parse(self, response):
            print(response.headers)
            print(response.status)

            # mark this url as crawled in the cache
            self.r_cache.setbit(response.url, 0, 1)
            self.r_cache.expire(response.url, 360000)

            UUID = self.domains[0]+str(uuid.uuid4())
            filename_paste = os.path.join(self.crawled_paste_filemame, UUID)
            filename_screenshot = os.path.join(self.crawled_screenshot, UUID +'.png')

            # save new paste on disk
            if self.save_crawled_paste(filename_paste, response.data['html']):

                self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father)
                self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['parent'])
                self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'real_link', response.url)

                self.r_serv_metadata.sadd('paste_children:'+response.meta['parent'], filename_paste)

                dirname = os.path.dirname(filename_screenshot)
                if not os.path.exists(dirname):
                    os.makedirs(dirname)

                with open(filename_screenshot, 'wb') as f:
                    f.write(base64.standard_b64decode(response.data['png'].encode()))

                # save external links in set
                lext = LinkExtractor(deny_domains=self.domains, unique=True)
                for link in lext.extract_links(response):
                    self.r_serv_metadata.sadd('paste_crawler:'+filename_paste, link.url)

                # follow in-domain links through Splash
                #le = LinkExtractor(unique=True)
                le = LinkExtractor(allow_domains=self.domains, unique=True)
                for link in le.extract_links(response):
                    self.r_cache.setbit(link.url, 0, 0)
                    self.r_cache.expire(link.url, 360000)
                    yield SplashRequest(
                        link.url,
                        self.parse,
                        endpoint='render.json',
                        meta={'parent': UUID},
                        args={ 'html': 1,
                               'png': 1,
                               'render_all': 1,
                               'wait': 10}
                        )

        def save_crawled_paste(self, filename, content):

            print(filename)
            if os.path.isfile(filename):
                print('File: {} already exists in submitted pastes'.format(filename))
                return False

            try:
                gzipencoded = gzip.compress(content.encode())
                gzip64encoded = base64.standard_b64encode(gzipencoded).decode()
            except:
                print("file error: {}".format(filename))
                return False

            # send paste to Global
            relay_message = "{0} {1}".format(filename, gzip64encoded)
            self.p.populate_set_out(relay_message, 'Mixer')

            # increase nb of paste by feeder name
            self.r_serv_log_submit.hincrby("mixer_cache:list_feeder", "crawler", 1)

            # tag crawled paste
            msg = 'infoleak:submission="crawler";{}'.format(filename)
            self.p.populate_set_out(msg, 'Tags')
            return True

bin/torcrawler/tor_crawler.py Executable file

@@ -0,0 +1,33 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*

import os
import sys
import configparser
from TorSplashCrawler import TorSplashCrawler

if __name__ == '__main__':

    if len(sys.argv) != 4:
        print('usage:', 'tor_crawler.py', 'url', 'paste', 'super_father')
        exit(1)

    configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
    if not os.path.exists(configfile):
        raise Exception('Unable to find the configuration file. \
                        Did you set environment variables? \
                        Or activate the virtualenv.')

    cfg = configparser.ConfigParser()
    cfg.read(configfile)

    splash_url = cfg.get("Crawler", "splash_url")
    http_proxy = cfg.get("Crawler", "http_proxy")
    crawler_depth_limit = cfg.getint("Crawler", "crawler_depth_limit")

    url = sys.argv[1]
    paste = sys.argv[2]
    super_father = sys.argv[3]

    crawler = TorSplashCrawler(splash_url, http_proxy, crawler_depth_limit)
    crawler.crawl(url, paste, super_father)


@@ -0,0 +1,4 @@
[proxy]
host=localhost
port=9050
type=SOCKS5
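This proxy profile points Splash at a local Tor SOCKS5 daemon on port 9050. A quick, hypothetical check (not part of this commit; host and port taken from the [proxy] profile above) that the Tor proxy is actually listening before crawls are launched:

    #!/usr/bin/env python3
    # check_tor_proxy.py -- hypothetical helper, not part of this commit
    import socket

    # a plain TCP connect only proves the port is open,
    # not that Tor itself is healthy
    try:
        with socket.create_connection(('localhost', 9050), timeout=5):
            print('Tor SOCKS proxy is listening on localhost:9050')
    except OSError as e:
        print('Tor SOCKS proxy unreachable:', e)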