# AIL-framework/bin/torcrawler/TorSplashCrawler.py
#
# 166 lines
# 6.8 KiB
# Python

#!/usr/bin/env python3
# -*-coding:UTF-8 -*
import os
import sys
import gzip
import base64
import uuid
import datetime
import base64
import redis
from urllib.parse import urlparse
from scrapy import Spider
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess, Crawler
from twisted.internet import reactor
from scrapy_splash import SplashRequest
sys.path.append(os.environ['AIL_BIN'])
from Helper import Process
class TorSplashCrawler():
    """Run a Scrapy crawl whose pages are rendered by a Splash instance.

    The object owns one ``CrawlerProcess`` and one ``Crawler`` bound to the
    nested ``TorSplashSpider``. Each rendered page is forwarded to the
    'Mixer' and 'Tags' output queues and described in the metadata store.
    """

    def __init__(self, splash_url, http_proxy, crawler_depth_limit):
        """Build the Scrapy process and the crawler settings.

        Args:
            splash_url: value stored in the ``SPLASH_URL`` setting.
            http_proxy: value stored in the ``HTTP_PROXY`` setting.
            crawler_depth_limit: value stored in the ``DEPTH_LIMIT`` setting.
        """
        self.process = CrawlerProcess({'LOG_ENABLED': False})
        self.crawler = Crawler(self.TorSplashSpider, {
            'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0',
            'SPLASH_URL': splash_url,
            'HTTP_PROXY': http_proxy,
            'ROBOTSTXT_OBEY': False,
            'DOWNLOADER_MIDDLEWARES': {
                'scrapy_splash.SplashCookiesMiddleware': 723,
                'scrapy_splash.SplashMiddleware': 725,
                'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
            },
            'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100},
            'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
            'DEPTH_LIMIT': crawler_depth_limit,
        })

    def crawl(self, url, original_paste, super_father):
        """Schedule the spider on *url* and start the Scrapy process.

        Args:
            url: single start URL handed to the spider.
            original_paste: identifier recorded as the parent of the first page.
            super_father: identifier recorded as 'super_father' of every page.
        """
        self.process.crawl(self.crawler, url=url, original_paste=original_paste,
                           super_father=super_father)
        self.process.start()

    class TorSplashSpider(Spider):
        """Spider that renders pages with Splash and follows same-domain links."""

        name = 'TorSplashSpider'

        # Lifetime, in seconds, of the per-URL marker kept in the cache.
        _CACHE_TTL_SECONDS = 360000

        def __init__(self, url, original_paste, super_father, *args, **kwargs):
            """Store the crawl parameters and open the Redis connections.

            Args:
                url: single start URL (a string, not a list).
                original_paste: parent identifier attached to the first request.
                super_father: value stored as 'super_father' for every page.
            """
            # Let the scrapy base class run its own initialisation first.
            super().__init__(*args, **kwargs)

            self.original_paste = original_paste
            self.super_father = super_father
            # NOTE: kept as a plain string; start_requests() below consumes it
            # directly, so scrapy's default list semantics do not apply here.
            self.start_urls = url
            self.domains = [urlparse(url).netloc]
            date = datetime.datetime.now().strftime("%Y/%m/%d")

            config_section = 'Crawler'
            self.p = Process(config_section)

            self.r_cache = self._connect_redis("Redis_Cache")
            self.r_serv_log_submit = self._connect_redis("Redis_Log_submit")
            self.r_serv_metadata = self._connect_redis("ARDB_Metadata")

            # Both output locations are partitioned by the date the spider
            # was created (YYYY/MM/DD).
            self.crawled_paste_filemame = os.path.join(
                os.environ['AIL_HOME'],
                self.p.config.get("Directories", "pastes"),
                self.p.config.get("Directories", "crawled"),
                date)
            self.crawled_screenshot = os.path.join(
                os.environ['AIL_HOME'],
                self.p.config.get("Directories", "crawled_screenshot"),
                date)

        def _connect_redis(self, section):
            """Return a client built from the host/port/db options of *section*."""
            return redis.StrictRedis(
                host=self.p.config.get(section, "host"),
                port=self.p.config.getint(section, "port"),
                db=self.p.config.getint(section, "db"),
                decode_responses=True)

        def _splash_request(self, url, parent):
            """Return a render.json request for *url* tagged with its *parent*."""
            return SplashRequest(
                url,
                self.parse,
                endpoint='render.json',
                meta={'parent': parent},
                args={'html': 1,
                      'wait': 10,
                      'render_all': 1,
                      'png': 1}
            )

        def start_requests(self):
            """Yield the single initial request, parented to the original paste."""
            yield self._splash_request(self.start_urls, self.original_paste)

        def parse(self, response):
            """Store one rendered page and queue its same-domain links.

            The HTML is forwarded through save_crawled_paste(); on success the
            page metadata, its screenshot and its external links are recorded,
            and one new request is yielded per same-domain link.

            NOTE(review): the source this was recovered from had lost its
            indentation; everything after the save is assumed to run only when
            the save succeeded - confirm against the original layout.
            """
            print(response.headers)
            print(response.status)

            # Bit 0 is set to 1 for a URL whose response reached this callback;
            # links that are only queued get bit 0 = 0 further down.
            self.r_cache.setbit(response.url, 0, 1)
            self.r_cache.expire(response.url, self._CACHE_TTL_SECONDS)

            page_id = self.domains[0] + str(uuid.uuid4())
            filename_paste = os.path.join(self.crawled_paste_filemame, page_id)
            filename_screenshot = os.path.join(self.crawled_screenshot, page_id + '.png')

            if not self.save_crawled_paste(filename_paste, response.data['html']):
                return

            parent = response.meta['parent']
            metadata_key = 'paste_metadata:' + filename_paste
            for field, value in (('super_father', self.super_father),
                                 ('father', parent),
                                 ('real_link', response.url)):
                self.r_serv_metadata.hset(metadata_key, field, value)
            self.r_serv_metadata.sadd('paste_children:' + parent, filename_paste)

            # exist_ok avoids the check-then-create race between crawlers.
            os.makedirs(os.path.dirname(filename_screenshot), exist_ok=True)
            with open(filename_screenshot, 'wb') as f:
                f.write(base64.standard_b64decode(response.data['png'].encode()))

            # Save external links in a per-page set. The key is built from the
            # page's file name and the member is the URL string: a Link object
            # is not a value Redis can store.
            external_links = LinkExtractor(deny_domains=self.domains, unique=True)
            for link in external_links.extract_links(response):
                self.r_serv_metadata.sadd('paste_crawler:' + filename_paste, link.url)

            internal_links = LinkExtractor(allow_domains=self.domains, unique=True)
            for link in internal_links.extract_links(response):
                # Key the cache by URL string, like the response marker above.
                self.r_cache.setbit(link.url, 0, 0)
                self.r_cache.expire(link.url, self._CACHE_TTL_SECONDS)
                # Children point at this page through its file name, the same
                # identifier the 'paste_metadata:' keys use (presumably also
                # what original_paste holds for the root - verify with caller).
                yield self._splash_request(link.url, filename_paste)

        def save_crawled_paste(self, filename, content):
            """Forward one crawled page to the 'Mixer' and 'Tags' queues.

            Args:
                filename: path identifying the page; nothing is written there
                    by this method, it is only checked and relayed.
                content: page HTML as a string.

            Returns:
                True when the page was relayed, False when *filename* already
                exists on disk or the content could not be encoded.
            """
            print(filename)
            if os.path.isfile(filename):
                print('File: {} already exist in submitted pastes'.format(filename))
                return False

            try:
                gzipencoded = gzip.compress(content.encode())
                gzip64encoded = base64.standard_b64encode(gzipencoded).decode()
            except Exception:
                # Best effort: report and skip the page, but let
                # KeyboardInterrupt/SystemExit propagate.
                print("file error: {}".format(filename))
                return False

            # Relay the gzip+base64 payload, prefixed by its file name.
            relay_message = "{0} {1}".format(filename, gzip64encoded)
            self.p.populate_set_out(relay_message, 'Mixer')

            # Increase the number of pastes counted for the "crawler" feeder.
            self.r_serv_log_submit.hincrby("mixer_cache:list_feeder", "crawler", 1)

            # Tag the crawled paste.
            msg = 'infoleak:submission="crawler";{}'.format(filename)
            self.p.populate_set_out(msg, 'Tags')
            return True