AIL-framework/bin/torcrawler/TorSplashCrawler.py

268 lines
10 KiB
Python

#!/usr/bin/env python3
# -*-coding:UTF-8 -*
import os
import sys
import uuid
import datetime
import redis
import json
import time
from hashlib import sha256
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError
from twisted.internet.error import TimeoutError
from twisted.web._newclient import ResponseNeverReceived
from scrapy import Spider
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess, Crawler
from scrapy_splash import SplashRequest, SplashJsonResponse
sys.path.append(os.environ['AIL_BIN'])
from Helper import Process
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib'))
#import ConfigLoader
import Screenshot
import crawlers
script_cookie = """
function main(splash, args)
-- Default values
splash.js_enabled = true
splash.private_mode_enabled = true
splash.images_enabled = true
splash.webgl_enabled = true
splash.media_source_enabled = true
-- Force enable things
splash.plugins_enabled = true
splash.request_body_enabled = true
splash.response_body_enabled = true
splash.indexeddb_enabled = true
splash.html5_media_enabled = true
splash.http2_enabled = true
-- User defined
splash.resource_timeout = args.resource_timeout
splash.timeout = args.timeout
-- Allow to pass cookies
splash:init_cookies(args.cookies)
-- Run
ok, reason = splash:go{args.url}
if not ok and not reason:find("http") then
return {
error = reason,
last_url = splash:url()
}
end
if reason == "http504" then
splash:set_result_status_code(504)
return ''
end
splash:wait{args.wait}
-- Page instrumentation
-- splash.scroll_position = {y=1000}
splash:wait{args.wait}
-- Response
return {
har = splash:har(),
html = splash:html(),
png = splash:png{render_all=true},
cookies = splash:get_cookies(),
last_url = splash:url()
}
end
"""
class TorSplashCrawler():
def __init__(self, splash_url, crawler_options):
self.process = CrawlerProcess({'LOG_ENABLED': True})
self.crawler = Crawler(self.TorSplashSpider, {
'USER_AGENT': crawler_options['user_agent'],
'SPLASH_URL': splash_url,
'ROBOTSTXT_OBEY': False,
'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
'scrapy_splash.SplashMiddleware': 725,
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
},
'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,},
'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
'HTTPERROR_ALLOW_ALL': True,
'RETRY_TIMES': 2,
'CLOSESPIDER_PAGECOUNT': crawler_options['closespider_pagecount'],
'DEPTH_LIMIT': crawler_options['depth_limit'],
'SPLASH_COOKIES_DEBUG': False
})
def crawl(self, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item):
self.process.crawl(self.crawler, type=type, crawler_options=crawler_options, date=date, requested_mode=requested_mode, url=url, domain=domain, port=port, cookies=cookies, original_item=original_item)
self.process.start()
class TorSplashSpider(Spider):
name = 'TorSplashSpider'
def __init__(self, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item, *args, **kwargs):
self.domain_type = type
self.requested_mode = requested_mode
self.original_item = original_item
self.root_key = None
self.start_urls = url
self.domains = [domain]
self.port = str(port)
date_str = '{}/{}/{}'.format(date['date_day'][0:4], date['date_day'][4:6], date['date_day'][6:8])
self.full_date = date['date_day']
self.date_month = date['date_month']
self.date_epoch = int(date['epoch'])
self.png = crawler_options['png']
self.har = crawler_options['har']
self.cookies = cookies
config_section = 'Crawler'
self.p = Process(config_section)
self.item_dir = os.path.join(self.p.config.get("Directories", "crawled"), date_str )
self.har_dir = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date_str )
self.r_serv_log_submit = redis.StrictRedis(
host=self.p.config.get("Redis_Log_submit", "host"),
port=self.p.config.getint("Redis_Log_submit", "port"),
db=self.p.config.getint("Redis_Log_submit", "db"),
decode_responses=True)
self.root_key = None
def build_request_arg(self, cookies):
return {'wait': 10,
'resource_timeout': 30, # /!\ Weird behaviour if timeout < resource_timeout /!\
'timeout': 30,
'cookies': cookies,
'lua_source': script_cookie
}
def start_requests(self):
l_cookies = self.build_request_arg(self.cookies)
yield SplashRequest(
self.start_urls,
self.parse,
errback=self.errback_catcher,
endpoint='execute',
meta={'father': self.original_item, 'current_url': self.start_urls},
args=l_cookies
)
# # TODO: remove duplicate and anchor
def parse(self,response):
#print(response.headers)
#print(response.status)
if response.status == 504:
# no response
#print('504 detected')
pass
# LUA ERROR # # TODO: print/display errors
elif 'error' in response.data:
if(response.data['error'] == 'network99'):
print('Connection to proxy refused')
else:
print(response.data['error'])
elif response.status != 200:
print('other response: {}'.format(response.status))
# detect connection to proxy refused
error_log = (json.loads(response.body.decode()))
print(error_log)
#elif crawlers.is_redirection(self.domains[0], response.data['last_url']):
# pass # ignore response
else:
item_id = crawlers.create_item_id(self.item_dir, self.domains[0])
self.save_crawled_item(item_id, response.data['html'])
crawlers.create_item_metadata(item_id, self.domains[0], response.data['last_url'], self.port, response.meta['father'])
if self.root_key is None:
self.root_key = item_id
crawlers.add_domain_root_item(item_id, self.domain_type, self.domains[0], self.date_epoch, self.port)
crawlers.create_domain_metadata(self.domain_type, self.domains[0], self.port, self.full_date, self.date_month)
if 'cookies' in response.data:
all_cookies = response.data['cookies']
else:
all_cookies = []
# SCREENSHOT
if 'png' in response.data:
sha256_string = Screenshot.save_crawled_screeshot(response.data['png'], 5000000, f_save=self.requested_mode)
if sha256_string:
Screenshot.save_item_relationship(sha256_string, item_id)
Screenshot.save_domain_relationship(sha256_string, self.domains[0])
# HAR
if 'har' in response.data:
crawlers.save_har(self.har_dir, item_id, response.data['har'])
le = LinkExtractor(allow_domains=self.domains, unique=True)
for link in le.extract_links(response):
l_cookies = self.build_request_arg(all_cookies)
yield SplashRequest(
link.url,
self.parse,
errback=self.errback_catcher,
endpoint='execute',
meta={'father': item_id, 'current_url': link.url},
args=l_cookies
)
def errback_catcher(self, failure):
# catch all errback failures,
self.logger.error(repr(failure))
if failure.check(ResponseNeverReceived):
request = failure.request
url= request.meta['current_url']
father = request.meta['father']
self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
time.sleep(10)
if response:
response_root_key = response.meta['root_key']
else:
response_root_key = None
yield SplashRequest(
url,
self.parse,
errback=self.errback_catcher,
endpoint='execute',
cache_args=['lua_source'],
meta={'father': father, 'current_url': url},
args=self.build_request_arg(response.cookiejar)
)
else:
print('failure')
#print(failure)
print(failure.type)
def save_crawled_item(self, item_id, item_content):
gzip64encoded = crawlers.save_crawled_item(item_id, item_content)
# Send item to queue
# send paste to Global
relay_message = "{0} {1}".format(item_id, gzip64encoded)
self.p.populate_set_out(relay_message, 'Mixer')
# increase nb of paste by feeder name
self.r_serv_log_submit.hincrby("mixer_cache:list_feeder", "crawler", 1)
# tag crawled paste
msg = 'infoleak:submission="crawler";{}'.format(item_id)
self.p.populate_set_out(msg, 'Tags')