AIL-framework/bin/torcrawler/TorSplashCrawler.py

287 lines
11 KiB
Python
Raw Normal View History

2018-08-09 17:42:21 +02:00
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
import os
import sys
import uuid
import datetime
import redis
import json
import time
2018-08-09 17:42:21 +02:00
from hashlib import sha256
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError
from twisted.internet.error import TimeoutError
from twisted.web._newclient import ResponseNeverReceived
2018-08-09 17:42:21 +02:00
from scrapy import Spider
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess, Crawler
from scrapy_splash import SplashRequest, SplashJsonResponse
2018-08-09 17:42:21 +02:00
sys.path.append(os.environ['AIL_BIN'])
from Helper import Process
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib'))
#import ConfigLoader
import Screenshot
import crawlers
script_cookie = """
function main(splash, args)
-- Default values
splash.js_enabled = true
splash.private_mode_enabled = true
splash.images_enabled = true
splash.webgl_enabled = true
splash.media_source_enabled = true
-- Force enable things
splash.plugins_enabled = true
splash.request_body_enabled = true
splash.response_body_enabled = true
splash.indexeddb_enabled = true
splash.html5_media_enabled = true
splash.http2_enabled = true
-- User defined
splash.resource_timeout = args.resource_timeout
splash.timeout = args.timeout
-- Allow to pass cookies
splash:init_cookies(args.cookies)
-- Run
ok, reason = splash:go{args.url}
if not ok and not reason:find("http") then
return {
error = reason,
last_url = splash:url()
}
end
if reason == "http504" then
splash:set_result_status_code(504)
return ''
end
splash:wait{args.wait}
-- Page instrumentation
-- splash.scroll_position = {y=1000}
splash:wait{args.wait}
-- Response
return {
har = splash:har(),
html = splash:html(),
png = splash:png{render_all=true},
cookies = splash:get_cookies(),
last_url = splash:url()
}
end
"""
2018-08-09 17:42:21 +02:00
class TorSplashCrawler():
2019-02-21 09:54:43 +01:00
def __init__(self, splash_url, crawler_options):
self.process = CrawlerProcess({'LOG_ENABLED': True})
2018-08-09 17:42:21 +02:00
self.crawler = Crawler(self.TorSplashSpider, {
2019-02-21 09:54:43 +01:00
'USER_AGENT': crawler_options['user_agent'],
2018-08-09 17:42:21 +02:00
'SPLASH_URL': splash_url,
'ROBOTSTXT_OBEY': False,
'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
'scrapy_splash.SplashMiddleware': 725,
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
2018-08-09 17:42:21 +02:00
},
'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,},
'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
2018-09-17 15:35:06 +02:00
'HTTPERROR_ALLOW_ALL': True,
'RETRY_TIMES': 2,
2019-02-21 09:54:43 +01:00
'CLOSESPIDER_PAGECOUNT': crawler_options['closespider_pagecount'],
'DEPTH_LIMIT': crawler_options['depth_limit'],
'SPLASH_COOKIES_DEBUG': False
2018-08-09 17:42:21 +02:00
})
def crawl(self, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item):
self.process.crawl(self.crawler, type=type, crawler_options=crawler_options, date=date, requested_mode=requested_mode, url=url, domain=domain, port=port, cookies=cookies, original_item=original_item)
2018-08-09 17:42:21 +02:00
self.process.start()
class TorSplashSpider(Spider):
name = 'TorSplashSpider'
def __init__(self, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item, *args, **kwargs):
self.domain_type = type
self.requested_mode = requested_mode
2019-02-25 16:38:50 +01:00
self.original_item = original_item
2019-02-22 17:00:24 +01:00
self.root_key = None
2018-08-09 17:42:21 +02:00
self.start_urls = url
self.domains = [domain]
self.port = str(port)
2019-02-25 16:38:50 +01:00
date_str = '{}/{}/{}'.format(date['date_day'][0:4], date['date_day'][4:6], date['date_day'][6:8])
self.full_date = date['date_day']
self.date_month = date['date_month']
self.date_epoch = int(date['epoch'])
2020-03-20 16:20:01 +01:00
self.png = crawler_options['png']
self.har = crawler_options['har']
self.cookies = cookies
config_section = 'Crawler'
self.p = Process(config_section)
self.item_dir = os.path.join(self.p.config.get("Directories", "crawled"), date_str )
self.har_dir = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date_str )
self.r_serv_log_submit = redis.StrictRedis(
host=self.p.config.get("Redis_Log_submit", "host"),
port=self.p.config.getint("Redis_Log_submit", "port"),
db=self.p.config.getint("Redis_Log_submit", "db"),
decode_responses=True)
self.root_key = None
def build_request_arg(self, cookies):
return {'wait': 10,
'resource_timeout': 30, # /!\ Weird behaviour if timeout < resource_timeout /!\
'timeout': 30,
'cookies': cookies,
'lua_source': script_cookie
}
2018-08-09 17:42:21 +02:00
def start_requests(self):
l_cookies = self.build_request_arg(self.cookies)
2018-08-09 17:42:21 +02:00
yield SplashRequest(
self.start_urls,
self.parse,
errback=self.errback_catcher,
endpoint='execute',
2020-04-01 14:58:27 +02:00
meta={'father': self.original_item, 'current_url': self.start_urls},
args=l_cookies
2018-08-09 17:42:21 +02:00
)
# # TODO: remove duplicate and anchor
2018-08-09 17:42:21 +02:00
def parse(self,response):
#print(response.headers)
#print(response.status)
2018-09-17 15:35:06 +02:00
if response.status == 504:
# no response
#print('504 detected')
pass
# LUA ERROR # # TODO: print/display errors
elif 'error' in response.data:
if(response.data['error'] == 'network99'):
## splash restart ##
error_retry = request.meta.get('error_retry', 0)
if error_retry < 3:
error_retry += 1
url= request.meta['current_url']
father = request.meta['father']
self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
time.sleep(10)
yield SplashRequest(
url,
self.parse,
errback=self.errback_catcher,
endpoint='execute',
cache_args=['lua_source'],
2020-04-06 10:57:51 +02:00
meta={'father': father, 'current_url': url, 'error_retry': error_retry},
args=self.build_request_arg(response.cookiejar)
)
else:
print('Connection to proxy refused')
else:
print(response.data['error'])
2018-09-17 15:35:06 +02:00
elif response.status != 200:
2018-09-27 16:47:48 +02:00
print('other response: {}'.format(response.status))
# detect connection to proxy refused
error_log = (json.loads(response.body.decode()))
print(error_log)
#elif crawlers.is_redirection(self.domains[0], response.data['last_url']):
# pass # ignore response
2018-09-17 15:35:06 +02:00
else:
item_id = crawlers.create_item_id(self.item_dir, self.domains[0])
self.save_crawled_item(item_id, response.data['html'])
crawlers.create_item_metadata(item_id, self.domains[0], response.data['last_url'], self.port, response.meta['father'])
if self.root_key is None:
self.root_key = item_id
crawlers.add_domain_root_item(item_id, self.domain_type, self.domains[0], self.date_epoch, self.port)
crawlers.create_domain_metadata(self.domain_type, self.domains[0], self.port, self.full_date, self.date_month)
if 'cookies' in response.data:
all_cookies = response.data['cookies']
2019-02-12 15:51:19 +01:00
else:
all_cookies = []
# SCREENSHOT
if 'png' in response.data and self.png:
sha256_string = Screenshot.save_crawled_screeshot(response.data['png'], 5000000, f_save=self.requested_mode)
if sha256_string:
Screenshot.save_item_relationship(sha256_string, item_id)
Screenshot.save_domain_relationship(sha256_string, self.domains[0])
# HAR
if 'har' in response.data and self.har:
crawlers.save_har(self.har_dir, item_id, response.data['har'])
le = LinkExtractor(allow_domains=self.domains, unique=True)
for link in le.extract_links(response):
l_cookies = self.build_request_arg(all_cookies)
yield SplashRequest(
link.url,
self.parse,
errback=self.errback_catcher,
endpoint='execute',
2020-04-01 14:58:27 +02:00
meta={'father': item_id, 'current_url': link.url},
args=l_cookies
)
2018-09-17 15:35:06 +02:00
def errback_catcher(self, failure):
# catch all errback failures,
self.logger.error(repr(failure))
if failure.check(ResponseNeverReceived):
request = failure.request
2020-04-01 14:58:27 +02:00
url= request.meta['current_url']
father = request.meta['father']
self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
time.sleep(10)
if response:
response_root_key = response.meta['root_key']
else:
response_root_key = None
yield SplashRequest(
url,
self.parse,
errback=self.errback_catcher,
endpoint='execute',
cache_args=['lua_source'],
2020-04-01 14:58:27 +02:00
meta={'father': father, 'current_url': url},
args=self.build_request_arg(response.cookiejar)
)
else:
print('failure')
#print(failure)
print(failure.type)
def save_crawled_item(self, item_id, item_content):
gzip64encoded = crawlers.save_crawled_item(item_id, item_content)
# Send item to queue
# send paste to Global
relay_message = "{0} {1}".format(item_id, gzip64encoded)
self.p.populate_set_out(relay_message, 'Mixer')
# increase nb of paste by feeder name
self.r_serv_log_submit.hincrby("mixer_cache:list_feeder", "crawler", 1)
# tag crawled paste
msg = 'infoleak:submission="crawler";{}'.format(item_id)
self.p.populate_set_out(msg, 'Tags')