2018-08-09 17:42:21 +02:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
# -*-coding:UTF-8 -*
|
|
|
|
|
|
|
|
import os
|
|
|
|
import sys
|
|
|
|
import uuid
|
|
|
|
import datetime
|
|
|
|
import redis
|
2018-08-24 10:13:56 +02:00
|
|
|
import json
|
2019-01-04 15:51:08 +01:00
|
|
|
import time
|
2018-08-09 17:42:21 +02:00
|
|
|
|
2019-04-24 14:09:04 +02:00
|
|
|
from hashlib import sha256
|
|
|
|
|
2018-08-16 17:24:39 +02:00
|
|
|
from scrapy.spidermiddlewares.httperror import HttpError
|
|
|
|
from twisted.internet.error import DNSLookupError
|
|
|
|
from twisted.internet.error import TimeoutError
|
2019-01-04 15:51:08 +01:00
|
|
|
from twisted.web._newclient import ResponseNeverReceived
|
2018-08-16 17:24:39 +02:00
|
|
|
|
2018-08-09 17:42:21 +02:00
|
|
|
from scrapy import Spider
|
|
|
|
from scrapy.linkextractors import LinkExtractor
|
|
|
|
from scrapy.crawler import CrawlerProcess, Crawler
|
|
|
|
|
2020-03-09 17:02:18 +01:00
|
|
|
from scrapy_splash import SplashRequest, SplashJsonResponse
|
2018-08-09 17:42:21 +02:00
|
|
|
|
|
|
|
sys.path.append(os.environ['AIL_BIN'])
|
|
|
|
from Helper import Process
|
|
|
|
|
2020-03-09 17:02:18 +01:00
|
|
|
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib'))
|
2020-03-20 16:15:25 +01:00
|
|
|
#import ConfigLoader
|
|
|
|
import Screenshot
|
2020-03-24 17:15:43 +01:00
|
|
|
import crawlers
|
2020-03-09 17:02:18 +01:00
|
|
|
|
|
|
|
# Lua script executed by the Splash 'execute' endpoint for every page.
# It reads `args.url`, `args.wait`, `args.timeout`, `args.resource_timeout`
# and `args.cookies` (all supplied through the SplashRequest `args` dict) and
# returns a table with the keys `har`, `html`, `png`, `cookies` and `last_url`
# on success, or `error` and `last_url` when the navigation failed with a
# non-HTTP error.
script_cookie = """
function main(splash, args)
    -- Default values
    splash.js_enabled = true
    splash.private_mode_enabled = true
    splash.images_enabled = true
    splash.webgl_enabled = true
    splash.media_source_enabled = true

    -- Force enable things
    splash.plugins_enabled = true
    splash.request_body_enabled = true
    splash.response_body_enabled = true

    splash.indexeddb_enabled = true
    splash.html5_media_enabled = true
    splash.http2_enabled = true

    -- User defined
    splash.resource_timeout = args.resource_timeout
    splash.timeout = args.timeout

    -- Allow to pass cookies
    splash:init_cookies(args.cookies)

    -- Run
    local ok, reason = splash:go{args.url}
    if not ok and not reason:find("http") then
        return {
            error = reason,
            last_url = splash:url()
        }
    end
    if reason == "http504" then
        splash:set_result_status_code(504)
        return ''
    end

    splash:wait{args.wait}
    -- Page instrumentation
    -- splash.scroll_position = {y=1000}
    splash:wait{args.wait}
    -- Response
    return {
        har = splash:har(),
        html = splash:html(),
        png = splash:png{render_all=true},
        cookies = splash:get_cookies(),
        last_url = splash:url()
    }
end
"""
|
|
|
|
|
2018-08-09 17:42:21 +02:00
|
|
|
class TorSplashCrawler():
    """Run a single Scrapy crawl whose pages are rendered through Splash.

    The object owns one ``CrawlerProcess`` and one ``Crawler`` bound to the
    nested ``TorSplashSpider``; ``crawl()`` schedules the spider and blocks
    until the process has finished.
    """

    def __init__(self, splash_url, crawler_options):
        """Build the Scrapy process and crawler settings.

        Args:
            splash_url: value used for the ``SPLASH_URL`` setting.
            crawler_options: mapping; the keys ``user_agent``,
                ``closespider_pagecount`` and ``depth_limit`` are read here.
        """
        self.process = CrawlerProcess({'LOG_ENABLED': True})
        self.crawler = Crawler(self.TorSplashSpider, {
            'USER_AGENT': crawler_options['user_agent'],
            'SPLASH_URL': splash_url,
            'ROBOTSTXT_OBEY': False,
            # NOTE(review): SplashDeduplicateArgsMiddleware is registered both
            # here and under SPIDER_MIDDLEWARES; scrapy-splash documents it as
            # a spider middleware - confirm the downloader entry is needed.
            'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
                                       'scrapy_splash.SplashMiddleware': 725,
                                       'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
                                       'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
                                       },
            'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,},
            'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
            'HTTPERROR_ALLOW_ALL': True,
            'RETRY_TIMES': 2,
            'CLOSESPIDER_PAGECOUNT': crawler_options['closespider_pagecount'],
            'DEPTH_LIMIT': crawler_options['depth_limit'],
            'SPLASH_COOKIES_DEBUG': False
            })

    def crawl(self, splash_url, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item):
        """Schedule the spider with the given arguments and run it to completion.

        Every argument is forwarded unchanged, by keyword, to
        ``TorSplashSpider.__init__``. ``process.start()`` blocks until the
        crawl is over.
        """
        self.process.crawl(self.crawler, splash_url=splash_url, type=type, crawler_options=crawler_options, date=date, requested_mode=requested_mode, url=url, domain=domain, port=port, cookies=cookies, original_item=original_item)
        self.process.start()

    class TorSplashSpider(Spider):
        """Spider that fetches pages through Splash and stores them as items."""

        name = 'TorSplashSpider'

        # Upper bound on how many times one URL is re-queued after a Splash
        # failure (Lua 'network99' error or ResponseNeverReceived).
        MAX_ERROR_RETRY = 3

        def __init__(self, splash_url, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item, *args, **kwargs):
            """Store crawl parameters and open the project resources.

            Args:
                splash_url: Splash address, later passed to
                    ``crawlers.is_splash_reachable``.
                type: domain type, stored as ``self.domain_type``.
                crawler_options: mapping; the keys ``png`` and ``har`` are
                    read and used as on/off switches in ``parse``.
                date: mapping with the keys ``date_day``, ``date_month`` and
                    ``epoch``. ``date_day`` is sliced as [0:4]/[4:6]/[6:8],
                    so it is presumably a ``YYYYMMDD`` string - TODO confirm.
                requested_mode: forwarded as ``f_save`` when a screenshot is
                    saved.
                url: first URL to request.
                domain: the only domain links are followed on.
                port: stored as a string.
                cookies: cookies sent with the first request.
                original_item: recorded as ``father`` of the first page.
            """
            self.splash_url = splash_url
            self.domain_type = type
            self.requested_mode = requested_mode
            self.original_item = original_item
            # Id of the first item saved for this domain; set in parse().
            self.root_key = None
            self.start_urls = url
            self.domains = [domain]
            self.port = str(port)
            date_str = '{}/{}/{}'.format(date['date_day'][0:4], date['date_day'][4:6], date['date_day'][6:8])
            self.full_date = date['date_day']
            self.date_month = date['date_month']
            self.date_epoch = int(date['epoch'])

            self.png = crawler_options['png']
            self.har = crawler_options['har']
            self.cookies = cookies

            config_section = 'Crawler'
            self.p = Process(config_section)
            self.item_dir = os.path.join(self.p.config.get("Directories", "crawled"), date_str )
            self.har_dir = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date_str )
            self.r_serv_log_submit = redis.StrictRedis(
                host=self.p.config.get("Redis_Log_submit", "host"),
                port=self.p.config.getint("Redis_Log_submit", "port"),
                db=self.p.config.getint("Redis_Log_submit", "db"),
                decode_responses=True)

        def build_request_arg(self, cookies):
            """Return the Splash ``args`` dict for one request.

            Args:
                cookies: value exposed to the Lua script as ``args.cookies``.
            """
            return {'wait': 10,
                    'resource_timeout': 30, # /!\ Weird behaviour if timeout < resource_timeout /!\
                    'timeout': 30,
                    'cookies': cookies,
                    'lua_source': script_cookie
                    }

        def _request_cookies(self, meta):
            """Return the cookies that were sent with the request owning *meta*.

            Falls back to the spider's initial cookies when the request meta
            carries no Splash arguments.
            """
            return meta.get('splash', {}).get('args', {}).get('cookies', self.cookies)

        def start_requests(self):
            """Yield the first SplashRequest for the start URL."""
            l_cookies = self.build_request_arg(self.cookies)
            yield SplashRequest(
                self.start_urls,
                self.parse,
                errback=self.errback_catcher,
                endpoint='execute',
                meta={'father': self.original_item, 'current_url': self.start_urls},
                args=l_cookies
            )

        # # TODO: remove duplicate and anchor
        def parse(self,response):
            """Handle one Splash response.

            * status 504: ignored.
            * Lua error ``network99``: the same URL is re-queued, at most
              ``MAX_ERROR_RETRY`` times; any other Lua error is printed.
            * other non-200 status: the body is printed.
            * otherwise the page is saved as an item, optional screenshot and
              HAR are stored, and every extracted link on the allowed domain
              is requested with the cookies returned by Splash.
            """
            #print(response.headers)
            #print(response.status)
            if response.status == 504:
                # no response
                #print('504 detected')
                pass

            # LUA ERROR # # TODO: print/display errors
            elif 'error' in response.data:
                if response.data['error'] == 'network99':
                    ## splash restart ##
                    # The retry counter travels in the request meta, which is
                    # reachable from the response.
                    error_retry = response.meta.get('error_retry', 0)
                    if error_retry < self.MAX_ERROR_RETRY:
                        error_retry += 1
                        url = response.meta['current_url']
                        father = response.meta['father']

                        self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
                        # NOTE: time.sleep() is a blocking call.
                        time.sleep(10)
                        yield SplashRequest(
                            url,
                            self.parse,
                            errback=self.errback_catcher,
                            endpoint='execute',
                            cache_args=['lua_source'],
                            # Same URL and same args as an earlier request:
                            # bypass the duplicate filter so the retry is
                            # really scheduled.
                            dont_filter=True,
                            meta={'father': father, 'current_url': url, 'error_retry': error_retry},
                            # The Lua error table carries no cookies, so reuse
                            # the ones sent with the failed request.
                            args=self.build_request_arg(self._request_cookies(response.meta))
                        )
                    else:
                        print('Connection to proxy refused')
                else:
                    print(response.data['error'])

            elif response.status != 200:
                print('other response: {}'.format(response.status))
                # detect connection to proxy refused
                try:
                    error_log = json.loads(response.body.decode())
                except ValueError:
                    # Body is not valid UTF-8 JSON: show it as received.
                    error_log = response.body
                print(error_log)
            #elif crawlers.is_redirection(self.domains[0], response.data['last_url']):
            #    pass # ignore response
            else:

                item_id = crawlers.create_item_id(self.item_dir, self.domains[0])
                self.save_crawled_item(item_id, response.data['html'])
                crawlers.create_item_metadata(item_id, self.domains[0], response.data['last_url'], self.port, response.meta['father'])

                # The first saved page becomes the root item of the domain.
                if self.root_key is None:
                    self.root_key = item_id
                    crawlers.add_domain_root_item(item_id, self.domain_type, self.domains[0], self.date_epoch, self.port)
                    crawlers.create_domain_metadata(self.domain_type, self.domains[0], self.port, self.full_date, self.date_month)

                if 'cookies' in response.data:
                    all_cookies = response.data['cookies']
                else:
                    all_cookies = []

                # SCREENSHOT
                if 'png' in response.data and self.png:
                    sha256_string = Screenshot.save_crawled_screeshot(response.data['png'], 5000000, f_save=self.requested_mode)
                    if sha256_string:
                        Screenshot.save_item_relationship(sha256_string, item_id)
                        Screenshot.save_domain_relationship(sha256_string, self.domains[0])
                # HAR
                if 'har' in response.data and self.har:
                    crawlers.save_har(self.har_dir, item_id, response.data['har'])

                le = LinkExtractor(allow_domains=self.domains, unique=True)
                for link in le.extract_links(response):
                    l_cookies = self.build_request_arg(all_cookies)
                    yield SplashRequest(
                        link.url,
                        self.parse,
                        errback=self.errback_catcher,
                        endpoint='execute',
                        meta={'father': item_id, 'current_url': link.url},
                        args=l_cookies
                    )

        def errback_catcher(self, failure):
            """Log a failed request and retry it on ``ResponseNeverReceived``.

            The retry reuses the URL, father and cookies of the failed
            request and is attempted at most ``MAX_ERROR_RETRY`` times per
            URL. Any other failure is only logged.
            """
            # catch all errback failures,
            self.logger.error(repr(failure))

            if not failure.check(ResponseNeverReceived):
                self.logger.error(failure.type)
                self.logger.error(failure.getErrorMessage())
                return

            ## DEBUG ##
            self.logger.error(failure.request)
            # The exception may not expose a `response` attribute.
            failed_response = getattr(failure.value, 'response', None)
            if failed_response:
                self.logger.error(failed_response)
            ## ----- ##

            # Extract request metadata
            url = failure.request.meta['current_url']
            father = failure.request.meta['father']
            l_cookies = self.build_request_arg(self._request_cookies(failure.request.meta))

            error_retry = failure.request.meta.get('error_retry', 0)
            if error_retry >= self.MAX_ERROR_RETRY:
                self.logger.error('Splash, ResponseNeverReceived for %s, giving up after %s retries', url, error_retry)
                return

            # Check if Splash restarted
            if not crawlers.is_splash_reachable(self.splash_url):
                self.logger.error('Splash, ResponseNeverReceived for %s, retry in 30s ...', url)
                # NOTE: time.sleep() is a blocking call.
                time.sleep(30)

            yield SplashRequest(
                url,
                self.parse,
                errback=self.errback_catcher,
                endpoint='execute',
                # Identical to the failed request: bypass the duplicate
                # filter; the loop is bounded by `error_retry`.
                dont_filter=True,
                meta={'father': father, 'current_url': url, 'error_retry': error_retry + 1},
                args=l_cookies
            )

        def save_crawled_item(self, item_id, item_content):
            """Persist one crawled page and announce it to the output queues.

            Args:
                item_id: identifier of the item.
                item_content: page HTML as returned by Splash.
            """
            gzip64encoded = crawlers.save_crawled_item(item_id, item_content)

            # Send item to queue
            # send paste to Global
            relay_message = "{0} {1}".format(item_id, gzip64encoded)
            self.p.populate_set_out(relay_message, 'Mixer')

            # increase nb of paste by feeder name
            self.r_serv_log_submit.hincrby("mixer_cache:list_feeder", "crawler", 1)

            # tag crawled paste
            msg = 'infoleak:submission="crawler";{}'.format(item_id)
            self.p.populate_set_out(msg, 'Tags')
|