From 60f7645ac151b09c6ff5abb544d614d3cfa75db2 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Fri, 22 Feb 2019 17:00:24 +0100
Subject: [PATCH] chg: [Crawler] refactor

---
 bin/Crawler.py                     | 404 +++++++++++++++--------
 bin/Onion.py                       |   7 +-
 bin/torcrawler/TorSplashCrawler.py |  16 +-
 bin/torcrawler/tor_crawler.py      |  19 +-
 4 files changed, 248 insertions(+), 198 deletions(-)

diff --git a/bin/Crawler.py b/bin/Crawler.py
index 1f61a50a..c11144f9 100755
--- a/bin/Crawler.py
+++ b/bin/Crawler.py
@@ -16,75 +16,105 @@ sys.path.append(os.environ['AIL_BIN'])
 from Helper import Process
 from pubsublogger import publisher
 
-# ======== GLOBAL VARIABLES ========
-publisher.port = 6380
-publisher.channel = "Script"
-
-config_section = 'Crawler'
-
-# Setup the I/O queues
-p = Process(config_section)
-
-accepted_services = ['onion', 'regular']
-
-dic_regex = {}
-dic_regex['onion'] = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
-re.compile(dic_regex['onion'])
-dic_regex['i2p'] = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
-re.compile(dic_regex['i2p'])
-dic_regex['regular'] = dic_regex['i2p']
-
-faup = Faup()
-
-PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes"))
-
-r_serv_metadata = redis.StrictRedis(
-    host=p.config.get("ARDB_Metadata", "host"),
-    port=p.config.getint("ARDB_Metadata", "port"),
-    db=p.config.getint("ARDB_Metadata", "db"),
-    decode_responses=True)
-
-r_cache = redis.StrictRedis(
-    host=p.config.get("Redis_Cache", "host"),
-    port=p.config.getint("Redis_Cache", "port"),
-    db=p.config.getint("Redis_Cache", "db"),
-    decode_responses=True)
-
-r_onion = redis.StrictRedis(
-    host=p.config.get("ARDB_Onion", "host"),
-    port=p.config.getint("ARDB_Onion", "port"),
-    db=p.config.getint("ARDB_Onion", "db"),
-    decode_responses=True)
-
 # ======== FUNCTIONS ========
-def decode_val(value):
-    if value is not None:
-        value = value.decode()
-    return value
 
-def load_type_blacklist(type_service):
-    # load domains blacklist
+def load_blacklist(service_type):
     try:
-        with open(os.path.join(os.environ['AIL_BIN'],'/torcrawler/blacklist_{}.txt'.format(type_service)), 'r') as f:
+        with open(os.environ['AIL_BIN']+'/torcrawler/blacklist_{}.txt'.format(service_type), 'r') as f:
+            redis_crawler.delete('blacklist_{}'.format(service_type))
             lines = f.read().splitlines()
             for line in lines:
-                r_onion.sadd('blacklist_{}'.format(type_service), line)
+                redis_crawler.sadd('blacklist_{}'.format(service_type), line)
     except Exception:
         pass
 
+# Extract info from url (url, domain, domain url, ...)
+def unpack_url(url):
+    faup.decode(url)
+    url_unpack = faup.get()
+    to_crawl = {'domain': url_unpack['domain'].decode()}
+    if url_unpack['scheme'] is None:
+        to_crawl['scheme'] = 'http'
+        to_crawl['url'] = 'http://{}'.format(url)
+        to_crawl['domain_url'] = 'http://{}'.format(to_crawl['domain'])
+    else:
+        to_crawl['scheme'] = url_unpack['scheme'].decode()
+        to_crawl['url'] = '{}://{}'.format(to_crawl['scheme'], url)
+        to_crawl['domain_url'] = '{}://{}'.format(to_crawl['scheme'], to_crawl['domain'])
+    to_crawl['port'] = url_unpack['port']
+    to_crawl['tld'] = url_unpack['tld'].decode()
+    return to_crawl
 
-def on_error_send_message_back_in_queue(type_hidden_service, domain, message):
-    # send this msg back in the queue
-    if not r_onion.sismember('{}_domain_crawler_queue'.format(type_hidden_service), domain):
-        r_onion.sadd('{}_domain_crawler_queue'.format(type_hidden_service), domain)
-        r_onion.sadd('{}_crawler_priority_queue'.format(type_hidden_service), message)
+# get url, paste and service_type to crawl
+def get_elem_to_crawl(rotation_mode):
+    message = None
+    domain_service_type = None
 
-def crawl_onion(url, domain, date, date_month, message, mode):
+    #load_priority_queue
+    for service_type in rotation_mode:
+        message = redis_crawler.spop('{}_crawler_priority_queue'.format(service_type))
+        if message is not None:
+            domain_service_type = service_type
+            break
+    #load_normal_queue
+    if message is None:
+        for service_type in rotation_mode:
+            message = redis_crawler.spop('{}_crawler_queue'.format(service_type))
+            if message is not None:
+                domain_service_type = service_type
+                break
+
+    if message:
+        splitted = message.rsplit(';', 1)
+        if len(splitted) == 2:
+            url, paste = splitted
+            if paste:
+                paste = paste.replace(PASTES_FOLDER+'/', '')
+        else:
+            url = message
+            paste = 'requested'
+
+        message = {'url': url, 'paste': paste, 'type_service': domain_service_type, 'original_message': message}
+
+    return message
+
+def load_crawler_config(service_type, domain, paste):
+    crawler_config = {}
+    # Auto and Manual Crawling
+    if paste is None:
+        crawler_config['requested'] = True
+    # default crawler
+    else:
+        crawler_config['requested'] = False
+    return crawler_config
+
+def is_domain_up_day(domain, type_service, date_day):
+    if redis_crawler.sismember('{}_up:{}'.format(type_service, date_day), domain):
+        return True
+    else:
+        return False
+
+def set_crawled_domain_metadata(type_service, date, domain, father_item):
+    # first seen
+    if not redis_crawler.hexists('{}_metadata:{}'.format(type_service, domain), 'first_seen'):
+        redis_crawler.hset('{}_metadata:{}'.format(type_service, domain), 'first_seen', date['date_day'])
+
+    redis_crawler.hset('{}_metadata:{}'.format(type_service, domain), 'paste_parent', father_item)
+    # last check
+    redis_crawler.hset('{}_metadata:{}'.format(type_service, domain), 'last_check', date['date_day'])
+
+# Put message back on queue
+def on_error_send_message_back_in_queue(type_service, domain, message):
+    if not redis_crawler.sismember('{}_domain_crawler_queue'.format(type_service), domain):
+        redis_crawler.sadd('{}_domain_crawler_queue'.format(type_service), domain)
+        redis_crawler.sadd('{}_crawler_priority_queue'.format(type_service), message)
+
+##########################################################################################################
+def crawl_onion(url, domain, message, crawler_config):
+    print('Launching Crawler: {}'.format(url))
 
     r_cache.hset('metadata_crawler:{}'.format(splash_port), 'crawling_domain', domain)
     r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time',
                 datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
 
-    #if not r_onion.sismember('full_onion_up', domain) and not r_onion.sismember('onion_down:'+date , domain):
     super_father = r_serv_metadata.hget('paste_metadata:'+paste, 'super_father')
     if super_father is None:
         super_father=paste
@@ -100,7 +130,7 @@ def crawl_onion(url, domain, date, date_month, message, mode):
             nb_retry += 1
 
             if nb_retry == 6:
-                on_error_send_message_back_in_queue(type_hidden_service, domain, message)
+                on_error_send_message_back_in_queue(type_service, domain, message)
                 publisher.error('{} SPASH DOWN'.format(splash_url))
                 print('--------------------------------------')
                 print(' \033[91m DOCKER SPLASH DOWN\033[0m')
@@ -114,7 +144,7 @@ def crawl_onion(url, domain, date, date_month, message, mode):
         if r.status_code == 200:
             r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Crawling')
-            process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', splash_url, type_hidden_service, url, domain, paste, super_father],
+            process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', splash_url, type_service, url, domain, paste, super_father],
                                        stdout=subprocess.PIPE)
             while process.poll() is None:
                 time.sleep(1)
@@ -124,7 +154,7 @@ def crawl_onion(url, domain, date, date_month, message, mode):
                 print(output)
                 # error: splash:Connection to proxy refused
                 if 'Connection to proxy refused' in output:
-                    on_error_send_message_back_in_queue(type_hidden_service, domain, message)
+                    on_error_send_message_back_in_queue(type_service, domain, message)
                     publisher.error('{} SPASH, PROXY DOWN OR BAD CONFIGURATION'.format(splash_url))
                     print('------------------------------------------------------------------------')
                     print(' \033[91m SPLASH: Connection to proxy refused')
@@ -137,171 +167,167 @@ def crawl_onion(url, domain, date, date_month, message, mode):
                 print(process.stdout.read())
                 exit(-1)
         else:
-            on_error_send_message_back_in_queue(type_hidden_service, domain, message)
+            on_error_send_message_back_in_queue(type_service, domain, message)
             print('--------------------------------------')
             print(' \033[91m DOCKER SPLASH DOWN\033[0m')
             print(' {} DOWN'.format(splash_url))
             r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Crawling')
             exit(1)
 
-# ======== MAIN ========
+# check external links (full_crawl)
+def search_potential_source_domain(type_service, domain):
+    external_domains = set()
+    for link in redis_crawler.smembers('domain_{}_external_links:{}'.format(type_service, domain)):
+        # unpack url
+        url_data = unpack_url(link)
+        if url_data['domain'] != domain:
+            if url_data['tld'] == 'onion' or url_data['tld'] == 'i2p':
+                external_domains.add(url_data['domain'])
+    # # TODO: add special tag ?
+    if len(external_domains) >= 20:
+        redis_crawler.sadd('{}_potential_source'.format(type_service), domain)
+        print('New potential source found: {}'.format(domain))
+    redis_crawler.delete('domain_{}_external_links:{}'.format(type_service, domain))
+
+
 if __name__ == '__main__':
 
     if len(sys.argv) != 3:
-        #print('usage:', 'Crawler.py', 'type_hidden_service (onion or i2p or regular)', 'splash_port')
-        print('usage:', 'Crawler.py', 'mode (manual or automatic)', 'splash_port')
+        print('usage:', 'Crawler.py', 'type_service (onion or i2p or regular)', 'splash_port')
         exit(1)
-
-    mode = sys.argv[1]
+##################################################
+    type_service = sys.argv[1]
     splash_port = sys.argv[2]
-    if mode == 'automatic':
-        type_hidden_service = 'onion'
+    rotation_mode = ['onion', 'regular']
 
-    # verify crawler type (type_hidden_service)
-    if type_hidden_service not in accepted_services:
-        print('incorrect crawler type: {}'.format(type_hidden_service))
-        exit(0)
-    else:
-        publisher.info("Script Crawler started")
+    default_port = {'http': 80, 'https': 443}
 
-    # load domains blacklist
-    load_type_blacklist('onions')
-    load_type_blacklist('regular')
+################################################################### # TODO: port
+
+    publisher.port = 6380
+    publisher.channel = "Script"
+    publisher.info("Script Crawler started")
+    config_section = 'Crawler'
+
+    # Setup the I/O queues
+    p = Process(config_section)
 
     splash_url = '{}:{}'.format( p.config.get("Crawler", "splash_url_onion"),
                                 splash_port)
     print('splash url: {}'.format(splash_url))
 
-    crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit")
+    faup = Faup()
 
-    # Crawler status
-    r_cache.sadd('all_crawler:{}'.format(splash_port)
-    r_cache.sadd('all_crawler:{}:{}'.format(mode, type_hidden_service), splash_port)
-    r_cache.hset('metadata_crawler:{}'.format(splash_port), 'mode', mode)
+    PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes"))
+
+    r_serv_metadata = redis.StrictRedis(
+        host=p.config.get("ARDB_Metadata", "host"),
+        port=p.config.getint("ARDB_Metadata", "port"),
+        db=p.config.getint("ARDB_Metadata", "db"),
+        decode_responses=True)
+
+    r_cache = redis.StrictRedis(
+        host=p.config.get("Redis_Cache", "host"),
+        port=p.config.getint("Redis_Cache", "port"),
+        db=p.config.getint("Redis_Cache", "db"),
+        decode_responses=True)
+
+    redis_crawler = redis.StrictRedis(
+        host=p.config.get("ARDB_Onion", "host"),
+        port=p.config.getint("ARDB_Onion", "port"),
+        db=p.config.getint("ARDB_Onion", "db"),
+        decode_responses=True)
+
+    # Track launched crawler
+    r_cache.sadd('all_crawler', splash_port)
     r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
     r_cache.hset('metadata_crawler:{}'.format(splash_port), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
 
+    # update hardcoded blacklist
+    load_blacklist('onion')
+    load_blacklist('regular')
+
     while True:
 
-        if mode == 'automatic':
-            # Priority Queue - Recovering the streamed message informations.
-            message = r_onion.spop('{}_crawler_priority_queue'.format(type_hidden_service))
-            # Recovering the streamed message informations.
-            if message is None:
-                message = r_onion.spop('{}_crawler_queue'.format(type_hidden_service))
-
-        else:
-            pass
+        to_crawl = get_elem_to_crawl(rotation_mode)
+        if to_crawl:
+            url_data = unpack_url(to_crawl['url'])
+            # remove domain from queue
+            redis_crawler.srem('{}_domain_crawler_queue'.format(to_crawl['type_service']), url_data['domain'])
 
-        if message is not None:
+            print()
+            print()
+            print('\033[92m------------------START CRAWLER------------------\033[0m')
+            print('crawler type: {}'.format(to_crawl['type_service']))
+            print('\033[92m-------------------------------------------------\033[0m')
+            print('url: {}'.format(url_data['url']))
+            print('domain: {}'.format(url_data['domain']))
+            print('domain_url: {}'.format(url_data['domain_url']))
 
-            splitted = message.split(';')
-            if len(splitted) == 2:
-                url, paste = splitted
-                paste = paste.replace(PASTES_FOLDER+'/', '')
-
-                # extract data from url
-                faup.decode(url)
-                url_unpack = faup.get()
-                url = decode_val(url_unpack['url'])
-                port = decode_val(url_unpack['port'])
-                scheme = decode_val(url_unpack['scheme'])
-                domain = decode_val(url_unpack['domain'])
-                host = decode_val(url_unpack['domain'])
-
-                # Add Scheme to url
-                if scheme is None:
-                    url= 'http://{}'.format(url)
-                    domain_url = 'http://{}'.format(domain)
+            # Check blacklist
+            if not redis_crawler.sismember('blacklist_{}'.format(to_crawl['type_service']), url_data['domain']):
+                date = {'date_day': datetime.datetime.now().strftime("%Y%m%d"),
+                        'date_month': datetime.datetime.now().strftime("%Y%m"),
+                        'epoch': int(time.time())}
 
-                # remove url to crawl from queue
-                r_onion.srem('{}_domain_crawler_queue'.format(type_hidden_service), domain)
+                crawler_config = load_crawler_config(to_crawl['type_service'], url_data['domain'], to_crawl['paste'])
+                # check if default crawler
+                if not crawler_config['requested']:
+                    # Auto crawl only if service not up this month
+                    if redis_crawler.sismember('month_{}_up:{}'.format(to_crawl['type_service'], date['date_month']), url_data['domain']):
+                        continue
 
-                print()
-                print()
-                print('\033[92m------------------START CRAWLER------------------\033[0m')
-                print('crawler type: {}'.format(type_hidden_service))
-                print('\033[92m-------------------------------------------------\033[0m')
-                print('url: {}'.format(url))
-                print('domain: {}'.format(domain))
-                print('domain_url: {}'.format(domain_url))
+                set_crawled_domain_metadata(to_crawl['type_service'], date, url_data['domain'], to_crawl['paste'])
 
-                if not r_onion.sismember('blacklist_{}'.format(type_hidden_service), domain):
-                    date = datetime.datetime.now().strftime("%Y%m%d")
-                    date_month = datetime.datetime.now().strftime("%Y%m")
+                #### CRAWLER ####
+                # Manual and Auto Crawler
+                if crawler_config['requested']:
 
-                    if not r_onion.sismember('month_{}_up:{}'.format(type_hidden_service, date_month), domain) and not r_onion.sismember('{}_down:{}'.format(type_hidden_service, date), domain):
-                        # first seen
-                        if not r_onion.hexists('{}_metadata:{}'.format(type_hidden_service, domain), 'first_seen'):
-                            r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'first_seen', date)
+                    ######################################################crawler strategy
+                    # CRAWL domain
+                    crawl_onion(url_data['url'], url_data['domain'], to_crawl['original_message'], crawler_config)
 
-                        # last_father
-                        r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'paste_parent', paste)
-
-                        # last check
-                        r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
-
-                        # Launch Scrapy-Splash Crawler
-                        crawl_onion(url, domain, date, date_month, message, mode)
-                        # Crawl Domain
-                        if url != domain_url:
-                            #Crawl Domain with port number
-                            if port is not None:
-                                print('{}:{}'.format(domain_url, port))
-                                crawl_onion('{}:{}'.format(domain_url, port), domain, date, date_month, message, mode)
-                            #Crawl without port number
-                            print(domain_url)
-                            crawl_onion(domain_url, domain, date, date_month, message, mode)
-
-                    # update last check
-                    r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
-
-                    # save down onion
-                    if not r_onion.sismember('{}_up:{}'.format(type_hidden_service, date), domain):
-                        r_onion.sadd('{}_down:{}'.format(type_hidden_service, date), domain)
-                    else:
-                        #r_onion.hincrby('{}_link_up'.format(type_hidden_service), url, 1)
-                        if r_onion.sismember('month_{}_up:{}'.format(type_hidden_service, date_month), domain) and r_serv_metadata.exists('paste_children:'+paste):
-                            msg = 'infoleak:automatic-detection="{}";{}'.format(type_hidden_service, paste)
-                            p.populate_set_out(msg, 'Tags')
-
-                    # add onion screenshot history
-                    # add crawled days
-                    if r_onion.lindex('{}_history:{}'.format(type_hidden_service, domain), 0) != date:
-                        r_onion.lpush('{}_history:{}'.format(type_hidden_service, domain), date)
-                    # add crawled history by date
-                    r_onion.lpush('{}_history:{}:{}'.format(type_hidden_service, domain, date), paste)
-
-                    if mode == 'automatic':
-                        # check external onions links (full_crawl)
-                        external_domains = set()
-                        for link in r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)):
-                            external_domain = re.findall(dic_regex[type_hidden_service], link)
-                            external_domain.extend(re.findall(url_i2p, link))
-                            if len(external_domain) > 0:
-                                external_domain = external_domain[0][4]
-                            else:
-                                continue
-                            if '.onion' in external_domain and external_domain != domain:
-                                external_domains.add(external_domain)
-                            elif '.i2p' in external_domain and external_domain != domain:
-                                external_domains.add(external_domain)
-                        if len(external_domains) >= 10:
-                            r_onion.sadd('{}_potential_source'.format(type_hidden_service), domain)
-                            r_onion.delete('domain_{}_external_links:{}'.format(type_hidden_service, domain))
-                            print(r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)))
-
-                # update list, last crawled sites
-                r_onion.lpush('last_{}'.format(type_hidden_service), domain)
-                r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15)
-
-                #update crawler status
-                r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
-                r_cache.hdel('metadata_crawler:{}'.format(splash_port), 'crawling_domain')
+                # Default Crawler
                 else:
-                    print(' Blacklisted Site')
+                    # CRAWL domain
+                    crawl_onion(url_data['domain_url'], url_data['domain'], to_crawl['original_message'], crawler_config)
+                    if url_data['url'] != url_data['domain_url'] and not is_domain_up_day(url_data['domain'], to_crawl['type_service'], date['date_day']):
+                        crawl_onion(url_data['url'], url_data['domain'], to_crawl['original_message'], crawler_config)
+
+
+                ################################################### handle port
+                # CRAWL with port
+                #if port is not None:
+                #    crawl_onion('{}:{}'.format(domain_url, port), domain, message)
+                #### ####
+
+
+                # Save last_status day (DOWN)
+                if not is_domain_up_day(url_data['domain'], to_crawl['type_service'], date['date_day']):
+                    redis_crawler.sadd('{}_down:{}'.format(to_crawl['type_service'], date['date_day']), url_data['domain'])
+
+                # if domain was UP at least one time
+                if redis_crawler.exists('crawler_history_{}:{}'.format(to_crawl['type_service'], url_data['domain'])):
+                    # add crawler history (if domain is down)
+                    if not redis_crawler.zrangebyscore('crawler_history_{}:{}'.format(to_crawl['type_service'], url_data['domain']), date['epoch'], date['epoch']):
+                        # Domain is down
+                        redis_crawler.zadd('crawler_history_{}:{}'.format(to_crawl['type_service'], url_data['domain']), int(date['epoch']), int(date['epoch']))
+
+                ############################
+                # extract page content
+                ############################
+
+                # update list, last crawled domains
+                redis_crawler.lpush('last_{}'.format(to_crawl['type_service']), url_data['domain'])
+                redis_crawler.ltrim('last_{}'.format(to_crawl['type_service']), 0, 15)
+
+                #update crawler status
+                r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
+                r_cache.hdel('metadata_crawler:{}'.format(splash_port), 'crawling_domain')
+            else:
+                print(' Blacklisted Domain')
+
         print()
         print()
diff --git a/bin/Onion.py b/bin/Onion.py
index 801118d5..292346d5 100755
--- a/bin/Onion.py
+++ b/bin/Onion.py
@@ -235,10 +235,11 @@ if __name__ == "__main__":
             else:
                 for url in fetch(p, r_cache, urls, domains_list, path):
                     publisher.info('{}Checked {};{}'.format(to_print, url, PST.p_path))
-                    p.populate_set_out('onion;{}'.format(PST.p_path), 'alertHandler')
+                    #p.populate_set_out('onion;{}'.format(PST.p_path), 'alertHandler')
 
-            msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_path)
-            p.populate_set_out(msg, 'Tags')
+                    # TAG Item
+                    msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_path)
+                    p.populate_set_out(msg, 'Tags')
         else:
             publisher.info('{}Onion related;{}'.format(to_print, PST.p_path))
diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py
index 42d9a6af..9b3ee389 100644
--- a/bin/torcrawler/TorSplashCrawler.py
+++ b/bin/torcrawler/TorSplashCrawler.py
@@ -57,6 +57,7 @@ class TorSplashCrawler():
             self.type = type
             self.original_paste = original_paste
             self.super_father = super_father
+            self.root_key = None
             self.start_urls = url
             self.domains = [domain]
             date = datetime.datetime.now().strftime("%Y/%m/%d")
@@ -109,7 +110,7 @@ class TorSplashCrawler():
                 self.parse,
                 errback=self.errback_catcher,
                 endpoint='render.json',
-                meta={'father': self.original_paste},
+                meta={'father': self.original_paste, 'root_key': None},
                 args=self.arg_crawler
             )
@@ -147,10 +148,15 @@ class TorSplashCrawler():
                 # create onion metadata
                 if not self.r_serv_onion.exists('{}_metadata:{}'.format(self.type, self.domains[0])):
                     self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'first_seen', self.full_date)
-                self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'last_seen', self.full_date)
+
+                # create root_key
+                if self.root_key is None:
+                    self.root_key = relative_filename_paste
+                    # Create/Update crawler history
+                    self.r_serv_onion.zadd('crawler_history_{}:{}'.format(self.type, self.domains[0]), int(datetime.datetime.now().timestamp()), self.root_key)
 
                 #create paste metadata
-                self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father)
+                self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.root_key)
                 self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['father'])
                 self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'domain', self.domains[0])
                 self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'real_link', response.url)
@@ -185,7 +191,7 @@ class TorSplashCrawler():
                         self.parse,
                         errback=self.errback_catcher,
                         endpoint='render.json',
-                        meta={'father': relative_filename_paste},
+                        meta={'father': relative_filename_paste, 'root_key': response.meta['root_key']},
                        args=self.arg_crawler
                    )
@@ -205,7 +211,7 @@ class TorSplashCrawler():
                     self.parse,
                     errback=self.errback_catcher,
                     endpoint='render.json',
-                    meta={'father': father},
+                    meta={'father': father, 'root_key': response.meta['root_key']},
                     args=self.arg_crawler
                 )
diff --git a/bin/torcrawler/tor_crawler.py b/bin/torcrawler/tor_crawler.py
index e8a7d96b..7331115b 100755
--- a/bin/torcrawler/tor_crawler.py
+++ b/bin/torcrawler/tor_crawler.py
@@ -31,10 +31,27 @@ if __name__ == '__main__':
     domain = sys.argv[4]
     paste = sys.argv[5]
     super_father = sys.argv[6]
-
+    if crawler_options is None:
+        crawler_options = default_crawler_options
+
+    if redis_crawler.exists('crawler_option_manual:{}:{}'.format(service_type, domain)):
+        crawler_config['mode_name'] = 'auto'
+        crawler_config['requested'] = True
+
     crawler_options['depth_limit'] = cfg.getint("Crawler", "crawler_depth_limit")
     crawler_options['user_agent'] = tor_browser_agent
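
The queue handling that this patch introduces in Crawler.py can be summarised as: each loop iteration first drains the per-service priority queues in rotation order ('onion' before 'regular'), then falls back to the normal queues, and every popped message is either "url;paste_path" (as produced by Onion.py) or a bare URL for a manually requested crawl. The sketch below is a minimal, self-contained model of that logic only, not the AIL implementation: plain Python sets stand in for the ARDB/Redis sets, and the PASTES_FOLDER value and sample messages are made-up examples.

    # Simplified model of the rotation logic behind get_elem_to_crawl().
    # Plain sets replace the '{type}_crawler_priority_queue' / '{type}_crawler_queue'
    # Redis sets; PASTES_FOLDER is an assumed example path, not the real config value.

    PASTES_FOLDER = '/opt/AIL/PASTES'   # assumption for illustration

    queues = {
        'onion':   {'priority': {'http://example.onion;/opt/AIL/PASTES/2019/02/22/paste1'},
                    'normal':   set()},
        'regular': {'priority': set(),
                    'normal':   {'http://example.com'}},
    }

    def get_elem_to_crawl(rotation_mode):
        message = None
        service_type = None
        # 1) priority queues, in rotation order
        for service_type in rotation_mode:
            if queues[service_type]['priority']:
                message = queues[service_type]['priority'].pop()
                break
        # 2) fall back to the normal queues
        if message is None:
            for service_type in rotation_mode:
                if queues[service_type]['normal']:
                    message = queues[service_type]['normal'].pop()
                    break
        if message is None:
            return None

        # messages are "url;paste_path" or a bare url (manual request)
        splitted = message.rsplit(';', 1)
        if len(splitted) == 2:
            url, paste = splitted
            paste = paste.replace(PASTES_FOLDER + '/', '')
        else:
            url, paste = message, 'requested'
        return {'url': url, 'paste': paste, 'type_service': service_type,
                'original_message': message}

    if __name__ == '__main__':
        rotation_mode = ['onion', 'regular']
        while True:
            elem = get_elem_to_crawl(rotation_mode)
            if elem is None:
                break
            print(elem)

Draining the priority queues before the normal ones is what lets manually requested domains (and messages re-queued by on_error_send_message_back_in_queue) jump ahead of the automatic crawl backlog.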