diff --git a/OVERVIEW.md b/OVERVIEW.md
index f30f763d..f389a085 100644
--- a/OVERVIEW.md
+++ b/OVERVIEW.md
@@ -57,6 +57,15 @@ Redis and ARDB overview
 | ------ | ------ |
 | *tag* | *paste* |
 
+## DB7 - Metadata:
+
+#### Crawled Items:
+##### Hset:
+| Key | Field | Value |
+| ------ | ------ | ------ |
+| paste_metadata:**item path** | super_father | **first url crawled** |
+| | father | **item father** |
+| | domain | **crawled domain**:**domain port** |
 
 ## DB9 - Crawler:
 
@@ -65,19 +74,20 @@
 | ------ | ------ | ------ |
 | **service type**:**domain** | first_seen | **date** |
 | | last_check | **date** |
+| | ports | **port**;**port**;**port** ... |
 | | paste_parent | **parent last crawling (can be auto or manual)** |
 
 ##### Zset:
 | Key | Field | Value |
 | ------ | ------ | ------ |
-| crawler_history_**service type**:**domain** | **item root (first crawled item)** | **epoch (seconds)** |
+| crawler\_history\_**service type**:**domain**:**port** | **item root (first crawled item)** | **epoch (seconds)** |
 
-##### Regular key:
+##### Key:
 | Key | Value |
 | ------ | ------ |
-| crawler_history_**service type**:**domain** | **json config** |
+| crawler\_config:**crawler mode**:**service type**:**domain** | **json config** |
 
-##### exemple json config:
+###### example json config:
 ```json
 {
   "closespider_pagecount": 1,
diff --git a/bin/Crawler.py b/bin/Crawler.py
index e8ff8eb1..19de0e2a 100755
--- a/bin/Crawler.py
+++ b/bin/Crawler.py
@@ -167,8 +167,9 @@ def on_error_send_message_back_in_queue(type_service, domain, message):
     redis_crawler.sadd('{}_domain_crawler_queue'.format(type_service), domain)
     redis_crawler.sadd('{}_crawler_priority_queue'.format(type_service), message)
 
-def crawl_onion(url, domain, message, crawler_config):
+def crawl_onion(url, domain, port, type_service, message, crawler_config):
     crawler_config['url'] = url
+    crawler_config['port'] = port
     print('Launching Crawler: {}'.format(url))
 
     r_cache.hset('metadata_crawler:{}'.format(splash_port), 'crawling_domain', domain)
@@ -261,7 +262,6 @@ if __name__ == '__main__':
     rotation_mode = ['onion', 'regular']
     default_proto_map = {'http': 80, 'https': 443}
     ######################################################## add ftp ???
-    ###################################################################
     # TODO: port
     publisher.port = 6380
     publisher.channel = "Script"
@@ -331,6 +331,7 @@ if __name__ == '__main__':
         print('url: {}'.format(url_data['url']))
         print('domain: {}'.format(url_data['domain']))
         print('domain_url: {}'.format(url_data['domain_url']))
+        print()
 
         # Check blacklist
         if not redis_crawler.sismember('blacklist_{}'.format(to_crawl['type_service']), url_data['domain']):
@@ -357,40 +358,33 @@ if __name__ == '__main__':
                     ######################################################crawler strategy
                     # CRAWL domain
-                    crawl_onion(url_data['url'], url_data['domain'], to_crawl['original_message'], crawler_config)
+                    crawl_onion(url_data['url'], url_data['domain'], url_data['port'], to_crawl['type_service'], to_crawl['original_message'], crawler_config)
                 # Default Crawler
                 else:
                     # CRAWL domain
-                    crawl_onion(url_data['domain_url'], url_data['domain'], to_crawl['original_message'], crawler_config)
+                    crawl_onion(url_data['domain_url'], url_data['domain'], url_data['port'], to_crawl['type_service'], to_crawl['original_message'], crawler_config)
                     #if url != domain_url and not is_domain_up_day(url_data['domain'], to_crawl['type_service'], date['date_day']):
                     #    crawl_onion(url_data['url'], url_data['domain'], to_crawl['original_message'])
 
-                ################################################### handle port
-                # CRAWL with port
-                #if port is not None:
-                #    crawl_onion('{}:{}'.format(domain_url, port), domain, message)
-                #### ####
-
-
                 # Save last_status day (DOWN)
                 if not is_domain_up_day(url_data['domain'], to_crawl['type_service'], date['date_day']):
                     redis_crawler.sadd('{}_down:{}'.format(to_crawl['type_service'], date['date_day']), url_data['domain'])
 
                 # if domain was UP at least one time
-                if redis_crawler.exists('crawler_history_{}:{}'.format(to_crawl['type_service'], url_data['domain'])):
+                if redis_crawler.exists('crawler_history_{}:{}:{}'.format(to_crawl['type_service'], url_data['domain'], url_data['port'])):
                     # add crawler history (if domain is down)
-                    if not redis_crawler.zrangebyscore('crawler_history_{}:{}'.format(to_crawl['type_service'], url_data['domain']), date['epoch'], date['epoch']):
+                    if not redis_crawler.zrangebyscore('crawler_history_{}:{}:{}'.format(to_crawl['type_service'], url_data['domain'], url_data['port']), date['epoch'], date['epoch']):
                         # Domain is down
-                        redis_crawler.zadd('crawler_history_{}:{}'.format(to_crawl['type_service'], url_data['domain']), int(date['epoch']), int(date['epoch']))
+                        redis_crawler.zadd('crawler_history_{}:{}:{}'.format(to_crawl['type_service'], url_data['domain'], url_data['port']), int(date['epoch']), int(date['epoch']))
 
                 ############################
                 # extract page content
                 ############################
 
                 # update list, last crawled domains
-                redis_crawler.lpush('last_{}'.format(to_crawl['type_service']), '{};{}'.format(url_data['domain'], date['epoch']))
+                redis_crawler.lpush('last_{}'.format(to_crawl['type_service']), '{}:{};{}'.format(url_data['domain'], url_data['port'], date['epoch']))
                 redis_crawler.ltrim('last_{}'.format(to_crawl['type_service']), 0, 15)
 
                 #update crawler status
diff --git a/bin/packages/HiddenServices.py b/bin/packages/HiddenServices.py
index 74e66cf9..837aa74c 100755
--- a/bin/packages/HiddenServices.py
+++ b/bin/packages/HiddenServices.py
@@ -37,7 +37,7 @@ class HiddenServices(object):
 
     """
 
-    def __init__(self, domain, type):
+    def __init__(self, domain, type, port=80):
 
         configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
        if not os.path.exists(configfile):
@@ -61,6 +61,7 @@ class HiddenServices(object):
 
         self.domain = domain
         self.type = type
+        self.port = port
         self.tags = {}
 
         if type == 'onion' or type == 'regular':
@@ -110,7 +111,7 @@ class HiddenServices(object):
         self.tags[tag] = self.tags.get(tag, 0) + 1
 
     def get_first_crawled(self):
-        res = self.r_serv_onion.zrange('crawler_history_{}:{}'.format(self.type, self.domain), 0, 0, withscores=True)
+        res = self.r_serv_onion.zrange('crawler_history_{}:{}:{}'.format(self.type, self.domain, self.port), 0, 0, withscores=True)
         if res:
             res = res[0]
             return {'root_item':res[0], 'epoch':res[1]}
@@ -118,26 +119,31 @@
             return {}
 
     def get_last_crawled(self):
-        res = self.r_serv_onion.zrevrange('crawler_history_{}:{}'.format(self.type, self.domain), 0, 0, withscores=True)
+        res = self.r_serv_onion.zrevrange('crawler_history_{}:{}:{}'.format(self.type, self.domain, self.port), 0, 0, withscores=True)
         if res:
-            res = res[0]
             return {'root_item':res[0], 'epoch':res[1]}
         else:
             return {}
 
     #todo use the right paste
-    def get_last_crawled_pastes(self, epoch=None):
-        if epoch is None:
-            list_root = self.r_serv_onion.zrevrange('crawler_history_{}:{}'.format(self.type, self.domain), 0, 0)
-        else:
-            list_root = self.r_serv_onion.zrevrangebyscore('crawler_history_{}:{}'.format(self.type, self.domain), int(epoch), int(epoch))
-        if list_root:
-            return self.get_all_pastes_domain(list_root[0])
-        else:
-            if epoch:
-                return self.get_last_crawled_pastes()
-            else:
-                return list_root
+    def get_domain_crawled_core_item(self, epoch=None):
+        core_item = {}
+        if epoch:
+            list_root = self.r_serv_onion.zrevrangebyscore('crawler_history_{}:{}:{}'.format(self.type, self.domain, self.port), int(epoch), int(epoch))
+            if list_root:
+                core_item['root_item'] = list_root[0]
+                core_item['epoch'] = epoch
+                return core_item
+
+        # no history found for this epoch
+        if not core_item:
+            return self.get_last_crawled()
+
+    #todo use the right paste
+    def get_last_crawled_pastes(self, item_root=None):
+        if item_root is None:
+            item_root = self.get_domain_crawled_core_item().get('root_item')
+        return self.get_all_pastes_domain(item_root)
 
     def get_all_pastes_domain(self, root_item):
         if root_item is None:
diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py
index 3800c0bc..4d48e0b3 100644
--- a/bin/torcrawler/TorSplashCrawler.py
+++ b/bin/torcrawler/TorSplashCrawler.py
@@ -46,19 +46,20 @@ class TorSplashCrawler():
             'DEPTH_LIMIT': crawler_options['depth_limit']
             })
 
-    def crawl(self, type, crawler_options, date, url, domain, original_item):
-        self.process.crawl(self.crawler, type=type, crawler_options=crawler_options, date=date, url=url, domain=domain,original_item=original_item)
+    def crawl(self, type, crawler_options, date, url, domain, port, original_item):
+        self.process.crawl(self.crawler, type=type, crawler_options=crawler_options, date=date, url=url, domain=domain, port=port, original_item=original_item)
         self.process.start()
 
     class TorSplashSpider(Spider):
         name = 'TorSplashSpider'
 
-        def __init__(self, type, crawler_options, date, url, domain, original_item, *args, **kwargs):
+        def __init__(self, type, crawler_options, date, url, domain, port, original_item, *args, **kwargs):
            self.type = type
            self.original_item = original_item
            self.root_key = None
            self.start_urls = url
            self.domains = [domain]
+           self.port = str(port)
            date_str = '{}/{}/{}'.format(date['date_day'][0:4], date['date_day'][4:6], date['date_day'][6:8])
            self.full_date = date['date_day']
            self.date_month = date['date_month']
@@ -153,13 +154,22 @@ class TorSplashCrawler():
                 if self.root_key is None:
                     self.root_key = relative_filename_paste
                     # Create/Update crawler history
-                    self.r_serv_onion.zadd('crawler_history_{}:{}'.format(self.type, self.domains[0]), self.date_epoch, self.root_key)
+                    self.r_serv_onion.zadd('crawler_history_{}:{}:{}'.format(self.type, self.domains[0], self.port), self.date_epoch, self.root_key)
+                    # Update domain port number
+                    all_domain_ports = self.r_serv_onion.hget('{}_metadata:{}'.format(self.type, self.domains[0]), 'ports')
+                    if all_domain_ports:
+                        all_domain_ports = all_domain_ports.split(';')
+                    else:
+                        all_domain_ports = []
+                    if self.port not in all_domain_ports:
+                        all_domain_ports.append(self.port)
+                    self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'ports', ';'.join(all_domain_ports))
 
                 #create paste metadata
-                self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.root_key)
-                self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['father'])
-                self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'domain', self.domains[0])
-                self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'real_link', response.url)
+                self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'super_father', self.root_key)
+                self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'father', response.meta['father'])
+                self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'domain', '{}:{}'.format(self.domains[0], self.port))
+                self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'real_link', response.url)
 
                 self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], filename_paste)
@@ -206,6 +216,10 @@ class TorSplashCrawler():
                 self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
                 time.sleep(10)
+                if response:
+                    response_root_key = response.meta['root_key']
+                else:
+                    response_root_key = None
                 yield SplashRequest(
                     url,
                     self.parse,
diff --git a/bin/torcrawler/tor_crawler.py b/bin/torcrawler/tor_crawler.py
index 480bbe34..13a67545 100755
--- a/bin/torcrawler/tor_crawler.py
+++ b/bin/torcrawler/tor_crawler.py
@@ -39,6 +39,7 @@ if __name__ == '__main__':
     service_type = crawler_json['service_type']
     url = crawler_json['url']
     domain = crawler_json['domain']
+    port = crawler_json['port']
     original_item = crawler_json['item']
     crawler_options = crawler_json['crawler_options']
     date = crawler_json['date']
@@ -46,4 +47,4 @@ if __name__ == '__main__':
     redis_cache.delete('crawler_request:{}'.format(uuid))
 
     crawler = TorSplashCrawler(splash_url, crawler_options)
-    crawler.crawl(service_type, crawler_options, date, url, domain, original_item)
+    crawler.crawl(service_type, crawler_options, date, url, domain, port, original_item)
diff --git a/var/www/modules/hiddenServices/Flask_hiddenServices.py b/var/www/modules/hiddenServices/Flask_hiddenServices.py
index d68c2eb5..78406368 100644
--- a/var/www/modules/hiddenServices/Flask_hiddenServices.py
+++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py
@@ -122,6 +122,13 @@ def get_last_crawled_domains_metadata(list_domains_crawled, date, type=None):
     list_crawled_metadata = []
     for domain_epoch in list_domains_crawled:
         domain, epoch = domain_epoch.rsplit(';', 1)
+        domain = domain.split(':')
+        if len(domain) == 1:
+            port = 80
+            domain = domain[0]
+        else:
+            port = domain[1]
+            domain = domain[0]
         metadata_domain = {}
         # get Domain type
         if type is None:
@@ -133,6 +140,7 @@ def get_last_crawled_domains_metadata(list_domains_crawled, date, type=None):
             metadata_domain['domain_name'] = '{}[...].{}'.format(domain_name[:40], tld_domain)
         else:
             metadata_domain['domain_name'] = domain
+        metadata_domain['port'] = port
         metadata_domain['epoch'] = epoch
         metadata_domain['last_check'] = r_serv_onion.hget('{}_metadata:{}'.format(type, domain), 'last_check')
         if metadata_domain['last_check'] is None:
@@ -384,56 +392,7 @@ def create_spider_splash():
 
     return redirect(url_for('hiddenServices.manual'))
 
-@hiddenServices.route("/hiddenServices/", methods=['GET'])
-def hiddenServices_page():
-    last_onions = r_serv_onion.lrange('last_onion', 0 ,-1)
-    list_onion = []
-
-    now = datetime.datetime.now()
-    date = now.strftime("%Y%m%d")
-
-    statDomains = {}
-    statDomains['domains_up'] = r_serv_onion.scard('onion_up:{}'.format(date))
-    statDomains['domains_down'] = r_serv_onion.scard('onion_down:{}'.format(date))
-    statDomains['total'] = statDomains['domains_up'] + statDomains['domains_down']
-    statDomains['domains_queue'] = r_serv_onion.scard('onion_domain_crawler_queue')
-
-    for onion in last_onions:
-        metadata_onion = {}
-        metadata_onion['domain'] = onion
-        metadata_onion['last_check'] = r_serv_onion.hget('onion_metadata:{}'.format(onion), 'last_check')
-        if metadata_onion['last_check'] is None:
-            metadata_onion['last_check'] = '********'
-        metadata_onion['first_seen'] = r_serv_onion.hget('onion_metadata:{}'.format(onion), 'first_seen')
-        if metadata_onion['first_seen'] is None:
-            metadata_onion['first_seen'] = '********'
-        if get_onion_status(onion, metadata_onion['last_check']):
-            metadata_onion['status_text'] = 'UP'
-            metadata_onion['status_color'] = 'Green'
-            metadata_onion['status_icon'] = 'fa-check-circle'
-        else:
-            metadata_onion['status_text'] = 'DOWN'
-            metadata_onion['status_color'] = 'Red'
-            metadata_onion['status_icon'] = 'fa-times-circle'
-        list_onion.append(metadata_onion)
-
-    crawler_metadata=[]
-    all_onion_crawler = r_cache.smembers('all_crawler:onion')
-    for crawler in all_onion_crawler:
-        crawling_domain = r_cache.hget('metadata_crawler:{}'.format(crawler), 'crawling_domain')
-        started_time = r_cache.hget('metadata_crawler:{}'.format(crawler), 'started_time')
-        status_info = r_cache.hget('metadata_crawler:{}'.format(crawler), 'status')
-        crawler_info = '{} - {}'.format(crawler, started_time)
-        if status_info=='Waiting' or status_info=='Crawling':
-            status=True
-        else:
-            status=False
-        crawler_metadata.append({'crawler_info': crawler_info, 'crawling_domain': crawling_domain, 'status_info': status_info, 'status': status})
-
-    date_string = '{}-{}-{}'.format(date[0:4], date[4:6], date[6:8])
-    return render_template("hiddenServices.html", last_onions=list_onion, statDomains=statDomains,
-                            crawler_metadata=crawler_metadata, date_from=date_string, date_to=date_string)
-
+# # TODO: refactor
 @hiddenServices.route("/hiddenServices/last_crawled_domains_with_stats_json", methods=['GET'])
 def last_crawled_domains_with_stats_json():
     last_onions = r_serv_onion.lrange('last_onion', 0 ,-1)
@@ -571,10 +530,19 @@ def show_domains_by_daterange():
                             date_from=date_from, date_to=date_to, domains_up=domains_up, domains_down=domains_down,
                             domains_tags=domains_tags, bootstrap_label=bootstrap_label)
 
-@hiddenServices.route("/hiddenServices/show_domain", methods=['GET'])
+@hiddenServices.route("/crawlers/show_domain", methods=['GET'])
 def show_domain():
     domain = request.args.get('domain')
     epoch = request.args.get('epoch')
+    try:
+        epoch = int(epoch)
+    except:
+        epoch = None
+    port = request.args.get('port')
+    try:
+        port = int(port)
+    except:
+        port = 80
     type = get_type_domain(domain)
     if domain is None or not r_serv_onion.exists('{}_metadata:{}'.format(type, domain)):
         return '404'
@@ -590,16 +558,16 @@ def show_domain():
         first_seen = '{}/{}/{}'.format(first_seen[0:4], first_seen[4:6], first_seen[6:8])
     origin_paste = r_serv_onion.hget('{}_metadata:{}'.format(type, domain), 'paste_parent')
 
-    h = HiddenServices(domain, type)
-    last_crawled_time = h.get_last_crawled()
-    if 'epoch' in last_crawled_time:
-        last_check = '{} - {}'.format(last_check, time.strftime('%H:%M.%S', time.gmtime(last_crawled_time['epoch'])))
-    l_pastes = h.get_last_crawled_pastes(epoch=epoch)
+    h = HiddenServices(domain, type, port=port)
+    item_core = h.get_domain_crawled_core_item(epoch=epoch)
+    epoch = item_core['epoch']
+    l_pastes = h.get_last_crawled_pastes(item_root=item_core['root_item'])
     dict_links = h.get_all_links(l_pastes)
     if l_pastes:
         status = True
     else:
         status = False
+    last_check = '{} - {}'.format(last_check, time.strftime('%H:%M.%S', time.gmtime(epoch)))
     screenshot = h.get_domain_random_screenshot(l_pastes)
     if screenshot:
         screenshot = screenshot[0]
diff --git a/var/www/modules/hiddenServices/templates/Crawler_Splash_last_by_type.html b/var/www/modules/hiddenServices/templates/Crawler_Splash_last_by_type.html
index ffa09fb0..6c3bab49 100644
--- a/var/www/modules/hiddenServices/templates/Crawler_Splash_last_by_type.html
+++ b/var/www/modules/hiddenServices/templates/Crawler_Splash_last_by_type.html
@@ -61,8 +61,11 @@
       {% for metadata_domain in last_domains %}
-
-        {{ metadata_domain['domain_name'] }}
+
+        {{ metadata_domain['domain_name'] }}
         {{'{}/{}/{}'.format(metadata_domain['first_seen'][0:4], metadata_domain['first_seen'][4:6], metadata_domain['first_seen'][6:8])}}
         {{'{}/{}/{}'.format(metadata_domain['last_check'][0:4], metadata_domain['last_check'][4:6], metadata_domain['last_check'][6:8])}}
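
The OVERVIEW.md tables above describe the per-port key layout this patch introduces: `crawler_history_<service type>:<domain>:<port>` sorted sets, a `ports` field on `<service type>_metadata:<domain>`, and a `<domain>:<port>` value in `paste_metadata`. Below is a minimal sketch of how a consumer could read that layout back with redis-py; the host, port, database number, and helper name are illustrative assumptions, not part of the patch.

```python
# Illustrative sketch only: reads the per-port crawler history keys described in
# OVERVIEW.md. The connection parameters and helper name below are assumptions.
import redis

# DB9 - Crawler (assumed to live in the ARDB instance used by AIL)
r_onion = redis.StrictRedis(host='localhost', port=6382, db=9, decode_responses=True)

def get_last_root_items(service_type, domain):
    """Return {port: (root_item, epoch)} for every port recorded for this domain."""
    ports = r_onion.hget('{}_metadata:{}'.format(service_type, domain), 'ports')
    ports = ports.split(';') if ports else []
    last_items = {}
    for port in ports:
        # crawler_history_<service type>:<domain>:<port> is a zset:
        # member = root item of a crawl, score = epoch of that crawl
        res = r_onion.zrevrange('crawler_history_{}:{}:{}'.format(service_type, domain, port),
                                0, 0, withscores=True)
        if res:
            root_item, epoch = res[0]
            last_items[port] = (root_item, int(epoch))
    return last_items

# Example (hypothetical domain):
# last_items = get_last_root_items('onion', 'example1234567890.onion')
```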