mirror of https://github.com/CIRCL/AIL-framework
chg: [Crawler] handle port: crawling + history
parent f4cdddbc7f
commit f64c385343

OVERVIEW.md (18 lines changed)
@@ -57,6 +57,15 @@ Redis and ARDB overview
 | ------ | ------ |
 | *tag* | *paste* |

+## DB7 - Metadata:
+
+#### Crawled Items:
+##### Hset:
+| Key | Field | Value |
+| ------ | ------ | ------ |
+| paste_metadata:**item path** | super_father | **first url crawled** |
+| | father | **item father** |
+| | domain | **crawled domain**:**domain port** |

 ## DB9 - Crawler:

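The DB7 hash above records, for every crawled item, the crawl that produced it; note that the `domain` field now also carries the port. A minimal read sketch with redis-py, assuming placeholder connection settings and item path (AIL actually resolves these from `packages/config.cfg`):

```python
import redis

# Placeholder connection; the real host/port/db come from packages/config.cfg.
r_serv_metadata = redis.StrictRedis(host='localhost', port=6382, db=7, decode_responses=True)

item_path = 'crawled/2019/01/02/example.onion/some-uuid'   # placeholder item path

# Fields documented in the DB7 table above.
meta = r_serv_metadata.hgetall('paste_metadata:{}'.format(item_path))
print(meta.get('super_father'))   # first url crawled (root item of that crawl)
print(meta.get('father'))         # direct parent item
print(meta.get('domain'))         # "<crawled domain>:<domain port>", e.g. "example.onion:80"
```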
@@ -65,19 +74,20 @@ Redis and ARDB overview
 | ------ | ------ | ------ |
 | **service type**:**domain** | first_seen | **date** |
 | | last_check | **date** |
+| | ports | **port**;**port**;**port** ... |
 | | paste_parent | **parent last crawling (can be auto or manual)** |

 ##### Zset:
 | Key | Field | Value |
 | ------ | ------ | ------ |
-| crawler_history_**service type**:**domain** | **item root (first crawled item)** | **epoch (seconds)** |
+| crawler\_history\_**service type**:**domain** | **item root (first crawled item)** | **epoch (seconds)** |

-##### Regular key:
+##### Key:
 | Key | Value |
 | ------ | ------ |
-| crawler_history_**service type**:**domain** | **json config** |
+| crawler\_config:**crawler mode**:**service type**:**domain** | **json config** |

-##### exemple json config:
+###### exemple json config:
 ```json
 {
     "closespider_pagecount": 1,
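The DB9 keys above hold per-domain crawler state: the metadata hash (now including a `ports` field), the per-domain crawl history zset, and a JSON crawler configuration. The crawler code later in this commit appends the crawled port to the history key. A hedged sketch with redis-py, using the 2.x-style `zadd` argument order seen in the AIL code; the connection settings, the example.onion domain and the `manual` crawler mode value are placeholders:

```python
import json
import time
import redis

r_serv_onion = redis.StrictRedis(host='localhost', port=6382, db=9, decode_responses=True)  # placeholder

service_type, domain, port = 'onion', 'example.onion', 80   # placeholders

# Hset: <service type>_metadata:<domain>, including the new 'ports' field
r_serv_onion.hset('{}_metadata:{}'.format(service_type, domain), 'first_seen', '20190101')
r_serv_onion.hset('{}_metadata:{}'.format(service_type, domain), 'last_check', '20190102')
r_serv_onion.hset('{}_metadata:{}'.format(service_type, domain), 'ports', '80;8080')

# Zset: crawl history, one root item per crawl, scored by epoch (port-suffixed key)
r_serv_onion.zadd('crawler_history_{}:{}:{}'.format(service_type, domain, port),
                  int(time.time()), 'crawled/2019/01/02/example.onion/root-uuid')

# Regular key: crawler_config:<crawler mode>:<service type>:<domain> -> json config
r_serv_onion.set('crawler_config:manual:{}:{}'.format(service_type, domain),
                 json.dumps({'closespider_pagecount': 1}))
```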
@@ -167,8 +167,9 @@ def on_error_send_message_back_in_queue(type_service, domain, message):
     redis_crawler.sadd('{}_domain_crawler_queue'.format(type_service), domain)
     redis_crawler.sadd('{}_crawler_priority_queue'.format(type_service), message)

-def crawl_onion(url, domain, message, crawler_config):
+def crawl_onion(url, domain, port, type_service, message, crawler_config):
     crawler_config['url'] = url
+    crawler_config['port'] = port
     print('Launching Crawler: {}'.format(url))

     r_cache.hset('metadata_crawler:{}'.format(splash_port), 'crawling_domain', domain)
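crawl_onion() now receives the port and the service type and copies the port into the crawler_config dict handed to the Splash crawler. A simplified, self-contained stand-in (the body below omits the real Splash/Redis plumbing; the url_data and to_crawl values are placeholders) to show how the new arguments flow:

```python
# Simplified stand-in for the updated crawl_onion() above; the real function
# also drives Splash and updates Redis, which is omitted here.
def crawl_onion(url, domain, port, type_service, message, crawler_config):
    crawler_config['url'] = url
    crawler_config['port'] = port
    print('Launching Crawler: {}'.format(url))
    return crawler_config

url_data = {'url': 'http://example.onion:8080/index.html',   # placeholder parsed URL
            'domain': 'example.onion',
            'port': 8080}
to_crawl = {'type_service': 'onion',
            'original_message': 'example.onion;some/item/path'}   # placeholder

config = crawl_onion(url_data['url'], url_data['domain'], url_data['port'],
                     to_crawl['type_service'], to_crawl['original_message'],
                     {'closespider_pagecount': 1})
print(config)   # {'closespider_pagecount': 1, 'url': '...', 'port': 8080}
```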
@@ -261,7 +262,6 @@ if __name__ == '__main__':
     rotation_mode = ['onion', 'regular']
     default_proto_map = {'http': 80, 'https': 443}
 ######################################################## add ftp ???
-################################################################### # TODO: port

     publisher.port = 6380
     publisher.channel = "Script"
@@ -331,6 +331,7 @@ if __name__ == '__main__':
             print('url: {}'.format(url_data['url']))
             print('domain: {}'.format(url_data['domain']))
             print('domain_url: {}'.format(url_data['domain_url']))
+            print()

             # Check blacklist
             if not redis_crawler.sismember('blacklist_{}'.format(to_crawl['type_service']), url_data['domain']):
@@ -357,40 +358,33 @@ if __name__ == '__main__':

                     ######################################################crawler strategy
                     # CRAWL domain
-                    crawl_onion(url_data['url'], url_data['domain'], to_crawl['original_message'], crawler_config)
+                    crawl_onion(url_data['url'], url_data['domain'], url_data['port'], to_crawl['type_service'], to_crawl['original_message'], crawler_config)

                 # Default Crawler
                 else:
                     # CRAWL domain
-                    crawl_onion(url_data['domain_url'], url_data['domain'], to_crawl['original_message'], crawler_config)
+                    crawl_onion(url_data['domain_url'], url_data['domain'], url_data['port'], to_crawl['type_service'], to_crawl['original_message'], crawler_config)
                     #if url != domain_url and not is_domain_up_day(url_data['domain'], to_crawl['type_service'], date['date_day']):
                     #    crawl_onion(url_data['url'], url_data['domain'], to_crawl['original_message'])

-
-                ################################################### handle port
-                # CRAWL with port
-                #if port is not None:
-                #    crawl_onion('{}:{}'.format(domain_url, port), domain, message)
-                ####  ####
-

                 # Save last_status day (DOWN)
                 if not is_domain_up_day(url_data['domain'], to_crawl['type_service'], date['date_day']):
                     redis_crawler.sadd('{}_down:{}'.format(to_crawl['type_service'], date['date_day']), url_data['domain'])

                 # if domain was UP at least one time
-                if redis_crawler.exists('crawler_history_{}:{}'.format(to_crawl['type_service'], url_data['domain'])):
+                if redis_crawler.exists('crawler_history_{}:{}:{}'.format(to_crawl['type_service'], url_data['domain'], url_data['port'])):
                     # add crawler history (if domain is down)
-                    if not redis_crawler.zrangebyscore('crawler_history_{}:{}'.format(to_crawl['type_service'], url_data['domain']), date['epoch'], date['epoch']):
+                    if not redis_crawler.zrangebyscore('crawler_history_{}:{}:{}'.format(to_crawl['type_service'], url_data['domain'], url_data['port']), date['epoch'], date['epoch']):
                         # Domain is down
-                        redis_crawler.zadd('crawler_history_{}:{}'.format(to_crawl['type_service'], url_data['domain']), int(date['epoch']), int(date['epoch']))
+                        redis_crawler.zadd('crawler_history_{}:{}:{}'.format(to_crawl['type_service'], url_data['domain'], url_data['port']), int(date['epoch']), int(date['epoch']))

                 ############################
                 # extract page content
                 ############################

                 # update list, last crawled domains
-                redis_crawler.lpush('last_{}'.format(to_crawl['type_service']), '{};{}'.format(url_data['domain'], date['epoch']))
+                redis_crawler.lpush('last_{}'.format(to_crawl['type_service']), '{}:{};{}'.format(url_data['domain'], url_data['port'], date['epoch']))
                 redis_crawler.ltrim('last_{}'.format(to_crawl['type_service']), 0, 15)

                 #update crawler status
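Two key formats change in the bookkeeping above: the crawl-history zset key gains a port component (crawler_history_<type>:<domain>:<port>) and entries pushed to last_<type> become '<domain>:<port>;<epoch>'. A hedged sketch of the same writes with redis-py (2.x-style zadd, as in the code above; the connection and domain values are placeholders):

```python
import time
import redis

redis_crawler = redis.StrictRedis(host='localhost', port=6382, db=9, decode_responses=True)  # placeholder

type_service, domain, port = 'onion', 'example.onion', 80   # placeholders
epoch = int(time.time())

# add one history point per epoch when the domain is down (member == score == epoch)
history_key = 'crawler_history_{}:{}:{}'.format(type_service, domain, port)
if not redis_crawler.zrangebyscore(history_key, epoch, epoch):
    redis_crawler.zadd(history_key, epoch, epoch)   # redis-py 2.x argument order

# last crawled domains now embed the port: "<domain>:<port>;<epoch>"
redis_crawler.lpush('last_{}'.format(type_service), '{}:{};{}'.format(domain, port, epoch))
redis_crawler.ltrim('last_{}'.format(type_service), 0, 15)
```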
@@ -37,7 +37,7 @@ class HiddenServices(object):

     """

-    def __init__(self, domain, type):
+    def __init__(self, domain, type, port=80):

         configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
         if not os.path.exists(configfile):
@@ -61,6 +61,7 @@ class HiddenServices(object):

         self.domain = domain
         self.type = type
+        self.port = port
         self.tags = {}

         if type == 'onion' or type == 'regular':
@@ -110,7 +111,7 @@ class HiddenServices(object):
             self.tags[tag] = self.tags.get(tag, 0) + 1

     def get_first_crawled(self):
-        res = self.r_serv_onion.zrange('crawler_history_{}:{}'.format(self.type, self.domain), 0, 0, withscores=True)
+        res = self.r_serv_onion.zrange('crawler_history_{}:{}:{}'.format(self.type, self.domain, self.port), 0, 0, withscores=True)
         if res:
             res = res[0]
             return {'root_item':res[0], 'epoch':res[1]}
@@ -118,26 +119,31 @@ class HiddenServices(object):
             return {}

     def get_last_crawled(self):
-        res = self.r_serv_onion.zrevrange('crawler_history_{}:{}'.format(self.type, self.domain), 0, 0, withscores=True)
+        res = self.r_serv_onion.zrevrange('crawler_history_{}:{}:{}'.format(self.type, self.domain, self.port), 0, 0, withscores=True)
         if res:
-            res = res[0]
             return {'root_item':res[0], 'epoch':res[1]}
         else:
             return {}

     #todo use the right paste
-    def get_last_crawled_pastes(self, epoch=None):
-        if epoch is None:
-            list_root = self.r_serv_onion.zrevrange('crawler_history_{}:{}'.format(self.type, self.domain), 0, 0)
-        else:
-            list_root = self.r_serv_onion.zrevrangebyscore('crawler_history_{}:{}'.format(self.type, self.domain), int(epoch), int(epoch))
-        if list_root:
-            return self.get_all_pastes_domain(list_root[0])
-        else:
-            if epoch:
-                return self.get_last_crawled_pastes()
-            else:
-                return list_root
+    def get_domain_crawled_core_item(self, epoch=None):
+        core_item = {}
+        if epoch:
+            list_root = self.r_serv_onion.zrevrangebyscore('crawler_history_{}:{}'.format(self.type, self.domain, self.port), int(epoch), int(epoch))
+            if list_root:
+                core_item['root_item'] = list_root[0]
+                core_item['epoch'] = epoch
+                return core_item
+
+        # no history found for this epoch
+        if not core_item:
+            return self.get_last_crawled()

+    #todo use the right paste
+    def get_last_crawled_pastes(self, item_root=None):
+        if item_root is None:
+            item_root = self.get_domain_crawled_core_item(self)
+        return self.get_all_pastes_domain(item_root)
+
     def get_all_pastes_domain(self, root_item):
         if root_item is None:
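A hedged usage sketch of the reworked HiddenServices helper: the constructor now takes the crawled port, get_domain_crawled_core_item() resolves an epoch to the root item of that crawl (falling back to the most recent crawl), and get_last_crawled_pastes() expects that root item. It assumes a configured AIL environment (AIL_BIN, config.cfg); the domain, port and epoch values are placeholders:

```python
from HiddenServices import HiddenServices   # requires an AIL environment (AIL_BIN set)

h = HiddenServices('example.onion', 'onion', port=8080)   # placeholder domain/port

# Resolve a specific crawl (epoch in seconds) to its root item,
# or fall back to the latest crawl when the epoch is missing or unknown.
core_item = h.get_domain_crawled_core_item(epoch=1546387200)   # placeholder epoch
if core_item:
    l_pastes = h.get_last_crawled_pastes(item_root=core_item['root_item'])
    print(core_item['epoch'], len(l_pastes))
```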
@@ -46,19 +46,20 @@ class TorSplashCrawler():
             'DEPTH_LIMIT': crawler_options['depth_limit']
             })

-    def crawl(self, type, crawler_options, date, url, domain, original_item):
-        self.process.crawl(self.crawler, type=type, crawler_options=crawler_options, date=date, url=url, domain=domain,original_item=original_item)
+    def crawl(self, type, crawler_options, date, url, domain, port, original_item):
+        self.process.crawl(self.crawler, type=type, crawler_options=crawler_options, date=date, url=url, domain=domain, port=port, original_item=original_item)
         self.process.start()

     class TorSplashSpider(Spider):
         name = 'TorSplashSpider'

-        def __init__(self, type, crawler_options, date, url, domain, original_item, *args, **kwargs):
+        def __init__(self, type, crawler_options, date, url, domain, port, original_item, *args, **kwargs):
             self.type = type
             self.original_item = original_item
             self.root_key = None
             self.start_urls = url
             self.domains = [domain]
+            self.port = str(port)
             date_str = '{}/{}/{}'.format(date['date_day'][0:4], date['date_day'][4:6], date['date_day'][6:8])
             self.full_date = date['date_day']
             self.date_month = date['date_month']
@@ -153,13 +154,22 @@ class TorSplashCrawler():
             if self.root_key is None:
                 self.root_key = relative_filename_paste
                 # Create/Update crawler history
-                self.r_serv_onion.zadd('crawler_history_{}:{}'.format(self.type, self.domains[0]), self.date_epoch, self.root_key)
+                self.r_serv_onion.zadd('crawler_history_{}:{}:{}'.format(self.type, self.domains[0], self.port), self.date_epoch, self.root_key)
+                # Update domain port number
+                all_domain_ports = self.r_serv_onion.hget('{}_metadata:{}'.format(self.type, self.domains[0]), 'ports')
+                if all_domain_ports:
+                    all_domain_ports = all_domain_ports.split(';')
+                else:
+                    all_domain_ports = []
+                if self.port not in all_domain_ports:
+                    all_domain_ports.append(self.port)
+                    self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'ports', ';'.join(all_domain_ports))

             #create paste metadata
-            self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.root_key)
-            self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['father'])
-            self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'domain', self.domains[0])
-            self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'real_link', response.url)
+            self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'super_father', self.root_key)
+            self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'father', response.meta['father'])
+            self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'domain', '{}:{}'.format(self.domains[0], self.port))
+            self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'real_link', response.url)

             self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], filename_paste)

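The spider now keeps one ';'-separated ports field per domain in <type>_metadata:<domain> (ports are compared as strings, since self.port is str(port)). The merge logic, isolated as a small self-contained sketch:

```python
def merge_domain_ports(stored_ports, new_port):
    """Mirror of the ports update above: split the stored ';'-separated value,
    append the port if it is not yet recorded, and return what to write back."""
    all_domain_ports = stored_ports.split(';') if stored_ports else []
    if new_port not in all_domain_ports:
        all_domain_ports.append(new_port)
    return ';'.join(all_domain_ports)

print(merge_domain_ports('80;8080', '443'))   # 80;8080;443
print(merge_domain_ports(None, '80'))         # 80
```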
@@ -206,6 +216,10 @@ class TorSplashCrawler():

                 self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
                 time.sleep(10)
+                if response:
+                    response_root_key = response.meta['root_key']
+                else:
+                    response_root_key = None
                 yield SplashRequest(
                     url,
                     self.parse,
@@ -39,6 +39,7 @@ if __name__ == '__main__':
     service_type = crawler_json['service_type']
     url = crawler_json['url']
     domain = crawler_json['domain']
+    port = crawler_json['port']
     original_item = crawler_json['item']
     crawler_options = crawler_json['crawler_options']
     date = crawler_json['date']
@@ -46,4 +47,4 @@ if __name__ == '__main__':
     redis_cache.delete('crawler_request:{}'.format(uuid))

     crawler = TorSplashCrawler(splash_url, crawler_options)
-    crawler.crawl(service_type, crawler_options, date, url, domain, original_item)
+    crawler.crawl(service_type, crawler_options, date, url, domain, port, original_item)
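The crawler request that tor_crawler.py reads from crawler_request:<uuid> now carries a port next to the url and domain. A hedged sketch of such a request object, limited to the fields read above; every value is a placeholder:

```python
import json

crawler_request = {
    'service_type': 'onion',
    'url': 'http://example.onion:8080/',
    'domain': 'example.onion',
    'port': 8080,                                   # new field handled by this commit
    'item': 'crawled/2019/01/02/example.onion/some-uuid',
    'crawler_options': {'closespider_pagecount': 1},
    'date': {'date_day': '20190102', 'date_month': '201901', 'epoch': 1546387200},
}
print(json.dumps(crawler_request, indent=2))
```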
@@ -122,6 +122,13 @@ def get_last_crawled_domains_metadata(list_domains_crawled, date, type=None):
     list_crawled_metadata = []
     for domain_epoch in list_domains_crawled:
         domain, epoch = domain_epoch.rsplit(';', 1)
+        domain = domain.split(':')
+        if len(domain) == 1:
+            port = 80
+            domain = domain[0]
+        else:
+            port = domain[1]
+            domain = domain[0]
         metadata_domain = {}
         # get Domain type
         if type is None:
@@ -133,6 +140,7 @@ def get_last_crawled_domains_metadata(list_domains_crawled, date, type=None):
             metadata_domain['domain_name'] = '{}[...].{}'.format(domain_name[:40], tld_domain)
         else:
             metadata_domain['domain_name'] = domain
+        metadata_domain['port'] = port
         metadata_domain['epoch'] = epoch
         metadata_domain['last_check'] = r_serv_onion.hget('{}_metadata:{}'.format(type, domain), 'last_check')
         if metadata_domain['last_check'] is None:
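get_last_crawled_domains_metadata() now has to split last_<type> entries of the form '<domain>:<port>;<epoch>', defaulting to port 80 for entries written before this change. The same parsing as a standalone sketch:

```python
def split_domain_port_epoch(domain_epoch, default_port=80):
    """Parse a '<domain>:<port>;<epoch>' entry from last_<type>;
    older entries may lack the ':<port>' part."""
    domain, epoch = domain_epoch.rsplit(';', 1)
    parts = domain.split(':')
    if len(parts) == 1:
        return parts[0], default_port, epoch
    return parts[0], parts[1], epoch

print(split_domain_port_epoch('example.onion:8080;1546387200'))   # ('example.onion', '8080', '1546387200')
print(split_domain_port_epoch('example.onion;1546387200'))        # ('example.onion', 80, '1546387200')
```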
@@ -384,56 +392,7 @@ def create_spider_splash():

    return redirect(url_for('hiddenServices.manual'))

-@hiddenServices.route("/hiddenServices/", methods=['GET'])
-def hiddenServices_page():
-    last_onions = r_serv_onion.lrange('last_onion', 0 ,-1)
-    list_onion = []
-
-    now = datetime.datetime.now()
-    date = now.strftime("%Y%m%d")
-
-    statDomains = {}
-    statDomains['domains_up'] = r_serv_onion.scard('onion_up:{}'.format(date))
-    statDomains['domains_down'] = r_serv_onion.scard('onion_down:{}'.format(date))
-    statDomains['total'] = statDomains['domains_up'] + statDomains['domains_down']
-    statDomains['domains_queue'] = r_serv_onion.scard('onion_domain_crawler_queue')
-
-    for onion in last_onions:
-        metadata_onion = {}
-        metadata_onion['domain'] = onion
-        metadata_onion['last_check'] = r_serv_onion.hget('onion_metadata:{}'.format(onion), 'last_check')
-        if metadata_onion['last_check'] is None:
-            metadata_onion['last_check'] = '********'
-        metadata_onion['first_seen'] = r_serv_onion.hget('onion_metadata:{}'.format(onion), 'first_seen')
-        if metadata_onion['first_seen'] is None:
-            metadata_onion['first_seen'] = '********'
-        if get_onion_status(onion, metadata_onion['last_check']):
-            metadata_onion['status_text'] = 'UP'
-            metadata_onion['status_color'] = 'Green'
-            metadata_onion['status_icon'] = 'fa-check-circle'
-        else:
-            metadata_onion['status_text'] = 'DOWN'
-            metadata_onion['status_color'] = 'Red'
-            metadata_onion['status_icon'] = 'fa-times-circle'
-        list_onion.append(metadata_onion)
-
-    crawler_metadata=[]
-    all_onion_crawler = r_cache.smembers('all_crawler:onion')
-    for crawler in all_onion_crawler:
-        crawling_domain = r_cache.hget('metadata_crawler:{}'.format(crawler), 'crawling_domain')
-        started_time = r_cache.hget('metadata_crawler:{}'.format(crawler), 'started_time')
-        status_info = r_cache.hget('metadata_crawler:{}'.format(crawler), 'status')
-        crawler_info = '{} - {}'.format(crawler, started_time)
-        if status_info=='Waiting' or status_info=='Crawling':
-            status=True
-        else:
-            status=False
-        crawler_metadata.append({'crawler_info': crawler_info, 'crawling_domain': crawling_domain, 'status_info': status_info, 'status': status})
-
-    date_string = '{}-{}-{}'.format(date[0:4], date[4:6], date[6:8])
-    return render_template("hiddenServices.html", last_onions=list_onion, statDomains=statDomains,
-                            crawler_metadata=crawler_metadata, date_from=date_string, date_to=date_string)
-
+# # TODO: refractor
 @hiddenServices.route("/hiddenServices/last_crawled_domains_with_stats_json", methods=['GET'])
 def last_crawled_domains_with_stats_json():
     last_onions = r_serv_onion.lrange('last_onion', 0 ,-1)
@@ -571,10 +530,19 @@ def show_domains_by_daterange():
         date_from=date_from, date_to=date_to, domains_up=domains_up, domains_down=domains_down,
         domains_tags=domains_tags, bootstrap_label=bootstrap_label)

-@hiddenServices.route("/hiddenServices/show_domain", methods=['GET'])
+@hiddenServices.route("/crawlers/show_domain", methods=['GET'])
 def show_domain():
     domain = request.args.get('domain')
     epoch = request.args.get('epoch')
+    try:
+        epoch = int(epoch)
+    except:
+        epoch = None
+    port = request.args.get('port')
+    try:
+        port = int(port)
+    except:
+        port = 80
     type = get_type_domain(domain)
     if domain is None or not r_serv_onion.exists('{}_metadata:{}'.format(type, domain)):
         return '404'
@@ -590,16 +558,16 @@ def show_domain():
         first_seen = '{}/{}/{}'.format(first_seen[0:4], first_seen[4:6], first_seen[6:8])
     origin_paste = r_serv_onion.hget('{}_metadata:{}'.format(type, domain), 'paste_parent')

-    h = HiddenServices(domain, type)
-    last_crawled_time = h.get_last_crawled()
-    if 'epoch' in last_crawled_time:
-        last_check = '{} - {}'.format(last_check, time.strftime('%H:%M.%S', time.gmtime(last_crawled_time['epoch'])))
-    l_pastes = h.get_last_crawled_pastes(epoch=epoch)
+    h = HiddenServices(domain, type, port=port)
+    item_core = h.get_domain_crawled_core_item(epoch=epoch)
+    epoch = item_core['epoch']
+    l_pastes = h.get_last_crawled_pastes(item_root=item_core['root_item'])
     dict_links = h.get_all_links(l_pastes)
     if l_pastes:
         status = True
     else:
         status = False
+    last_check = '{} - {}'.format(last_check, time.strftime('%H:%M.%S', time.gmtime(epoch)))
     screenshot = h.get_domain_random_screenshot(l_pastes)
     if screenshot:
         screenshot = screenshot[0]
@@ -61,8 +61,11 @@
       </thead>
       <tbody id="tbody_last_crawled">
       {% for metadata_domain in last_domains %}
-      <tr data-toggle="popover" data-trigger="hover" title="<span class='badge badge-dark'>{{metadata_domain['domain']}}</span>" data-content="{{metadata_domain['epoch']}}">
-        <td><a target="_blank" href="{{ url_for('hiddenServices.show_domain') }}?domain={{ metadata_domain['domain'] }}&epoch={{metadata_domain['epoch']}}">{{ metadata_domain['domain_name'] }}</a></td>
+      <tr data-toggle="popover" data-trigger="hover"
+          title="<span class='badge badge-dark'>{{metadata_domain['domain']}}</span>"
+          data-content="port: <span class='badge badge-secondary'>{{metadata_domain['port']}}</span><br>
+                        epoch: {{metadata_domain['epoch']}}">
+        <td><a target="_blank" href="{{ url_for('hiddenServices.show_domain') }}?domain={{ metadata_domain['domain'] }}&port={{metadata_domain['port']}}&epoch={{metadata_domain['epoch']}}">{{ metadata_domain['domain_name'] }}</a></td>
       <td>{{'{}/{}/{}'.format(metadata_domain['first_seen'][0:4], metadata_domain['first_seen'][4:6], metadata_domain['first_seen'][6:8])}}</td>
       <td>{{'{}/{}/{}'.format(metadata_domain['last_check'][0:4], metadata_domain['last_check'][4:6], metadata_domain['last_check'][6:8])}}</td>
       <td><div style="color:{{metadata_domain['status_color']}}; display:inline-block">