mirror of https://github.com/CIRCL/AIL-framework
chg: [Crawler] handle port: crawling + history
parent f4cdddbc7f
commit f64c385343

 18  OVERVIEW.md
@@ -57,6 +57,15 @@ Redis and ARDB overview
| ------ | ------ |
| *tag* | *paste* |

## DB7 - Metadata:

#### Crawled Items:
##### Hset:
| Key | Field | Value |
| ------ | ------ | ------ |
| paste_metadata:**item path** | super_father | **first url crawled** |
| | father | **item father** |
| | domain | **crawled domain**:**domain port** |

## DB9 - Crawler:
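The Hset rows added above record, for every crawled item, its original URL, its parent item, and now the crawled domain together with its port. A minimal redis-py sketch of reading that hash back; the connection settings and the item path below are placeholders, not values from this commit:

```python
import redis

# Placeholder connection; AIL resolves the real metadata DB host/port/db from packages/config.cfg
r_serv_metadata = redis.Redis(host='localhost', port=6382, db=7, decode_responses=True)

item_path = 'crawled/2019/01/01/example.onion/some-uuid'  # hypothetical item path

meta = r_serv_metadata.hgetall('paste_metadata:{}'.format(item_path))
print(meta.get('super_father'))  # first url crawled
print(meta.get('father'))        # parent item
print(meta.get('domain'))        # now stored as "<crawled domain>:<domain port>"

# The port can be split back out of the domain field:
if meta.get('domain'):
    domain, port = meta['domain'].rsplit(':', 1)
```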
@@ -65,19 +74,20 @@ Redis and ARDB overview
| ------ | ------ | ------ |
| **service type**:**domain** | first_seen | **date** |
| | last_check | **date** |
| | ports | **port**;**port**;**port** ... |
| | paste_parent | **parent last crawling (can be auto or manual)** |

##### Zset:
| Key | Field | Value |
| ------ | ------ | ------ |
| crawler_history_**service type**:**domain** | **item root (first crawled item)** | **epoch (seconds)** |
| crawler\_history\_**service type**:**domain** | **item root (first crawled item)** | **epoch (seconds)** |

##### Regular key:
##### Key:
| Key | Value |
| ------ | ------ |
| crawler_history_**service type**:**domain** | **json config** |
| crawler\_config:**crawler mode**:**service type**:**domain** | **json config** |

##### exemple json config:
###### exemple json config:
```json
{
"closespider_pagecount": 1,
@@ -167,8 +167,9 @@ def on_error_send_message_back_in_queue(type_service, domain, message):
    redis_crawler.sadd('{}_domain_crawler_queue'.format(type_service), domain)
    redis_crawler.sadd('{}_crawler_priority_queue'.format(type_service), message)

def crawl_onion(url, domain, message, crawler_config):
def crawl_onion(url, domain, port, type_service, message, crawler_config):
    crawler_config['url'] = url
    crawler_config['port'] = port
    print('Launching Crawler: {}'.format(url))

    r_cache.hset('metadata_crawler:{}'.format(splash_port), 'crawling_domain', domain)
@@ -261,7 +262,6 @@ if __name__ == '__main__':
    rotation_mode = ['onion', 'regular']
    default_proto_map = {'http': 80, 'https': 443}
    ######################################################## add ftp ???
    ################################################################### # TODO: port

    publisher.port = 6380
    publisher.channel = "Script"
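The default_proto_map added above suggests how a missing port could be filled in from the URL scheme. A hedged sketch of that idea; the helper name and the fallback behaviour are assumptions for illustration, not code from this commit:

```python
from urllib.parse import urlparse

default_proto_map = {'http': 80, 'https': 443}

def resolve_port(url):
    # Hypothetical helper: prefer an explicit port in the URL, otherwise fall
    # back to the default for the scheme (80 for http, 443 for https).
    parsed = urlparse(url)
    if parsed.port:
        return parsed.port
    return default_proto_map.get(parsed.scheme, 80)

print(resolve_port('http://example.onion'))        # 80
print(resolve_port('https://example.com:8443/x'))  # 8443
```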
@@ -331,6 +331,7 @@ if __name__ == '__main__':
    print('url: {}'.format(url_data['url']))
    print('domain: {}'.format(url_data['domain']))
    print('domain_url: {}'.format(url_data['domain_url']))
    print()

    # Check blacklist
    if not redis_crawler.sismember('blacklist_{}'.format(to_crawl['type_service']), url_data['domain']):
@@ -357,40 +358,33 @@ if __name__ == '__main__':

######################################################crawler strategy
    # CRAWL domain
    crawl_onion(url_data['url'], url_data['domain'], to_crawl['original_message'], crawler_config)
    crawl_onion(url_data['url'], url_data['domain'], url_data['port'], to_crawl['type_service'], to_crawl['original_message'], crawler_config)

# Default Crawler
else:
    # CRAWL domain
    crawl_onion(url_data['domain_url'], url_data['domain'], to_crawl['original_message'], crawler_config)
    crawl_onion(url_data['domain_url'], url_data['domain'], url_data['port'], to_crawl['type_service'], to_crawl['original_message'], crawler_config)
    #if url != domain_url and not is_domain_up_day(url_data['domain'], to_crawl['type_service'], date['date_day']):
    # crawl_onion(url_data['url'], url_data['domain'], to_crawl['original_message'])


################################################### handle port
# CRAWL with port
#if port is not None:
# crawl_onion('{}:{}'.format(domain_url, port), domain, message)
#### ####


# Save last_status day (DOWN)
if not is_domain_up_day(url_data['domain'], to_crawl['type_service'], date['date_day']):
    redis_crawler.sadd('{}_down:{}'.format(to_crawl['type_service'], date['date_day']), url_data['domain'])

# if domain was UP at least one time
if redis_crawler.exists('crawler_history_{}:{}'.format(to_crawl['type_service'], url_data['domain'])):
if redis_crawler.exists('crawler_history_{}:{}:{}'.format(to_crawl['type_service'], url_data['domain'], url_data['port'])):
    # add crawler history (if domain is down)
    if not redis_crawler.zrangebyscore('crawler_history_{}:{}'.format(to_crawl['type_service'], url_data['domain']), date['epoch'], date['epoch']):
    if not redis_crawler.zrangebyscore('crawler_history_{}:{}:{}'.format(to_crawl['type_service'], url_data['domain'], url_data['port']), date['epoch'], date['epoch']):
        # Domain is down
        redis_crawler.zadd('crawler_history_{}:{}'.format(to_crawl['type_service'], url_data['domain']), int(date['epoch']), int(date['epoch']))
        redis_crawler.zadd('crawler_history_{}:{}:{}'.format(to_crawl['type_service'], url_data['domain'], url_data['port']), int(date['epoch']), int(date['epoch']))

############################
# extract page content
############################

# update list, last crawled domains
redis_crawler.lpush('last_{}'.format(to_crawl['type_service']), '{};{}'.format(url_data['domain'], date['epoch']))
redis_crawler.lpush('last_{}'.format(to_crawl['type_service']), '{}:{};{}'.format(url_data['domain'], url_data['port'], date['epoch']))
redis_crawler.ltrim('last_{}'.format(to_crawl['type_service']), 0, 15)

#update crawler status
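After this change the crawler history is keyed per port (crawler_history_<service type>:<domain>:<port>), matching the DB9 documentation earlier in the diff. A minimal sketch of writing and reading such a key with redis-py (version 3+ zadd signature); the host, db number and values are placeholders, not values from this commit:

```python
import redis
import time

# Placeholder connection for the crawler DB (DB9 in OVERVIEW.md)
redis_crawler = redis.Redis(host='localhost', port=6382, db=9, decode_responses=True)

type_service, domain, port = 'onion', 'example.onion', 80
epoch = int(time.time())
root_item = 'crawled/2019/01/01/example.onion/root-uuid'  # placeholder root item

# Record one crawl of domain:port (member = root item of that crawl, score = epoch)
redis_crawler.zadd('crawler_history_{}:{}:{}'.format(type_service, domain, port), {root_item: epoch})

# Fetch the most recent crawl for this domain:port
last = redis_crawler.zrevrange('crawler_history_{}:{}:{}'.format(type_service, domain, port), 0, 0, withscores=True)
if last:
    root, when = last[0]
    print(root, int(when))
```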
@@ -37,7 +37,7 @@ class HiddenServices(object):

    """

    def __init__(self, domain, type):
    def __init__(self, domain, type, port=80):

        configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
        if not os.path.exists(configfile):
@@ -61,6 +61,7 @@ class HiddenServices(object):

        self.domain = domain
        self.type = type
        self.port = port
        self.tags = {}

        if type == 'onion' or type == 'regular':
@@ -110,7 +111,7 @@ class HiddenServices(object):
        self.tags[tag] = self.tags.get(tag, 0) + 1

    def get_first_crawled(self):
        res = self.r_serv_onion.zrange('crawler_history_{}:{}'.format(self.type, self.domain), 0, 0, withscores=True)
        res = self.r_serv_onion.zrange('crawler_history_{}:{}:{}'.format(self.type, self.domain, self.port), 0, 0, withscores=True)
        if res:
            res = res[0]
            return {'root_item':res[0], 'epoch':res[1]}
@@ -118,26 +119,31 @@ class HiddenServices(object):
            return {}

    def get_last_crawled(self):
        res = self.r_serv_onion.zrevrange('crawler_history_{}:{}'.format(self.type, self.domain), 0, 0, withscores=True)
        res = self.r_serv_onion.zrevrange('crawler_history_{}:{}:{}'.format(self.type, self.domain, self.port), 0, 0, withscores=True)
        if res:
            res = res[0]
            return {'root_item':res[0], 'epoch':res[1]}
        else:
            return {}

    #todo use the right paste
    def get_last_crawled_pastes(self, epoch=None):
        if epoch is None:
            list_root = self.r_serv_onion.zrevrange('crawler_history_{}:{}'.format(self.type, self.domain), 0, 0)
        else:
            list_root = self.r_serv_onion.zrevrangebyscore('crawler_history_{}:{}'.format(self.type, self.domain), int(epoch), int(epoch))
        if list_root:
            return self.get_all_pastes_domain(list_root[0])
        else:
            if epoch:
                return self.get_last_crawled_pastes()
            else:
                return list_root
    def get_domain_crawled_core_item(self, epoch=None):
        core_item = {}
        if epoch:
            list_root = self.r_serv_onion.zrevrangebyscore('crawler_history_{}:{}'.format(self.type, self.domain, self.port), int(epoch), int(epoch))
            if list_root:
                core_item['root_item'] = list_root[0]
                core_item['epoch'] = epoch
                return core_item

        # no history found for this epoch
        if not core_item:
            return self.get_last_crawled()

    #todo use the right paste
    def get_last_crawled_pastes(self, item_root=None):
        if item_root is None:
            item_root = self.get_domain_crawled_core_item(self)
        return self.get_all_pastes_domain(item_root)

    def get_all_pastes_domain(self, root_item):
        if root_item is None:
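A short usage sketch of the reworked HiddenServices accessors above; it assumes an AIL environment where the class can be imported, and the domain, port and epoch are placeholders:

```python
from HiddenServices import HiddenServices  # requires AIL_BIN / packages/config.cfg to be set up

h = HiddenServices('example.onion', 'onion', port=8080)

# Pick the crawl to display: the one matching a requested epoch, or the latest one otherwise
core_item = h.get_domain_crawled_core_item(epoch=1546300800)
if core_item:
    # root_item is the first item of that crawl, epoch its timestamp
    l_pastes = h.get_last_crawled_pastes(item_root=core_item.get('root_item'))
```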
@@ -46,19 +46,20 @@ class TorSplashCrawler():
            'DEPTH_LIMIT': crawler_options['depth_limit']
            })

    def crawl(self, type, crawler_options, date, url, domain, original_item):
        self.process.crawl(self.crawler, type=type, crawler_options=crawler_options, date=date, url=url, domain=domain,original_item=original_item)
    def crawl(self, type, crawler_options, date, url, domain, port, original_item):
        self.process.crawl(self.crawler, type=type, crawler_options=crawler_options, date=date, url=url, domain=domain, port=port, original_item=original_item)
        self.process.start()

    class TorSplashSpider(Spider):
        name = 'TorSplashSpider'

        def __init__(self, type, crawler_options, date, url, domain, original_item, *args, **kwargs):
        def __init__(self, type, crawler_options, date, url, domain, port, original_item, *args, **kwargs):
            self.type = type
            self.original_item = original_item
            self.root_key = None
            self.start_urls = url
            self.domains = [domain]
            self.port = str(port)
            date_str = '{}/{}/{}'.format(date['date_day'][0:4], date['date_day'][4:6], date['date_day'][6:8])
            self.full_date = date['date_day']
            self.date_month = date['date_month']
@@ -153,13 +154,22 @@ class TorSplashCrawler():
            if self.root_key is None:
                self.root_key = relative_filename_paste
                # Create/Update crawler history
                self.r_serv_onion.zadd('crawler_history_{}:{}'.format(self.type, self.domains[0]), self.date_epoch, self.root_key)
                self.r_serv_onion.zadd('crawler_history_{}:{}:{}'.format(self.type, self.domains[0], self.port), self.date_epoch, self.root_key)
                # Update domain port number
                all_domain_ports = self.r_serv_onion.hget('{}_metadata:{}'.format(self.type, self.domains[0]), 'ports')
                if all_domain_ports:
                    all_domain_ports = all_domain_ports.split(';')
                else:
                    all_domain_ports = []
                if self.port not in all_domain_ports:
                    all_domain_ports.append(self.port)
                    self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'ports', ';'.join(all_domain_ports))

            #create paste metadata
            self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.root_key)
            self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['father'])
            self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'domain', self.domains[0])
            self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'real_link', response.url)
            self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'super_father', self.root_key)
            self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'father', response.meta['father'])
            self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'domain', '{}:{}'.format(self.domains[0], self.port))
            self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'real_link', response.url)

            self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], filename_paste)
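Besides the history key, the block above starts accumulating every crawled port in the 'ports' field of <service type>_metadata:<domain>, as a ;-separated list (see the DB9 table earlier). The same merge logic in isolation, as a standalone hypothetical helper:

```python
def merge_port(all_domain_ports, new_port):
    # all_domain_ports: current value of the 'ports' hash field, e.g. "80;8080" or None
    ports = all_domain_ports.split(';') if all_domain_ports else []
    if new_port not in ports:
        ports.append(new_port)
    return ';'.join(ports)

print(merge_port(None, '80'))        # "80"
print(merge_port('80', '8080'))      # "80;8080"
print(merge_port('80;8080', '80'))   # "80;8080" (no duplicate)
```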
@@ -206,6 +216,10 @@ class TorSplashCrawler():

            self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
            time.sleep(10)
            if response:
                response_root_key = response.meta['root_key']
            else:
                response_root_key = None
            yield SplashRequest(
                url,
                self.parse,
@@ -39,6 +39,7 @@ if __name__ == '__main__':
    service_type = crawler_json['service_type']
    url = crawler_json['url']
    domain = crawler_json['domain']
    port = crawler_json['port']
    original_item = crawler_json['item']
    crawler_options = crawler_json['crawler_options']
    date = crawler_json['date']
@@ -46,4 +47,4 @@ if __name__ == '__main__':
    redis_cache.delete('crawler_request:{}'.format(uuid))

    crawler = TorSplashCrawler(splash_url, crawler_options)
    crawler.crawl(service_type, crawler_options, date, url, domain, original_item)
    crawler.crawl(service_type, crawler_options, date, url, domain, port, original_item)
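The launcher above now also reads a port field from the crawler_request:<uuid> payload. A hedged sketch of what such a payload could look like; only the key names come from the code in this diff, the values are illustrative:

```python
import json

crawler_request = {
    'service_type': 'onion',
    'url': 'http://example.onion:8080',
    'domain': 'example.onion',
    'port': 8080,
    'item': 'crawled/2019/01/01/example.onion/parent-uuid',          # original_item
    'crawler_options': {'depth_limit': 1, 'closespider_pagecount': 1},  # illustrative options
    'date': {'date_day': '20190101', 'date_month': '201901', 'epoch': 1546300800},
}

print(json.dumps(crawler_request, indent=2))
```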
@@ -122,6 +122,13 @@ def get_last_crawled_domains_metadata(list_domains_crawled, date, type=None):
    list_crawled_metadata = []
    for domain_epoch in list_domains_crawled:
        domain, epoch = domain_epoch.rsplit(';', 1)
        domain = domain.split(':')
        if len(domain) == 1:
            port = 80
            domain = domain[0]
        else:
            port = domain[1]
            domain = domain[0]
        metadata_domain = {}
        # get Domain type
        if type is None:
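The parsing added above expects last_<service type> entries of the form <domain>:<port>;<epoch>, while still accepting the old <domain>;<epoch> form with a default port of 80. The same split logic as a standalone hypothetical helper:

```python
def parse_last_entry(domain_epoch):
    # e.g. "example.onion:8080;1546300800" or legacy "example.onion;1546300800"
    domain, epoch = domain_epoch.rsplit(';', 1)
    parts = domain.split(':')
    if len(parts) == 1:
        return parts[0], 80, epoch
    return parts[0], parts[1], epoch

print(parse_last_entry('example.onion:8080;1546300800'))  # ('example.onion', '8080', '1546300800')
print(parse_last_entry('example.onion;1546300800'))       # ('example.onion', 80, '1546300800')
```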
@@ -133,6 +140,7 @@ def get_last_crawled_domains_metadata(list_domains_crawled, date, type=None):
            metadata_domain['domain_name'] = '{}[...].{}'.format(domain_name[:40], tld_domain)
        else:
            metadata_domain['domain_name'] = domain
        metadata_domain['port'] = port
        metadata_domain['epoch'] = epoch
        metadata_domain['last_check'] = r_serv_onion.hget('{}_metadata:{}'.format(type, domain), 'last_check')
        if metadata_domain['last_check'] is None:
@@ -384,56 +392,7 @@ def create_spider_splash():

    return redirect(url_for('hiddenServices.manual'))

@hiddenServices.route("/hiddenServices/", methods=['GET'])
def hiddenServices_page():
    last_onions = r_serv_onion.lrange('last_onion', 0 ,-1)
    list_onion = []

    now = datetime.datetime.now()
    date = now.strftime("%Y%m%d")

    statDomains = {}
    statDomains['domains_up'] = r_serv_onion.scard('onion_up:{}'.format(date))
    statDomains['domains_down'] = r_serv_onion.scard('onion_down:{}'.format(date))
    statDomains['total'] = statDomains['domains_up'] + statDomains['domains_down']
    statDomains['domains_queue'] = r_serv_onion.scard('onion_domain_crawler_queue')

    for onion in last_onions:
        metadata_onion = {}
        metadata_onion['domain'] = onion
        metadata_onion['last_check'] = r_serv_onion.hget('onion_metadata:{}'.format(onion), 'last_check')
        if metadata_onion['last_check'] is None:
            metadata_onion['last_check'] = '********'
        metadata_onion['first_seen'] = r_serv_onion.hget('onion_metadata:{}'.format(onion), 'first_seen')
        if metadata_onion['first_seen'] is None:
            metadata_onion['first_seen'] = '********'
        if get_onion_status(onion, metadata_onion['last_check']):
            metadata_onion['status_text'] = 'UP'
            metadata_onion['status_color'] = 'Green'
            metadata_onion['status_icon'] = 'fa-check-circle'
        else:
            metadata_onion['status_text'] = 'DOWN'
            metadata_onion['status_color'] = 'Red'
            metadata_onion['status_icon'] = 'fa-times-circle'
        list_onion.append(metadata_onion)

    crawler_metadata=[]
    all_onion_crawler = r_cache.smembers('all_crawler:onion')
    for crawler in all_onion_crawler:
        crawling_domain = r_cache.hget('metadata_crawler:{}'.format(crawler), 'crawling_domain')
        started_time = r_cache.hget('metadata_crawler:{}'.format(crawler), 'started_time')
        status_info = r_cache.hget('metadata_crawler:{}'.format(crawler), 'status')
        crawler_info = '{} - {}'.format(crawler, started_time)
        if status_info=='Waiting' or status_info=='Crawling':
            status=True
        else:
            status=False
        crawler_metadata.append({'crawler_info': crawler_info, 'crawling_domain': crawling_domain, 'status_info': status_info, 'status': status})

    date_string = '{}-{}-{}'.format(date[0:4], date[4:6], date[6:8])
    return render_template("hiddenServices.html", last_onions=list_onion, statDomains=statDomains,
        crawler_metadata=crawler_metadata, date_from=date_string, date_to=date_string)

# # TODO: refractor
@hiddenServices.route("/hiddenServices/last_crawled_domains_with_stats_json", methods=['GET'])
def last_crawled_domains_with_stats_json():
    last_onions = r_serv_onion.lrange('last_onion', 0 ,-1)
@@ -571,10 +530,19 @@ def show_domains_by_daterange():
        date_from=date_from, date_to=date_to, domains_up=domains_up, domains_down=domains_down,
        domains_tags=domains_tags, bootstrap_label=bootstrap_label)

@hiddenServices.route("/hiddenServices/show_domain", methods=['GET'])
@hiddenServices.route("/crawlers/show_domain", methods=['GET'])
def show_domain():
    domain = request.args.get('domain')
    epoch = request.args.get('epoch')
    try:
        epoch = int(epoch)
    except:
        epoch = None
    port = request.args.get('port')
    try:
        port = int(port)
    except:
        port = 80
    type = get_type_domain(domain)
    if domain is None or not r_serv_onion.exists('{}_metadata:{}'.format(type, domain)):
        return '404'
@@ -590,16 +558,16 @@ def show_domain():
        first_seen = '{}/{}/{}'.format(first_seen[0:4], first_seen[4:6], first_seen[6:8])
    origin_paste = r_serv_onion.hget('{}_metadata:{}'.format(type, domain), 'paste_parent')

    h = HiddenServices(domain, type)
    last_crawled_time = h.get_last_crawled()
    if 'epoch' in last_crawled_time:
        last_check = '{} - {}'.format(last_check, time.strftime('%H:%M.%S', time.gmtime(last_crawled_time['epoch'])))
    l_pastes = h.get_last_crawled_pastes(epoch=epoch)
    h = HiddenServices(domain, type, port=port)
    item_core = h.get_domain_crawled_core_item(epoch=epoch)
    epoch = item_core['epoch']
    l_pastes = h.get_last_crawled_pastes(item_root=item_core['root_item'])
    dict_links = h.get_all_links(l_pastes)
    if l_pastes:
        status = True
    else:
        status = False
    last_check = '{} - {}'.format(last_check, time.strftime('%H:%M.%S', time.gmtime(epoch)))
    screenshot = h.get_domain_random_screenshot(l_pastes)
    if screenshot:
        screenshot = screenshot[0]
@@ -61,8 +61,11 @@
        </thead>
        <tbody id="tbody_last_crawled">
        {% for metadata_domain in last_domains %}
        <tr data-toggle="popover" data-trigger="hover" title="<span class='badge badge-dark'>{{metadata_domain['domain']}}</span>" data-content="{{metadata_domain['epoch']}}">
          <td><a target="_blank" href="{{ url_for('hiddenServices.show_domain') }}?domain={{ metadata_domain['domain'] }}&epoch={{metadata_domain['epoch']}}">{{ metadata_domain['domain_name'] }}</a></td>
        <tr data-toggle="popover" data-trigger="hover"
            title="<span class='badge badge-dark'>{{metadata_domain['domain']}}</span>"
            data-content="port: <span class='badge badge-secondary'>{{metadata_domain['port']}}</span><br>
            epoch: {{metadata_domain['epoch']}}">
          <td><a target="_blank" href="{{ url_for('hiddenServices.show_domain') }}?domain={{ metadata_domain['domain'] }}&port={{metadata_domain['port']}}&epoch={{metadata_domain['epoch']}}">{{ metadata_domain['domain_name'] }}</a></td>
          <td>{{'{}/{}/{}'.format(metadata_domain['first_seen'][0:4], metadata_domain['first_seen'][4:6], metadata_domain['first_seen'][6:8])}}</td>
          <td>{{'{}/{}/{}'.format(metadata_domain['last_check'][0:4], metadata_domain['last_check'][4:6], metadata_domain['last_check'][6:8])}}</td>
          <td><div style="color:{{metadata_domain['status_color']}}; display:inline-block">