chg: [Crawler] handle port: crawling + history

pull/342/head
Terrtia 2019-03-22 16:48:07 +01:00
parent f4cdddbc7f
commit f64c385343
7 changed files with 98 additions and 102 deletions

View File

@@ -57,6 +57,15 @@ Redis and ARDB overview
| ------ | ------ |
| *tag* | *paste* |
## DB7 - Metadata:
#### Crawled Items:
##### Hset:
| Key | Field | Value |
| ------ | ------ | ------ |
| paste_metadata:**item path** | super_father | **first url crawled** |
| | father | **item father** |
| | domain | **crawled domain**:**domain port** |
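For reference, a minimal redis-py sketch of what an item's metadata hash looks like once the `domain` field carries the port, matching the table above. The connection settings, item path and domain below are illustrative assumptions, not values from this commit:

```python
import redis

# DB7 holds item metadata (host/port/db here are illustrative, not the project's config)
r_metadata = redis.StrictRedis(host='localhost', port=6382, db=7, decode_responses=True)

item_path = 'crawled/2019/03/22/example.onion/some-item'   # hypothetical item path
root_item = 'crawled/2019/03/22/example.onion/root-item'   # first url crawled

r_metadata.hset('paste_metadata:{}'.format(item_path), 'super_father', root_item)
r_metadata.hset('paste_metadata:{}'.format(item_path), 'father', root_item)
# the crawled domain is now stored together with its port
r_metadata.hset('paste_metadata:{}'.format(item_path), 'domain', 'example.onion:8080')

# consumers can split the port back off the stored value
domain, port = r_metadata.hget('paste_metadata:{}'.format(item_path), 'domain').rsplit(':', 1)
```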
## DB9 - Crawler:
@@ -65,19 +74,20 @@ Redis and ARDB overview
| ------ | ------ | ------ |
| **service type**:**domain** | first_seen | **date** |
| | last_check | **date** |
| | ports | **port**;**port**;**port** ... |
| | paste_parent | **parent last crawling (can be auto or manual)** |
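A hedged sketch of the per-domain metadata hash described above, including the new ';'-separated `ports` field. The key format follows the code in this commit (`<service type>_metadata:<domain>`); connection settings and values are made up:

```python
import redis

# DB9 holds crawler metadata (illustrative connection settings)
r_crawler = redis.StrictRedis(host='localhost', port=6382, db=9, decode_responses=True)

meta_key = 'onion_metadata:example.onion'   # <service type>_metadata:<domain>

r_crawler.hset(meta_key, 'first_seen', '20190322')
r_crawler.hset(meta_key, 'last_check', '20190322')

# 'ports' is a ';'-separated list, extended whenever the domain is crawled on a new port
known_ports = r_crawler.hget(meta_key, 'ports')
known_ports = known_ports.split(';') if known_ports else []
if '8080' not in known_ports:
    known_ports.append('8080')
    r_crawler.hset(meta_key, 'ports', ';'.join(known_ports))
```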
##### Zset:
| Key | Field | Value |
| ------ | ------ | ------ |
| crawler_history_**service type**:**domain** | **item root (first crawled item)** | **epoch (seconds)** |
| crawler\_history\_**service type**:**domain** | **item root (first crawled item)** | **epoch (seconds)** |
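A minimal sketch of how this per-port history zset can be written and queried: the member is the root item of a crawl and the score its epoch, as in the table above. Note that the commit itself uses the older redis-py 2.x positional `zadd`; the sketch uses the redis-py 3+ mapping form, and the key and values are illustrative:

```python
import time
import redis

r_crawler = redis.StrictRedis(host='localhost', port=6382, db=9, decode_responses=True)

history_key = 'crawler_history_onion:example.onion:8080'   # crawler_history_<service type>:<domain>:<port>
root_item = 'crawled/2019/03/22/example.onion/root-item'   # first crawled item
epoch = int(time.time())

# record one crawl: member = root item, score = epoch (redis-py >= 3 syntax)
r_crawler.zadd(history_key, {root_item: epoch})

# most recent crawl of this domain on this port
last_root, last_epoch = r_crawler.zrevrange(history_key, 0, 0, withscores=True)[0]

# was anything recorded for a given epoch (e.g. a DOWN marker)?
hits = r_crawler.zrangebyscore(history_key, epoch, epoch)
```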
##### Regular key:
##### Key:
| Key | Value |
| ------ | ------ |
| crawler_history_**service type**:**domain** | **json config** |
| crawler\_config:**crawler mode**:**service type**:**domain** | **json config** |
##### example json config:
###### example json config:
```json
{
"closespider_pagecount": 1,

View File

@@ -167,8 +167,9 @@ def on_error_send_message_back_in_queue(type_service, domain, message):
redis_crawler.sadd('{}_domain_crawler_queue'.format(type_service), domain)
redis_crawler.sadd('{}_crawler_priority_queue'.format(type_service), message)
def crawl_onion(url, domain, message, crawler_config):
def crawl_onion(url, domain, port, type_service, message, crawler_config):
crawler_config['url'] = url
crawler_config['port'] = port
print('Launching Crawler: {}'.format(url))
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'crawling_domain', domain)
@@ -261,7 +262,6 @@ if __name__ == '__main__':
rotation_mode = ['onion', 'regular']
default_proto_map = {'http': 80, 'https': 443}
######################################################## add ftp ???
################################################################### # TODO: port
publisher.port = 6380
publisher.channel = "Script"
@@ -331,6 +331,7 @@ if __name__ == '__main__':
print('url: {}'.format(url_data['url']))
print('domain: {}'.format(url_data['domain']))
print('domain_url: {}'.format(url_data['domain_url']))
print()
# Check blacklist
if not redis_crawler.sismember('blacklist_{}'.format(to_crawl['type_service']), url_data['domain']):
@@ -357,40 +358,33 @@ if __name__ == '__main__':
######################################################crawler strategy
# CRAWL domain
crawl_onion(url_data['url'], url_data['domain'], to_crawl['original_message'], crawler_config)
crawl_onion(url_data['url'], url_data['domain'], url_data['port'], to_crawl['type_service'], to_crawl['original_message'], crawler_config)
# Default Crawler
else:
# CRAWL domain
crawl_onion(url_data['domain_url'], url_data['domain'], to_crawl['original_message'], crawler_config)
crawl_onion(url_data['domain_url'], url_data['domain'], url_data['port'], to_crawl['type_service'], to_crawl['original_message'], crawler_config)
#if url != domain_url and not is_domain_up_day(url_data['domain'], to_crawl['type_service'], date['date_day']):
# crawl_onion(url_data['url'], url_data['domain'], to_crawl['original_message'])
################################################### handle port
# CRAWL with port
#if port is not None:
# crawl_onion('{}:{}'.format(domain_url, port), domain, message)
#### ####
# Save last_status day (DOWN)
if not is_domain_up_day(url_data['domain'], to_crawl['type_service'], date['date_day']):
redis_crawler.sadd('{}_down:{}'.format(to_crawl['type_service'], date['date_day']), url_data['domain'])
# if domain was UP at least one time
if redis_crawler.exists('crawler_history_{}:{}'.format(to_crawl['type_service'], url_data['domain'])):
if redis_crawler.exists('crawler_history_{}:{}:{}'.format(to_crawl['type_service'], url_data['domain'], url_data['port'])):
# add crawler history (if domain is down)
if not redis_crawler.zrangebyscore('crawler_history_{}:{}'.format(to_crawl['type_service'], url_data['domain']), date['epoch'], date['epoch']):
if not redis_crawler.zrangebyscore('crawler_history_{}:{}:{}'.format(to_crawl['type_service'], url_data['domain'], url_data['port']), date['epoch'], date['epoch']):
# Domain is down
redis_crawler.zadd('crawler_history_{}:{}'.format(to_crawl['type_service'], url_data['domain']), int(date['epoch']), int(date['epoch']))
redis_crawler.zadd('crawler_history_{}:{}:{}'.format(to_crawl['type_service'], url_data['domain'], url_data['port']), int(date['epoch']), int(date['epoch']))
############################
# extract page content
############################
# update list, last crawled domains
redis_crawler.lpush('last_{}'.format(to_crawl['type_service']), '{};{}'.format(url_data['domain'], date['epoch']))
redis_crawler.lpush('last_{}'.format(to_crawl['type_service']), '{}:{};{}'.format(url_data['domain'], url_data['port'], date['epoch']))
redis_crawler.ltrim('last_{}'.format(to_crawl['type_service']), 0, 15)
#update crawler status
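Entries pushed to `last_<service type>` now read `domain:port;epoch` instead of `domain;epoch`, so consumers have to split on the last ';' and then peel the port off the domain. A small sketch of that parsing, mirroring what the Flask view later in this commit does; the helper name is hypothetical:

```python
def parse_last_crawled_entry(entry):
    """Split a 'domain:port;epoch' entry taken from the last_<service type> list."""
    domain_port, epoch = entry.rsplit(';', 1)
    domain, sep, port = domain_port.rpartition(':')
    if not sep:                 # old-style entry without an explicit port
        domain, port = domain_port, '80'
    return domain, int(port), int(epoch)

# parse_last_crawled_entry('example.onion:8080;1553266087') -> ('example.onion', 8080, 1553266087)
```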

View File

@@ -37,7 +37,7 @@ class HiddenServices(object):
"""
def __init__(self, domain, type):
def __init__(self, domain, type, port=80):
configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
if not os.path.exists(configfile):
@@ -61,6 +61,7 @@ class HiddenServices(object):
self.domain = domain
self.type = type
self.port = port
self.tags = {}
if type == 'onion' or type == 'regular':
@@ -110,7 +111,7 @@ class HiddenServices(object):
self.tags[tag] = self.tags.get(tag, 0) + 1
def get_first_crawled(self):
res = self.r_serv_onion.zrange('crawler_history_{}:{}'.format(self.type, self.domain), 0, 0, withscores=True)
res = self.r_serv_onion.zrange('crawler_history_{}:{}:{}'.format(self.type, self.domain, self.port), 0, 0, withscores=True)
if res:
res = res[0]
return {'root_item':res[0], 'epoch':res[1]}
@@ -118,26 +119,31 @@ class HiddenServices(object):
return {}
def get_last_crawled(self):
res = self.r_serv_onion.zrevrange('crawler_history_{}:{}'.format(self.type, self.domain), 0, 0, withscores=True)
res = self.r_serv_onion.zrevrange('crawler_history_{}:{}:{}'.format(self.type, self.domain, self.port), 0, 0, withscores=True)
if res:
res = res[0]
return {'root_item':res[0], 'epoch':res[1]}
else:
return {}
#todo use the right paste
def get_last_crawled_pastes(self, epoch=None):
if epoch is None:
list_root = self.r_serv_onion.zrevrange('crawler_history_{}:{}'.format(self.type, self.domain), 0, 0)
else:
list_root = self.r_serv_onion.zrevrangebyscore('crawler_history_{}:{}'.format(self.type, self.domain), int(epoch), int(epoch))
if list_root:
return self.get_all_pastes_domain(list_root[0])
else:
def get_domain_crawled_core_item(self, epoch=None):
core_item = {}
if epoch:
return self.get_last_crawled_pastes()
else:
return list_root
list_root = self.r_serv_onion.zrevrangebyscore('crawler_history_{}:{}'.format(self.type, self.domain, self.port), int(epoch), int(epoch))
if list_root:
core_item['root_item'] = list_root[0]
core_item['epoch'] = epoch
return core_item
# no history found for this epoch
if not core_item:
return self.get_last_crawled()
#todo use the right paste
def get_last_crawled_pastes(self, item_root=None):
if item_root is None:
item_root = self.get_domain_crawled_core_item(self)
return self.get_all_pastes_domain(item_root)
def get_all_pastes_domain(self, root_item):
if root_item is None:
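A hedged usage sketch of the reworked HiddenServices interface, assuming the class is importable as in AIL's `bin/packages`; the domain, port and epoch values are illustrative:

```python
from HiddenServices import HiddenServices

# the port now selects which per-port crawl history is queried (defaults to 80)
h = HiddenServices('example.onion', 'onion', port=8080)

# resolve the crawl to display: the requested epoch if it exists in the history,
# otherwise fall back to the most recent crawl
core_item = h.get_domain_crawled_core_item(epoch=1553266087)

# all items attached to that crawl, starting from its root item
l_pastes = h.get_last_crawled_pastes(item_root=core_item['root_item'])
links = h.get_all_links(l_pastes)
```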

View File

@@ -46,19 +46,20 @@ class TorSplashCrawler():
'DEPTH_LIMIT': crawler_options['depth_limit']
})
def crawl(self, type, crawler_options, date, url, domain, original_item):
self.process.crawl(self.crawler, type=type, crawler_options=crawler_options, date=date, url=url, domain=domain,original_item=original_item)
def crawl(self, type, crawler_options, date, url, domain, port, original_item):
self.process.crawl(self.crawler, type=type, crawler_options=crawler_options, date=date, url=url, domain=domain, port=port, original_item=original_item)
self.process.start()
class TorSplashSpider(Spider):
name = 'TorSplashSpider'
def __init__(self, type, crawler_options, date, url, domain, original_item, *args, **kwargs):
def __init__(self, type, crawler_options, date, url, domain, port, original_item, *args, **kwargs):
self.type = type
self.original_item = original_item
self.root_key = None
self.start_urls = url
self.domains = [domain]
self.port = str(port)
date_str = '{}/{}/{}'.format(date['date_day'][0:4], date['date_day'][4:6], date['date_day'][6:8])
self.full_date = date['date_day']
self.date_month = date['date_month']
@@ -153,13 +154,22 @@ class TorSplashCrawler():
if self.root_key is None:
self.root_key = relative_filename_paste
# Create/Update crawler history
self.r_serv_onion.zadd('crawler_history_{}:{}'.format(self.type, self.domains[0]), self.date_epoch, self.root_key)
self.r_serv_onion.zadd('crawler_history_{}:{}:{}'.format(self.type, self.domains[0], self.port), self.date_epoch, self.root_key)
# Update domain port number
all_domain_ports = self.r_serv_onion.hget('{}_metadata:{}'.format(self.type, self.domains[0]), 'ports')
if all_domain_ports:
all_domain_ports = all_domain_ports.split(';')
else:
all_domain_ports = []
if self.port not in all_domain_ports:
all_domain_ports.append(self.port)
self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'ports', ';'.join(all_domain_ports))
#create paste metadata
self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.root_key)
self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['father'])
self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'domain', self.domains[0])
self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'real_link', response.url)
self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'super_father', self.root_key)
self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'father', response.meta['father'])
self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'domain', '{}:{}'.format(self.domains[0], self.port))
self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'real_link', response.url)
self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], filename_paste)
@@ -206,6 +216,10 @@ class TorSplashCrawler():
self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
time.sleep(10)
if response:
response_root_key = response.meta['root_key']
else:
response_root_key = None
yield SplashRequest(
url,
self.parse,

View File

@@ -39,6 +39,7 @@ if __name__ == '__main__':
service_type = crawler_json['service_type']
url = crawler_json['url']
domain = crawler_json['domain']
port = crawler_json['port']
original_item = crawler_json['item']
crawler_options = crawler_json['crawler_options']
date = crawler_json['date']
@@ -46,4 +47,4 @@ if __name__ == '__main__':
redis_cache.delete('crawler_request:{}'.format(uuid))
crawler = TorSplashCrawler(splash_url, crawler_options)
crawler.crawl(service_type, crawler_options, date, url, domain, original_item)
crawler.crawl(service_type, crawler_options, date, url, domain, port, original_item)

View File

@@ -122,6 +122,13 @@ def get_last_crawled_domains_metadata(list_domains_crawled, date, type=None):
list_crawled_metadata = []
for domain_epoch in list_domains_crawled:
domain, epoch = domain_epoch.rsplit(';', 1)
domain = domain.split(':')
if len(domain) == 1:
port = 80
domain = domain[0]
else:
port = domain[1]
domain = domain[0]
metadata_domain = {}
# get Domain type
if type is None:
@@ -133,6 +140,7 @@ def get_last_crawled_domains_metadata(list_domains_crawled, date, type=None):
metadata_domain['domain_name'] = '{}[...].{}'.format(domain_name[:40], tld_domain)
else:
metadata_domain['domain_name'] = domain
metadata_domain['port'] = port
metadata_domain['epoch'] = epoch
metadata_domain['last_check'] = r_serv_onion.hget('{}_metadata:{}'.format(type, domain), 'last_check')
if metadata_domain['last_check'] is None:
@@ -384,56 +392,7 @@ def create_spider_splash():
return redirect(url_for('hiddenServices.manual'))
@hiddenServices.route("/hiddenServices/", methods=['GET'])
def hiddenServices_page():
last_onions = r_serv_onion.lrange('last_onion', 0 ,-1)
list_onion = []
now = datetime.datetime.now()
date = now.strftime("%Y%m%d")
statDomains = {}
statDomains['domains_up'] = r_serv_onion.scard('onion_up:{}'.format(date))
statDomains['domains_down'] = r_serv_onion.scard('onion_down:{}'.format(date))
statDomains['total'] = statDomains['domains_up'] + statDomains['domains_down']
statDomains['domains_queue'] = r_serv_onion.scard('onion_domain_crawler_queue')
for onion in last_onions:
metadata_onion = {}
metadata_onion['domain'] = onion
metadata_onion['last_check'] = r_serv_onion.hget('onion_metadata:{}'.format(onion), 'last_check')
if metadata_onion['last_check'] is None:
metadata_onion['last_check'] = '********'
metadata_onion['first_seen'] = r_serv_onion.hget('onion_metadata:{}'.format(onion), 'first_seen')
if metadata_onion['first_seen'] is None:
metadata_onion['first_seen'] = '********'
if get_onion_status(onion, metadata_onion['last_check']):
metadata_onion['status_text'] = 'UP'
metadata_onion['status_color'] = 'Green'
metadata_onion['status_icon'] = 'fa-check-circle'
else:
metadata_onion['status_text'] = 'DOWN'
metadata_onion['status_color'] = 'Red'
metadata_onion['status_icon'] = 'fa-times-circle'
list_onion.append(metadata_onion)
crawler_metadata=[]
all_onion_crawler = r_cache.smembers('all_crawler:onion')
for crawler in all_onion_crawler:
crawling_domain = r_cache.hget('metadata_crawler:{}'.format(crawler), 'crawling_domain')
started_time = r_cache.hget('metadata_crawler:{}'.format(crawler), 'started_time')
status_info = r_cache.hget('metadata_crawler:{}'.format(crawler), 'status')
crawler_info = '{} - {}'.format(crawler, started_time)
if status_info=='Waiting' or status_info=='Crawling':
status=True
else:
status=False
crawler_metadata.append({'crawler_info': crawler_info, 'crawling_domain': crawling_domain, 'status_info': status_info, 'status': status})
date_string = '{}-{}-{}'.format(date[0:4], date[4:6], date[6:8])
return render_template("hiddenServices.html", last_onions=list_onion, statDomains=statDomains,
crawler_metadata=crawler_metadata, date_from=date_string, date_to=date_string)
# # TODO: refractor
@hiddenServices.route("/hiddenServices/last_crawled_domains_with_stats_json", methods=['GET'])
def last_crawled_domains_with_stats_json():
last_onions = r_serv_onion.lrange('last_onion', 0 ,-1)
@@ -571,10 +530,19 @@ def show_domains_by_daterange():
date_from=date_from, date_to=date_to, domains_up=domains_up, domains_down=domains_down,
domains_tags=domains_tags, bootstrap_label=bootstrap_label)
@hiddenServices.route("/hiddenServices/show_domain", methods=['GET'])
@hiddenServices.route("/crawlers/show_domain", methods=['GET'])
def show_domain():
domain = request.args.get('domain')
epoch = request.args.get('epoch')
try:
epoch = int(epoch)
except:
epoch = None
port = request.args.get('port')
try:
port = int(port)
except:
port = 80
type = get_type_domain(domain)
if domain is None or not r_serv_onion.exists('{}_metadata:{}'.format(type, domain)):
return '404'
@@ -590,16 +558,16 @@ def show_domain():
first_seen = '{}/{}/{}'.format(first_seen[0:4], first_seen[4:6], first_seen[6:8])
origin_paste = r_serv_onion.hget('{}_metadata:{}'.format(type, domain), 'paste_parent')
h = HiddenServices(domain, type)
last_crawled_time = h.get_last_crawled()
if 'epoch' in last_crawled_time:
last_check = '{} - {}'.format(last_check, time.strftime('%H:%M.%S', time.gmtime(last_crawled_time['epoch'])))
l_pastes = h.get_last_crawled_pastes(epoch=epoch)
h = HiddenServices(domain, type, port=port)
item_core = h.get_domain_crawled_core_item(epoch=epoch)
epoch = item_core['epoch']
l_pastes = h.get_last_crawled_pastes(item_root=item_core['root_item'])
dict_links = h.get_all_links(l_pastes)
if l_pastes:
status = True
else:
status = False
last_check = '{} - {}'.format(last_check, time.strftime('%H:%M.%S', time.gmtime(epoch)))
screenshot = h.get_domain_random_screenshot(l_pastes)
if screenshot:
screenshot = screenshot[0]
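For reference, the reworked view receives the port and epoch as query parameters, falling back to port 80 and to the latest crawl when they are missing or invalid; an illustrative request (route path from this diff, domain and values made up) would look like `/crawlers/show_domain?domain=example.onion&port=8080&epoch=1553266087`.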

View File

@@ -61,8 +61,11 @@
</thead>
<tbody id="tbody_last_crawled">
{% for metadata_domain in last_domains %}
<tr data-toggle="popover" data-trigger="hover" title="<span class='badge badge-dark'>{{metadata_domain['domain']}}</span>" data-content="{{metadata_domain['epoch']}}">
<td><a target="_blank" href="{{ url_for('hiddenServices.show_domain') }}?domain={{ metadata_domain['domain'] }}&epoch={{metadata_domain['epoch']}}">{{ metadata_domain['domain_name'] }}</a></td>
<tr data-toggle="popover" data-trigger="hover"
title="<span class='badge badge-dark'>{{metadata_domain['domain']}}</span>"
data-content="port: <span class='badge badge-secondary'>{{metadata_domain['port']}}</span><br>
epoch: {{metadata_domain['epoch']}}">
<td><a target="_blank" href="{{ url_for('hiddenServices.show_domain') }}?domain={{ metadata_domain['domain'] }}&port={{metadata_domain['port']}}&epoch={{metadata_domain['epoch']}}">{{ metadata_domain['domain_name'] }}</a></td>
<td>{{'{}/{}/{}'.format(metadata_domain['first_seen'][0:4], metadata_domain['first_seen'][4:6], metadata_domain['first_seen'][6:8])}}</td>
<td>{{'{}/{}/{}'.format(metadata_domain['last_check'][0:4], metadata_domain['last_check'][4:6], metadata_domain['last_check'][6:8])}}</td>
<td><div style="color:{{metadata_domain['status_color']}}; display:inline-block">