chg: [Crawler] handle port: crawling + history

pull/342/head
Terrtia 2019-03-22 16:48:07 +01:00
parent f4cdddbc7f
commit f64c385343
No known key found for this signature in database
GPG Key ID: 1E1B1F50D84613D0
7 changed files with 98 additions and 102 deletions

View File

@@ -57,6 +57,15 @@ Redis and ARDB overview
 | ------ | ------ |
 | *tag* | *paste* |
+## DB7 - Metadata:
+#### Crawled Items:
+##### Hset:
+| Key | Field | Value |
+| ------ | ------ | ------ |
+| paste_metadata:**item path** | super_father | **first url crawled** |
+| | father | **item father** |
+| | domain | **crawled domain**:**domain port** |
 ## DB9 - Crawler:
@@ -65,19 +74,20 @@ Redis and ARDB overview
 | ------ | ------ | ------ |
 | **service type**:**domain** | first_seen | **date** |
 | | last_check | **date** |
+| | ports | **port**;**port**;**port** ... |
 | | paste_parent | **parent last crawling (can be auto or manual)** |
 ##### Zset:
 | Key | Field | Value |
 | ------ | ------ | ------ |
-| crawler_history_**service type**:**domain** | **item root (first crawled item)** | **epoch (seconds)** |
+| crawler\_history\_**service type**:**domain** | **item root (first crawled item)** | **epoch (seconds)** |
-##### Regular key:
+##### Key:
 | Key | Value |
 | ------ | ------ |
-| crawler_history_**service type**:**domain** | **json config** |
+| crawler\_config:**crawler mode**:**service type**:**domain** | **json config** |
-##### exemple json config:
+###### exemple json config:
 ```json
 {
     "closespider_pagecount": 1,

View File

@@ -167,8 +167,9 @@ def on_error_send_message_back_in_queue(type_service, domain, message):
 redis_crawler.sadd('{}_domain_crawler_queue'.format(type_service), domain)
 redis_crawler.sadd('{}_crawler_priority_queue'.format(type_service), message)

-def crawl_onion(url, domain, message, crawler_config):
+def crawl_onion(url, domain, port, type_service, message, crawler_config):
 crawler_config['url'] = url
+crawler_config['port'] = port
 print('Launching Crawler: {}'.format(url))

 r_cache.hset('metadata_crawler:{}'.format(splash_port), 'crawling_domain', domain)
@@ -261,7 +262,6 @@ if __name__ == '__main__':
 rotation_mode = ['onion', 'regular']
 default_proto_map = {'http': 80, 'https': 443}
 ######################################################## add ftp ???
-################################################################### # TODO: port

 publisher.port = 6380
 publisher.channel = "Script"
@@ -331,6 +331,7 @@ if __name__ == '__main__':
 print('url: {}'.format(url_data['url']))
 print('domain: {}'.format(url_data['domain']))
 print('domain_url: {}'.format(url_data['domain_url']))
+print()

 # Check blacklist
 if not redis_crawler.sismember('blacklist_{}'.format(to_crawl['type_service']), url_data['domain']):
@@ -357,40 +358,33 @@ if __name__ == '__main__':
 ######################################################crawler strategy
 # CRAWL domain
-crawl_onion(url_data['url'], url_data['domain'], to_crawl['original_message'], crawler_config)
+crawl_onion(url_data['url'], url_data['domain'], url_data['port'], to_crawl['type_service'], to_crawl['original_message'], crawler_config)
 # Default Crawler
 else:
 # CRAWL domain
-crawl_onion(url_data['domain_url'], url_data['domain'], to_crawl['original_message'], crawler_config)
+crawl_onion(url_data['domain_url'], url_data['domain'], url_data['port'], to_crawl['type_service'], to_crawl['original_message'], crawler_config)
 #if url != domain_url and not is_domain_up_day(url_data['domain'], to_crawl['type_service'], date['date_day']):
 # crawl_onion(url_data['url'], url_data['domain'], to_crawl['original_message'])
-################################################### handle port
-# CRAWL with port
-#if port is not None:
-# crawl_onion('{}:{}'.format(domain_url, port), domain, message)
 ####

 # Save last_status day (DOWN)
 if not is_domain_up_day(url_data['domain'], to_crawl['type_service'], date['date_day']):
 redis_crawler.sadd('{}_down:{}'.format(to_crawl['type_service'], date['date_day']), url_data['domain'])

 # if domain was UP at least one time
-if redis_crawler.exists('crawler_history_{}:{}'.format(to_crawl['type_service'], url_data['domain'])):
+if redis_crawler.exists('crawler_history_{}:{}:{}'.format(to_crawl['type_service'], url_data['domain'], url_data['port'])):
 # add crawler history (if domain is down)
-if not redis_crawler.zrangebyscore('crawler_history_{}:{}'.format(to_crawl['type_service'], url_data['domain']), date['epoch'], date['epoch']):
+if not redis_crawler.zrangebyscore('crawler_history_{}:{}:{}'.format(to_crawl['type_service'], url_data['domain'], url_data['port']), date['epoch'], date['epoch']):
 # Domain is down
-redis_crawler.zadd('crawler_history_{}:{}'.format(to_crawl['type_service'], url_data['domain']), int(date['epoch']), int(date['epoch']))
+redis_crawler.zadd('crawler_history_{}:{}:{}'.format(to_crawl['type_service'], url_data['domain'], url_data['port']), int(date['epoch']), int(date['epoch']))

 ############################
 # extract page content
 ############################

 # update list, last crawled domains
-redis_crawler.lpush('last_{}'.format(to_crawl['type_service']), '{};{}'.format(url_data['domain'], date['epoch']))
+redis_crawler.lpush('last_{}'.format(to_crawl['type_service']), '{}:{};{}'.format(url_data['domain'], url_data['port'], date['epoch']))
 redis_crawler.ltrim('last_{}'.format(to_crawl['type_service']), 0, 15)

 #update crawler status
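To make the bookkeeping above concrete, a small standalone sketch (not the module itself) of how a DOWN crawl is now recorded per port, with hypothetical connection settings and values; note the module uses the older zadd(key, score, member) call, while this sketch uses the redis-py >= 3 mapping form:

```python
import redis

redis_crawler = redis.StrictRedis(host='localhost', port=6382, db=9, decode_responses=True)

type_service, domain, port, epoch = 'onion', 'example.onion', 80, 1553266087  # hypothetical

# history is now keyed per service type, domain AND port
history_key = 'crawler_history_{}:{}:{}'.format(type_service, domain, port)
if not redis_crawler.zrangebyscore(history_key, epoch, epoch):
    # domain is down for this crawl: the epoch itself is stored as the member
    redis_crawler.zadd(history_key, {str(epoch): epoch})

# "last crawled" entries now embed the port: "<domain>:<port>;<epoch>"
redis_crawler.lpush('last_{}'.format(type_service), '{}:{};{}'.format(domain, port, epoch))
redis_crawler.ltrim('last_{}'.format(type_service), 0, 15)
```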

View File

@@ -37,7 +37,7 @@ class HiddenServices(object):
 """
-def __init__(self, domain, type):
+def __init__(self, domain, type, port=80):

 configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
 if not os.path.exists(configfile):
@@ -61,6 +61,7 @@ class HiddenServices(object):
 self.domain = domain
 self.type = type
+self.port = port
 self.tags = {}

 if type == 'onion' or type == 'regular':
@@ -110,7 +111,7 @@ class HiddenServices(object):
 self.tags[tag] = self.tags.get(tag, 0) + 1

 def get_first_crawled(self):
-res = self.r_serv_onion.zrange('crawler_history_{}:{}'.format(self.type, self.domain), 0, 0, withscores=True)
+res = self.r_serv_onion.zrange('crawler_history_{}:{}:{}'.format(self.type, self.domain, self.port), 0, 0, withscores=True)
 if res:
 res = res[0]
 return {'root_item':res[0], 'epoch':res[1]}
@@ -118,26 +119,31 @@ class HiddenServices(object):
 return {}

 def get_last_crawled(self):
-res = self.r_serv_onion.zrevrange('crawler_history_{}:{}'.format(self.type, self.domain), 0, 0, withscores=True)
+res = self.r_serv_onion.zrevrange('crawler_history_{}:{}:{}'.format(self.type, self.domain, self.port), 0, 0, withscores=True)
 if res:
+res = res[0]
 return {'root_item':res[0], 'epoch':res[1]}
 else:
 return {}

 #todo use the right paste
-def get_last_crawled_pastes(self, epoch=None):
-if epoch is None:
-list_root = self.r_serv_onion.zrevrange('crawler_history_{}:{}'.format(self.type, self.domain), 0, 0)
-else:
-list_root = self.r_serv_onion.zrevrangebyscore('crawler_history_{}:{}'.format(self.type, self.domain), int(epoch), int(epoch))
-if list_root:
-return self.get_all_pastes_domain(list_root[0])
-else:
-if epoch:
-return self.get_last_crawled_pastes()
-else:
-return list_root
+def get_domain_crawled_core_item(self, epoch=None):
+core_item = {}
+if epoch:
+list_root = self.r_serv_onion.zrevrangebyscore('crawler_history_{}:{}'.format(self.type, self.domain, self.port), int(epoch), int(epoch))
+if list_root:
+core_item['root_item'] = list_root[0]
+core_item['epoch'] = epoch
+return core_item
+# no history found for this epoch
+if not core_item:
+return self.get_last_crawled()
+
+#todo use the right paste
+def get_last_crawled_pastes(self, item_root=None):
+if item_root is None:
+item_root = self.get_domain_crawled_core_item(self)
+return self.get_all_pastes_domain(item_root)

 def get_all_pastes_domain(self, root_item):
 if root_item is None:
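A short usage sketch of the reworked lookup, with a hypothetical domain, port and epoch (the import path is an assumption; note that the zrevrangebyscore call above formats only two of its three arguments, so as written the key it queries omits the port):

```python
from HiddenServices import HiddenServices  # import path is an assumption

h = HiddenServices('example.onion', 'onion', port=8080)  # hypothetical domain/port

# resolve the root item for a given epoch, falling back to the last crawl
core_item = h.get_domain_crawled_core_item(epoch=1553266087)
if core_item:
    l_pastes = h.get_last_crawled_pastes(item_root=core_item.get('root_item'))
```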

View File

@@ -46,19 +46,20 @@ class TorSplashCrawler():
 'DEPTH_LIMIT': crawler_options['depth_limit']
 })

-def crawl(self, type, crawler_options, date, url, domain, original_item):
-self.process.crawl(self.crawler, type=type, crawler_options=crawler_options, date=date, url=url, domain=domain,original_item=original_item)
+def crawl(self, type, crawler_options, date, url, domain, port, original_item):
+self.process.crawl(self.crawler, type=type, crawler_options=crawler_options, date=date, url=url, domain=domain, port=port, original_item=original_item)
 self.process.start()

 class TorSplashSpider(Spider):
 name = 'TorSplashSpider'

-def __init__(self, type, crawler_options, date, url, domain, original_item, *args, **kwargs):
+def __init__(self, type, crawler_options, date, url, domain, port, original_item, *args, **kwargs):
 self.type = type
 self.original_item = original_item
 self.root_key = None
 self.start_urls = url
 self.domains = [domain]
+self.port = str(port)
 date_str = '{}/{}/{}'.format(date['date_day'][0:4], date['date_day'][4:6], date['date_day'][6:8])
 self.full_date = date['date_day']
 self.date_month = date['date_month']
@@ -153,13 +154,22 @@ class TorSplashCrawler():
 if self.root_key is None:
 self.root_key = relative_filename_paste

 # Create/Update crawler history
-self.r_serv_onion.zadd('crawler_history_{}:{}'.format(self.type, self.domains[0]), self.date_epoch, self.root_key)
+self.r_serv_onion.zadd('crawler_history_{}:{}:{}'.format(self.type, self.domains[0], self.port), self.date_epoch, self.root_key)

+# Update domain port number
+all_domain_ports = self.r_serv_onion.hget('{}_metadata:{}'.format(self.type, self.domains[0]), 'ports')
+if all_domain_ports:
+all_domain_ports = all_domain_ports.split(';')
+else:
+all_domain_ports = []
+if self.port not in all_domain_ports:
+all_domain_ports.append(self.port)
+self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'ports', ';'.join(all_domain_ports))

 #create paste metadata
-self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.root_key)
-self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['father'])
-self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'domain', self.domains[0])
-self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'real_link', response.url)
+self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'super_father', self.root_key)
+self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'father', response.meta['father'])
+self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'domain', '{}:{}'.format(self.domains[0], self.port))
+self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'real_link', response.url)

 self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], filename_paste)
@@ -206,6 +216,10 @@ class TorSplashCrawler():
 self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
 time.sleep(10)
+if response:
+response_root_key = response.meta['root_key']
+else:
+response_root_key = None
 yield SplashRequest(
 url,
 self.parse,
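The ports bookkeeping added in the parse callback keeps a ';'-separated list on the domain metadata Hset; the same idea as a standalone sketch, with a hypothetical helper name and connection settings:

```python
import redis

r_serv_onion = redis.StrictRedis(host='localhost', port=6382, db=9, decode_responses=True)

def add_domain_port(type_service, domain, port):
    # 'ports' holds e.g. "80;8080"; append the port only if it is not listed yet
    key = '{}_metadata:{}'.format(type_service, domain)
    all_domain_ports = r_serv_onion.hget(key, 'ports')
    all_domain_ports = all_domain_ports.split(';') if all_domain_ports else []
    if str(port) not in all_domain_ports:
        all_domain_ports.append(str(port))
        r_serv_onion.hset(key, 'ports', ';'.join(all_domain_ports))

add_domain_port('onion', 'example.onion', 8080)  # hypothetical values
```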

View File

@@ -39,6 +39,7 @@ if __name__ == '__main__':
 service_type = crawler_json['service_type']
 url = crawler_json['url']
 domain = crawler_json['domain']
+port = crawler_json['port']
 original_item = crawler_json['item']
 crawler_options = crawler_json['crawler_options']
 date = crawler_json['date']
@@ -46,4 +47,4 @@ if __name__ == '__main__':
 redis_cache.delete('crawler_request:{}'.format(uuid))

 crawler = TorSplashCrawler(splash_url, crawler_options)
-crawler.crawl(service_type, crawler_options, date, url, domain, original_item)
+crawler.crawl(service_type, crawler_options, date, url, domain, port, original_item)
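The crawler request consumed by this script must now carry the port as well; a hedged example of such a payload (field names taken from the reads above, values hypothetical):

```python
import json

crawler_request = {
    "service_type": "onion",
    "url": "http://example.onion:8080",
    "domain": "example.onion",
    "port": 8080,
    "item": "crawled/2019/03/22/example.onion/parent-uuid",   # parent item, hypothetical
    "crawler_options": {"depth_limit": 1, "closespider_pagecount": 1},
    "date": {"date_day": "20190322", "date_month": "201903", "epoch": 1553266087},
}
# stored by the queueing side under crawler_request:<uuid>, deleted above once read
print(json.dumps(crawler_request, indent=2))
```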

View File

@@ -122,6 +122,13 @@ def get_last_crawled_domains_metadata(list_domains_crawled, date, type=None):
 list_crawled_metadata = []
 for domain_epoch in list_domains_crawled:
 domain, epoch = domain_epoch.rsplit(';', 1)
+domain = domain.split(':')
+if len(domain) == 1:
+port = 80
+domain = domain[0]
+else:
+port = domain[1]
+domain = domain[0]
 metadata_domain = {}
 # get Domain type
 if type is None:
@@ -133,6 +140,7 @@ def get_last_crawled_domains_metadata(list_domains_crawled, date, type=None):
 metadata_domain['domain_name'] = '{}[...].{}'.format(domain_name[:40], tld_domain)
 else:
 metadata_domain['domain_name'] = domain
+metadata_domain['port'] = port
 metadata_domain['epoch'] = epoch
 metadata_domain['last_check'] = r_serv_onion.hget('{}_metadata:{}'.format(type, domain), 'last_check')
 if metadata_domain['last_check'] is None:
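Entries of the last_<type> list now look like <domain>:<port>;<epoch>, with the port missing from entries written before this change; the parsing above, condensed into a hypothetical helper:

```python
def split_domain_port_epoch(domain_epoch, default_port=80):
    # "<domain>:<port>;<epoch>" or legacy "<domain>;<epoch>"
    domain, epoch = domain_epoch.rsplit(';', 1)
    domain = domain.split(':')
    if len(domain) == 1:
        return domain[0], default_port, epoch
    return domain[0], domain[1], epoch

print(split_domain_port_epoch('example.onion:8080;1553266087'))  # ('example.onion', '8080', '1553266087')
print(split_domain_port_epoch('example.onion;1553266087'))       # ('example.onion', 80, '1553266087')
```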
@@ -384,56 +392,7 @@ def create_spider_splash():
 return redirect(url_for('hiddenServices.manual'))

-@hiddenServices.route("/hiddenServices/", methods=['GET'])
-def hiddenServices_page():
-last_onions = r_serv_onion.lrange('last_onion', 0 ,-1)
-list_onion = []
-
-now = datetime.datetime.now()
-date = now.strftime("%Y%m%d")
-statDomains = {}
-statDomains['domains_up'] = r_serv_onion.scard('onion_up:{}'.format(date))
-statDomains['domains_down'] = r_serv_onion.scard('onion_down:{}'.format(date))
-statDomains['total'] = statDomains['domains_up'] + statDomains['domains_down']
-statDomains['domains_queue'] = r_serv_onion.scard('onion_domain_crawler_queue')
-
-for onion in last_onions:
-metadata_onion = {}
-metadata_onion['domain'] = onion
-metadata_onion['last_check'] = r_serv_onion.hget('onion_metadata:{}'.format(onion), 'last_check')
-if metadata_onion['last_check'] is None:
-metadata_onion['last_check'] = '********'
-metadata_onion['first_seen'] = r_serv_onion.hget('onion_metadata:{}'.format(onion), 'first_seen')
-if metadata_onion['first_seen'] is None:
-metadata_onion['first_seen'] = '********'
-if get_onion_status(onion, metadata_onion['last_check']):
-metadata_onion['status_text'] = 'UP'
-metadata_onion['status_color'] = 'Green'
-metadata_onion['status_icon'] = 'fa-check-circle'
-else:
-metadata_onion['status_text'] = 'DOWN'
-metadata_onion['status_color'] = 'Red'
-metadata_onion['status_icon'] = 'fa-times-circle'
-list_onion.append(metadata_onion)
-
-crawler_metadata=[]
-all_onion_crawler = r_cache.smembers('all_crawler:onion')
-for crawler in all_onion_crawler:
-crawling_domain = r_cache.hget('metadata_crawler:{}'.format(crawler), 'crawling_domain')
-started_time = r_cache.hget('metadata_crawler:{}'.format(crawler), 'started_time')
-status_info = r_cache.hget('metadata_crawler:{}'.format(crawler), 'status')
-crawler_info = '{} - {}'.format(crawler, started_time)
-if status_info=='Waiting' or status_info=='Crawling':
-status=True
-else:
-status=False
-crawler_metadata.append({'crawler_info': crawler_info, 'crawling_domain': crawling_domain, 'status_info': status_info, 'status': status})
-
-date_string = '{}-{}-{}'.format(date[0:4], date[4:6], date[6:8])
-
-return render_template("hiddenServices.html", last_onions=list_onion, statDomains=statDomains,
-crawler_metadata=crawler_metadata, date_from=date_string, date_to=date_string)
+# # TODO: refractor

 @hiddenServices.route("/hiddenServices/last_crawled_domains_with_stats_json", methods=['GET'])
 def last_crawled_domains_with_stats_json():
 last_onions = r_serv_onion.lrange('last_onion', 0 ,-1)
@@ -571,10 +530,19 @@ def show_domains_by_daterange():
 date_from=date_from, date_to=date_to, domains_up=domains_up, domains_down=domains_down,
 domains_tags=domains_tags, bootstrap_label=bootstrap_label)

-@hiddenServices.route("/hiddenServices/show_domain", methods=['GET'])
+@hiddenServices.route("/crawlers/show_domain", methods=['GET'])
 def show_domain():
 domain = request.args.get('domain')
 epoch = request.args.get('epoch')
+try:
+epoch = int(epoch)
+except:
+epoch = None
+port = request.args.get('port')
+try:
+port = int(port)
+except:
+port = 80
 type = get_type_domain(domain)
 if domain is None or not r_serv_onion.exists('{}_metadata:{}'.format(type, domain)):
 return '404'
@@ -590,16 +558,16 @@ def show_domain():
 first_seen = '{}/{}/{}'.format(first_seen[0:4], first_seen[4:6], first_seen[6:8])
 origin_paste = r_serv_onion.hget('{}_metadata:{}'.format(type, domain), 'paste_parent')

-h = HiddenServices(domain, type)
-last_crawled_time = h.get_last_crawled()
-if 'epoch' in last_crawled_time:
-last_check = '{} - {}'.format(last_check, time.strftime('%H:%M.%S', time.gmtime(last_crawled_time['epoch'])))
-l_pastes = h.get_last_crawled_pastes(epoch=epoch)
+h = HiddenServices(domain, type, port=port)
+item_core = h.get_domain_crawled_core_item(epoch=epoch)
+epoch = item_core['epoch']
+l_pastes = h.get_last_crawled_pastes(item_root=item_core['root_item'])
 dict_links = h.get_all_links(l_pastes)
 if l_pastes:
 status = True
 else:
 status = False
+last_check = '{} - {}'.format(last_check, time.strftime('%H:%M.%S', time.gmtime(epoch)))
 screenshot = h.get_domain_random_screenshot(l_pastes)
 if screenshot:
 screenshot = screenshot[0]

View File

@@ -61,8 +61,11 @@
 </thead>
 <tbody id="tbody_last_crawled">
 {% for metadata_domain in last_domains %}
-<tr data-toggle="popover" data-trigger="hover" title="<span class='badge badge-dark'>{{metadata_domain['domain']}}</span>" data-content="{{metadata_domain['epoch']}}">
-<td><a target="_blank" href="{{ url_for('hiddenServices.show_domain') }}?domain={{ metadata_domain['domain'] }}&epoch={{metadata_domain['epoch']}}">{{ metadata_domain['domain_name'] }}</a></td>
+<tr data-toggle="popover" data-trigger="hover"
+title="<span class='badge badge-dark'>{{metadata_domain['domain']}}</span>"
+data-content="port: <span class='badge badge-secondary'>{{metadata_domain['port']}}</span><br>
+epoch: {{metadata_domain['epoch']}}">
+<td><a target="_blank" href="{{ url_for('hiddenServices.show_domain') }}?domain={{ metadata_domain['domain'] }}&port={{metadata_domain['port']}}&epoch={{metadata_domain['epoch']}}">{{ metadata_domain['domain_name'] }}</a></td>
 <td>{{'{}/{}/{}'.format(metadata_domain['first_seen'][0:4], metadata_domain['first_seen'][4:6], metadata_domain['first_seen'][6:8])}}</td>
 <td>{{'{}/{}/{}'.format(metadata_domain['last_check'][0:4], metadata_domain['last_check'][4:6], metadata_domain['last_check'][6:8])}}</td>
 <td><div style="color:{{metadata_domain['status_color']}}; display:inline-block">