mirror of https://github.com/CIRCL/AIL-framework
chg: [Showpaste] add screenshot + improve onion db
parent
7652089433
commit
ed559d9f4a
|
@ -32,6 +32,7 @@ var/www/submitted
|
|||
# Local config
|
||||
bin/packages/config.cfg
|
||||
configs/keys
|
||||
files
|
||||
|
||||
# installed files
|
||||
nltk_data/
|
||||
|
|
|
@ -8,6 +8,7 @@ import redis
|
|||
import datetime
|
||||
import time
|
||||
import subprocess
|
||||
import requests
|
||||
|
||||
sys.path.append(os.environ['AIL_BIN'])
|
||||
from Helper import Process
|
||||
|
@ -17,14 +18,21 @@ from pubsublogger import publisher
|
|||
# Signal handler callback: ignores both arguments and exits the process with status 0.
def signal_handler(sig, frame):
|
||||
sys.exit(0)
|
||||
|
||||
def crawl_onion(url, domain):
|
||||
date = datetime.datetime.now().strftime("%Y%m%d")
|
||||
def crawl_onion(url, domain, date):
|
||||
|
||||
if not r_onion.sismember('onion_up:'+date , domain):
|
||||
if not r_onion.sismember('onion_up:'+date , domain) and not r_onion.sismember('onion_down:'+date , domain):
|
||||
#if not r_onion.sismember('full_onion_up', domain) and not r_onion.sismember('onion_down:'+date , domain):
|
||||
super_father = r_serv_metadata.hget('paste_metadata:'+paste, 'super_father')
|
||||
if super_father is None:
|
||||
super_father=paste
|
||||
|
||||
try:
|
||||
r = requests.get(splash_url , timeout=0.010)
|
||||
except Exception:
|
||||
## FIXME: # TODO: relaunch docker
|
||||
exit(0)
|
||||
|
||||
if r.status_code == 200:
|
||||
process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', url, domain, paste, super_father],
|
||||
stdout=subprocess.PIPE)
|
||||
while process.poll() is None:
|
||||
|
@ -34,14 +42,16 @@ def crawl_onion(url, domain):
|
|||
if r_serv_metadata.exists('paste_children:'+paste):
|
||||
msg = 'infoleak:automatic-detection="onion";{}'.format(paste)
|
||||
p.populate_set_out(msg, 'Tags')
|
||||
|
||||
print(process.stdout.read())
|
||||
|
||||
r_onion.sadd('onion_up:'+date , domain)
|
||||
r_onion.sadd('onion_up_link:'+date , url)
|
||||
else:
|
||||
r_onion.sadd('onion_down:'+date , domain)
|
||||
r_onion.sadd('onion_down_link:'+date , url)
|
||||
print(process.stdout.read())
|
||||
else:
|
||||
## FIXME: # TODO: relaunch docker
|
||||
exit(0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
@ -102,15 +112,51 @@ if __name__ == '__main__':
|
|||
|
||||
domain_url = 'http://{}'.format(domain)
|
||||
|
||||
print('------------------START ONIOM CRAWLER------------------')
|
||||
print('------------------START ONION CRAWLER------------------')
|
||||
print('url: {}'.format(url))
|
||||
print('domain: {}'.format(domain))
|
||||
print('domain_url: {}'.format(domain_url))
|
||||
|
||||
crawl_onion(url, domain)
|
||||
if url != domain_url:
|
||||
crawl_onion(domain_url, domain)
|
||||
if not r_onion.sismember('banned_onion', domain):
|
||||
|
||||
date = datetime.datetime.now().strftime("%Y%m%d")
|
||||
|
||||
crawl_onion(url, domain, date)
|
||||
if url != domain_url:
|
||||
crawl_onion(domain_url, domain, date)
|
||||
|
||||
# save down onion
|
||||
if not r_onion.sismember('onion_up:'+date , domain):
|
||||
r_onion.sadd('onion_down:'+date , domain)
|
||||
r_onion.sadd('onion_down_link:'+date , url)
|
||||
r_onion.hincrby('onion_link_down', url, 1)
|
||||
if not r_onion.exists('onion_metadata:{}'.format(domain)):
|
||||
r_onion.hset('onion_metadata:{}'.format(domain), 'first_seen', date)
|
||||
r_onion.hset('onion_metadata:{}'.format(domain), 'last_seen', date)
|
||||
else:
|
||||
r_onion.hincrby('onion_link_up', url, 1)
|
||||
|
||||
# last check
|
||||
r_onion.hset('onion_metadata:{}'.format(domain), 'last_check', date)
|
||||
|
||||
# check external onion links (full_crawl)
|
||||
external_domains = set()
|
||||
for link in r_onion.smembers('domain_onion_external_links:{}'.format(domain)):
|
||||
print(link)
|
||||
external_domain = re.findall(url_regex, link)
|
||||
print(external_domain)
|
||||
if len(external_domain) > 0:
|
||||
external_domain = external_domain[0][4]
|
||||
else:
|
||||
continue
|
||||
print(external_domain)
|
||||
# # TODO: add i2p
|
||||
if '.onion' in external_domain and external_domain != domain:
|
||||
external_domains.add(external_domain)
|
||||
if len(external_domains) >= 10:
|
||||
r_onion.sadd('onion_potential_source', domain)
|
||||
r_onion.delete('domain_onion_external_links:{}'.format(domain))
|
||||
print(r_onion.smembers('domain_onion_external_links:{}'.format(domain)))
|
||||
else:
|
||||
continue
|
||||
else:
|
||||
|
|
|
@ -94,6 +94,7 @@ class Paste(object):
|
|||
|
||||
var = self.p_path.split('/')
|
||||
self.p_date = Date(var[-4], var[-3], var[-2])
|
||||
self.p_rel_path = os.path.join(var[-4], var[-3], var[-2], self.p_name)
|
||||
self.p_source = var[-5]
|
||||
self.supposed_url = 'https://{}/{}'.format(self.p_source.replace('_pro', ''), var[-1].split('.gz')[0])
|
||||
|
||||
|
@ -291,6 +292,9 @@ class Paste(object):
|
|||
else:
|
||||
return '[]'
|
||||
|
||||
# Accessor: returns self.p_rel_path unchanged (assigned elsewhere in the class via
# os.path.join of three components of p_path and self.p_name).
def get_p_rel_path(self):
|
||||
return self.p_rel_path
|
||||
|
||||
def save_all_attributes_redis(self, key=None):
|
||||
"""
|
||||
Saving all the attributes in a "Redis-like" Database (Redis, LevelDB)
|
||||
|
|
|
@ -10,6 +10,10 @@ import datetime
|
|||
import base64
|
||||
import redis
|
||||
|
||||
from scrapy.spidermiddlewares.httperror import HttpError
|
||||
from twisted.internet.error import DNSLookupError
|
||||
from twisted.internet.error import TimeoutError
|
||||
|
||||
from scrapy import Spider
|
||||
from scrapy.linkextractors import LinkExtractor
|
||||
from scrapy.crawler import CrawlerProcess, Crawler
|
||||
|
@ -79,6 +83,8 @@ class TorSplashCrawler():
|
|||
db=self.p.config.getint("ARDB_Onion", "db"),
|
||||
decode_responses=True)
|
||||
|
||||
self.crawler_path = os.path.join(self.p.config.get("Directories", "crawled"), date )
|
||||
|
||||
self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"),
|
||||
self.p.config.get("Directories", "crawled"), date )
|
||||
|
||||
|
@ -89,7 +95,7 @@ class TorSplashCrawler():
|
|||
self.start_urls,
|
||||
self.parse,
|
||||
endpoint='render.json',
|
||||
meta={'parent': self.original_paste},
|
||||
meta={'father': self.original_paste},
|
||||
args={ 'html': 1,
|
||||
'wait': 10,
|
||||
'render_all': 1,
|
||||
|
@ -106,44 +112,47 @@ class TorSplashCrawler():
|
|||
|
||||
UUID = self.domains[0]+str(uuid.uuid4())
|
||||
filename_paste = os.path.join(self.crawled_paste_filemame, UUID)
|
||||
relative_filename_paste = os.path.join(self.crawler_path, UUID)
|
||||
filename_screenshot = os.path.join(self.crawled_screenshot, UUID +'.png')
|
||||
|
||||
# save new paste on disk
|
||||
if self.save_crawled_paste(filename_paste, response.data['html']):
|
||||
|
||||
self.r_serv_onion.sadd('onion_up:'+self.full_date , self.domains[0])
|
||||
self.r_serv_onion.sadd('full_onion_up', self.domains[0])
|
||||
|
||||
# create onion metadata
|
||||
if not self.r_serv_onion.exists('onion_metadata:{}'.format(self.domain[0])):
|
||||
self.r_serv_onion.hset('onion_metadata:{}'.format(self.domain[0]), 'first_seen', self.full_date)
|
||||
self.r_serv_onion.hset('onion_metadata:{}'.format(self.domain[0]), 'last_seen', self.full_date)
|
||||
if not self.r_serv_onion.exists('onion_metadata:{}'.format(self.domains[0])):
|
||||
self.r_serv_onion.hset('onion_metadata:{}'.format(self.domains[0]), 'first_seen', self.full_date)
|
||||
self.r_serv_onion.hset('onion_metadata:{}'.format(self.domains[0]), 'last_seen', self.full_date)
|
||||
|
||||
# add onion screenshot history
|
||||
self.r_serv_onion.sadd('onion_history:{}'.format(self.domain[0]), self.full_date)
|
||||
self.r_serv_onion.sadd('onion_history:{}'.format(self.domains[0]), self.full_date)
|
||||
|
||||
#create paste metadata
|
||||
self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father)
|
||||
self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['parent'])
|
||||
self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['father'])
|
||||
self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'domain', self.domains[0])
|
||||
self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'real_link', response.url)
|
||||
|
||||
self.r_serv_metadata.sadd('paste_children:'+response.meta['parent'], filename_paste)
|
||||
self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], filename_paste)
|
||||
|
||||
dirname = os.path.dirname(filename_screenshot)
|
||||
if not os.path.exists(dirname):
|
||||
os.makedirs(dirname)
|
||||
|
||||
print(sys.getsizeof(response.data['png']))
|
||||
print(sys.getsizeof(response.data['html']))
|
||||
print(self.domains[0])
|
||||
|
||||
|
||||
size_screenshot = (len(response.data['png'])*3) /4
|
||||
print(size_screenshot)
|
||||
|
||||
if size_screenshot < 5000000: #bytes
|
||||
with open(filename_screenshot, 'wb') as f:
|
||||
f.write(base64.standard_b64decode(response.data['png'].encode()))
|
||||
|
||||
# save external links in set
|
||||
lext = LinkExtractor(deny_domains=self.domains, unique=True)
|
||||
for link in lext.extract_links(response):
|
||||
self.r_serv_metadata.sadd('paste_crawler:filename_paste', link)
|
||||
self.r_serv_onion.sadd('domain_onion_external_links:{}'.format(self.domains[0]), link.url)
|
||||
self.r_serv_metadata.sadd('paste_onion_external_links:{}'.format(filename_paste), link.url)
|
||||
|
||||
#le = LinkExtractor(unique=True)
|
||||
le = LinkExtractor(allow_domains=self.domains, unique=True)
|
||||
|
@ -154,12 +163,38 @@ class TorSplashCrawler():
|
|||
link.url,
|
||||
self.parse,
|
||||
endpoint='render.json',
|
||||
meta={'parent': UUID},
|
||||
meta={'father': relative_filename_paste},
|
||||
args={ 'html': 1,
|
||||
'png': 1,
|
||||
'render_all': 1,
|
||||
'wait': 10}
|
||||
#errback=self.errback_catcher
|
||||
)
|
||||
'''
|
||||
def errback_catcher(self, failure):
|
||||
# catch all errback failures,
|
||||
self.logger.error(repr(failure))
|
||||
|
||||
#if isinstance(failure.value, HttpError):
|
||||
if failure.check(HttpError):
|
||||
# you can get the response
|
||||
response = failure.value.response
|
||||
print('HttpError')
|
||||
self.logger.error('HttpError on %s', response.url)
|
||||
|
||||
#elif isinstance(failure.value, DNSLookupError):
|
||||
elif failure.check(DNSLookupError):
|
||||
# this is the original request
|
||||
request = failure.request
|
||||
print(DNSLookupError)
|
||||
self.logger.error('DNSLookupError on %s', request.url)
|
||||
|
||||
#elif isinstance(failure.value, TimeoutError):
|
||||
elif failure.check(TimeoutError):
|
||||
request = failure.request
|
||||
print(TimeoutError)
|
||||
self.logger.error('TimeoutError on %s', request.url)
|
||||
'''
|
||||
|
||||
def save_crawled_paste(self, filename, content):
|
||||
|
||||
|
|
|
@ -96,6 +96,12 @@ r_serv_statistics = redis.StrictRedis(
|
|||
db=cfg.getint("ARDB_Statistics", "db"),
|
||||
decode_responses=True)
|
||||
|
||||
r_serv_onion = redis.StrictRedis(
|
||||
host=cfg.get("ARDB_Onion", "host"),
|
||||
port=cfg.getint("ARDB_Onion", "port"),
|
||||
db=cfg.getint("ARDB_Onion", "db"),
|
||||
decode_responses=True)
|
||||
|
||||
|
||||
sys.path.append('../../configs/keys')
|
||||
# MISP #
|
||||
|
@ -144,4 +150,6 @@ bootstrap_label = ['primary', 'success', 'danger', 'warning', 'info']
|
|||
|
||||
UPLOAD_FOLDER = os.path.join(os.environ['AIL_FLASK'], 'submitted')
|
||||
|
||||
SCREENSHOT_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot"))
|
||||
|
||||
max_dashboard_logs = int(cfg.get("Flask", "max_dashboard_logs"))
|
||||
|
|
|
@ -5,9 +5,10 @@
|
|||
Flask functions and routes for the trending modules page
|
||||
'''
|
||||
import redis
|
||||
import os
|
||||
import json
|
||||
import flask
|
||||
from flask import Flask, render_template, jsonify, request, Blueprint, make_response, Response
|
||||
from flask import Flask, render_template, jsonify, request, Blueprint, make_response, Response, send_from_directory
|
||||
import difflib
|
||||
import ssdeep
|
||||
|
||||
|
@ -22,12 +23,14 @@ r_serv_pasteName = Flask_config.r_serv_pasteName
|
|||
r_serv_metadata = Flask_config.r_serv_metadata
|
||||
r_serv_tags = Flask_config.r_serv_tags
|
||||
r_serv_statistics = Flask_config.r_serv_statistics
|
||||
r_serv_onion = Flask_config.r_serv_onion
|
||||
max_preview_char = Flask_config.max_preview_char
|
||||
max_preview_modal = Flask_config.max_preview_modal
|
||||
DiffMaxLineLength = Flask_config.DiffMaxLineLength
|
||||
bootstrap_label = Flask_config.bootstrap_label
|
||||
misp_event_url = Flask_config.misp_event_url
|
||||
hive_case_url = Flask_config.hive_case_url
|
||||
SCREENSHOT_FOLDER = Flask_config.SCREENSHOT_FOLDER
|
||||
|
||||
showsavedpastes = Blueprint('showsavedpastes', __name__, template_folder='templates')
|
||||
|
||||
|
@ -130,6 +133,16 @@ def showpaste(content_range):
|
|||
|
||||
list_tags.append( (tag, automatic, tag_status_tp, tag_status_fp) )
|
||||
|
||||
crawler_metadata = {}
|
||||
if 'infoleak:submission="crawler"' in l_tags:
|
||||
crawler_metadata['get_metadata'] = True
|
||||
crawler_metadata['paste_father'] = r_serv_metadata.hget('paste_metadata:'+requested_path, 'father')
|
||||
crawler_metadata['real_link'] = r_serv_metadata.hget('paste_metadata:'+requested_path,'real_link')
|
||||
crawler_metadata['external_links'] =r_serv_metadata.scard('paste_onion_external_links:'+requested_path)
|
||||
crawler_metadata['screenshot'] = paste.get_p_rel_path()
|
||||
else:
|
||||
crawler_metadata['get_metadata'] = False
|
||||
|
||||
if Flask_config.pymisp is False:
|
||||
misp = False
|
||||
else:
|
||||
|
@ -157,6 +170,7 @@ def showpaste(content_range):
|
|||
hive_url = hive_case_url.replace('id_here', hive_case)
|
||||
|
||||
return render_template("show_saved_paste.html", date=p_date, bootstrap_label=bootstrap_label, active_taxonomies=active_taxonomies, active_galaxies=active_galaxies, list_tags=list_tags, source=p_source, encoding=p_encoding, language=p_language, size=p_size, mime=p_mime, lineinfo=p_lineinfo, content=p_content, initsize=len(p_content), duplicate_list = p_duplicate_list, simil_list = p_simil_list, hashtype_list = p_hashtype_list, date_list=p_date_list,
|
||||
crawler_metadata=crawler_metadata,
|
||||
misp=misp, hive=hive, misp_eventid=misp_eventid, misp_url=misp_url, hive_caseid=hive_caseid, hive_url=hive_url)
|
||||
|
||||
# ============ ROUTES ============
|
||||
|
@ -202,5 +216,9 @@ def showDiff():
|
|||
the_html = htmlD.make_file(lines1, lines2)
|
||||
return the_html
|
||||
|
||||
# Route /screenshot/<path:filename>: appends '.png' to the requested path and serves
# that file from SCREENSHOT_FOLDER via send_from_directory, flagged as an attachment.
@showsavedpastes.route('/screenshot/<path:filename>')
|
||||
def screenshot(filename):
|
||||
return send_from_directory(SCREENSHOT_FOLDER, filename+'.png', as_attachment=True)
|
||||
|
||||
# ========= REGISTRATION =========
|
||||
app.register_blueprint(showsavedpastes)
|
||||
|
|
|
@ -373,6 +373,42 @@
|
|||
</tbody>
|
||||
</table>
|
||||
{% endif %}
|
||||
|
||||
{% if crawler_metadata['get_metadata'] %}
|
||||
<div class="row">
|
||||
<div class="col-md-7">
|
||||
<img src="{{ url_for('showsavedpastes.screenshot', filename=crawler_metadata['screenshot']) }}" onError="this.onerror=null;this.src='{{ url_for('static', filename='image/AIL.png') }}';" style="width:100%;" />
|
||||
</div>
|
||||
|
||||
<div class="col-md-5">
|
||||
<div class="row">
|
||||
<div class="panel panel-default">
|
||||
<div class="panel-heading">
|
||||
<i id="flash-tld" class="glyphicon glyphicon-flash " flash-tld=""></i> Graph
|
||||
</div>
|
||||
|
||||
<table class="table table-hover table-striped">
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>Father</td>
|
||||
<td>{{ crawler_metadata['paste_father'] }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Source link</td>
|
||||
<td>{{ crawler_metadata['real_link'] }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>External links</td>
|
||||
<td>{{ crawler_metadata['external_links'] }}</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
<h3> Content: </h3>
|
||||
<a href="{{ url_for('showsavedpastes.showsavedrawpaste') }}?paste={{ request.args.get('paste') }}" id='raw_paste' > [Raw content] </a>
|
||||
<p data-initsize="{{ initsize }}"> <pre id="paste-holder">{{ content }}</pre></p>
|
||||
|
|
Loading…
Reference in New Issue