chg: [Showpaste] add screenshot + improve onion db

pull/260/head
Terrtia 2018-08-16 17:24:39 +02:00
parent 7652089433
commit ed559d9f4a
GPG Key ID: 1E1B1F50D84613D0
7 changed files with 186 additions and 38 deletions

.gitignore

@@ -32,6 +32,7 @@ var/www/submitted
# Local config
bin/packages/config.cfg
configs/keys
files
# installed files
nltk_data/


@@ -8,6 +8,7 @@ import redis
import datetime
import time
import subprocess
import requests
sys.path.append(os.environ['AIL_BIN'])
from Helper import Process
@@ -17,14 +18,21 @@ from pubsublogger import publisher
def signal_handler(sig, frame):
sys.exit(0)
def crawl_onion(url, domain):
date = datetime.datetime.now().strftime("%Y%m%d")
def crawl_onion(url, domain, date):
if not r_onion.sismember('onion_up:'+date , domain):
if not r_onion.sismember('onion_up:'+date , domain) and not r_onion.sismember('onion_down:'+date , domain):
#if not r_onion.sismember('full_onion_up', domain) and not r_onion.sismember('onion_down:'+date , domain):
super_father = r_serv_metadata.hget('paste_metadata:'+paste, 'super_father')
if super_father is None:
super_father=paste
try:
r = requests.get(splash_url , timeout=0.010)
except Exception:
## FIXME: # TODO: relaunch docker
exit(0)
if r.status_code == 200:
process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', url, domain, paste, super_father],
stdout=subprocess.PIPE)
while process.poll() is None:
@@ -34,14 +42,16 @@ def crawl_onion(url, domain):
if r_serv_metadata.exists('paste_children:'+paste):
msg = 'infoleak:automatic-detection="onion";{}'.format(paste)
p.populate_set_out(msg, 'Tags')
print(process.stdout.read())
r_onion.sadd('onion_up:'+date , domain)
r_onion.sadd('onion_up_link:'+date , url)
else:
r_onion.sadd('onion_down:'+date , domain)
r_onion.sadd('onion_down_link:'+date , url)
print(process.stdout.read())
else:
## FIXME: # TODO: relaunch docker
exit(0)
if __name__ == '__main__':
@@ -102,15 +112,51 @@ if __name__ == '__main__':
domain_url = 'http://{}'.format(domain)
print('------------------START ONIOM CRAWLER------------------')
print('------------------START ONION CRAWLER------------------')
print('url: {}'.format(url))
print('domain: {}'.format(domain))
print('domain_url: {}'.format(domain_url))
crawl_onion(url, domain)
if url != domain_url:
crawl_onion(domain_url, domain)
if not r_onion.sismember('banned_onion', domain):
date = datetime.datetime.now().strftime("%Y%m%d")
crawl_onion(url, domain, date)
if url != domain_url:
crawl_onion(domain_url, domain, date)
# save down onion
if not r_onion.sismember('onion_up:'+date , domain):
r_onion.sadd('onion_down:'+date , domain)
r_onion.sadd('onion_down_link:'+date , url)
r_onion.hincrby('onion_link_down', url, 1)
if not r_onion.exists('onion_metadata:{}'.format(domain)):
r_onion.hset('onion_metadata:{}'.format(domain), 'first_seen', date)
r_onion.hset('onion_metadata:{}'.format(domain), 'last_seen', date)
else:
r_onion.hincrby('onion_link_up', url, 1)
# last check
r_onion.hset('onion_metadata:{}'.format(domain), 'last_check', date)
# check external onions links (full_scrawl)
external_domains = set()
for link in r_onion.smembers('domain_onion_external_links:{}'.format(domain)):
print(link)
external_domain = re.findall(url_regex, link)
print(external_domain)
if len(external_domain) > 0:
external_domain = external_domain[0][4]
else:
continue
print(external_domain)
# # TODO: add i2p
if '.onion' in external_domain and external_domain != domain:
external_domains.add(external_domain)
if len(external_domains) >= 10:
r_onion.sadd('onion_potential_source', domain)
r_onion.delete('domain_onion_external_links:{}'.format(domain))
print(r_onion.smembers('domain_onion_external_links:{}'.format(domain)))
else:
continue
else:
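Taken together, the two hunks above give the crawler a simple per-URL routine: skip domains already tried today, make sure the Splash docker answers, hand the URL to torcrawler/tor_crawler.py in a subprocess, and record the outcome in per-day sets (onion_up:<date>, onion_down:<date> and the matching *_link sets) plus an onion_metadata:<domain> hash carrying first_seen, last_seen and last_check. A minimal sketch of that flow, with placeholder hosts, ports and paste paths standing in for what AIL reads from its config:

import datetime
import subprocess

import redis
import requests

# Assumptions: a local Splash instance and a local ARDB/Redis "Onion" db;
# AIL reads both endpoints from bin/packages/config.cfg.
splash_url = 'http://127.0.0.1:8050'
r_onion = redis.StrictRedis(host='localhost', port=6382, db=0, decode_responses=True)

def splash_is_up(url, timeout=1):
    """Return True if the Splash docker answers within `timeout` seconds."""
    try:
        return requests.get(url, timeout=timeout).status_code == 200
    except requests.RequestException:
        return False

def crawl_onion(url, domain, date, paste, super_father):
    """Crawl one URL unless the domain was already seen today, then record the result."""
    if r_onion.sismember('onion_up:' + date, domain) or r_onion.sismember('onion_down:' + date, domain):
        return
    if not splash_is_up(splash_url):
        raise SystemExit(0)  # same behaviour as the module: stop so the docker can be relaunched
    process = subprocess.Popen(['python', './torcrawler/tor_crawler.py', url, domain, paste, super_father],
                               stdout=subprocess.PIPE)
    # simplification: this sketch trusts the exit code, while the module checks
    # whether the crawl produced child pastes (paste_children:<paste>)
    if process.wait() == 0:
        r_onion.sadd('onion_up:' + date, domain)
        r_onion.sadd('onion_up_link:' + date, url)
    else:
        r_onion.sadd('onion_down:' + date, domain)
        r_onion.sadd('onion_down_link:' + date, url)

# per-domain bookkeeping after the crawl attempt (hypothetical values)
date = datetime.datetime.now().strftime("%Y%m%d")
domain, url = 'example.onion', 'http://example.onion/page'
crawl_onion(url, domain, date, paste='paste_path', super_father='paste_path')
if not r_onion.sismember('onion_up:' + date, domain):
    r_onion.sadd('onion_down:' + date, domain)
    r_onion.sadd('onion_down_link:' + date, url)
    r_onion.hincrby('onion_link_down', url, 1)
    if not r_onion.exists('onion_metadata:{}'.format(domain)):
        r_onion.hset('onion_metadata:{}'.format(domain), 'first_seen', date)
        r_onion.hset('onion_metadata:{}'.format(domain), 'last_seen', date)
else:
    r_onion.hincrby('onion_link_up', url, 1)
r_onion.hset('onion_metadata:{}'.format(domain), 'last_check', date)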


@@ -94,6 +94,7 @@ class Paste(object):
var = self.p_path.split('/')
self.p_date = Date(var[-4], var[-3], var[-2])
self.p_rel_path = os.path.join(var[-4], var[-3], var[-2], self.p_name)
self.p_source = var[-5]
self.supposed_url = 'https://{}/{}'.format(self.p_source.replace('_pro', ''), var[-1].split('.gz')[0])
@@ -291,6 +292,9 @@ class Paste(object):
else:
return '[]'
def get_p_rel_path(self):
return self.p_rel_path
def save_all_attributes_redis(self, key=None):
"""
Saving all the attributes in a "Redis-like" Database (Redis, LevelDB)

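For reference, the new attributes are derived purely from the paste's location on disk: with the usual .../source/YYYY/MM/DD/name layout, p_source is the fifth path component from the end and p_rel_path is the date-plus-name suffix that showpaste later reuses to locate the screenshot. A tiny illustration with a made-up path:

import os

p_path = '/opt/AIL/PASTES/crawled/2018/08/16/example.onion_abcd.gz'   # hypothetical paste path
var = p_path.split('/')
p_name = var[-1]
p_source = var[-5]                                             # 'crawled'
p_rel_path = os.path.join(var[-4], var[-3], var[-2], p_name)   # '2018/08/16/example.onion_abcd.gz'
print(p_source, p_rel_path)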

@@ -10,6 +10,10 @@ import datetime
import base64
import redis
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError
from twisted.internet.error import TimeoutError
from scrapy import Spider
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess, Crawler
@@ -79,6 +83,8 @@ class TorSplashCrawler():
db=self.p.config.getint("ARDB_Onion", "db"),
decode_responses=True)
self.crawler_path = os.path.join(self.p.config.get("Directories", "crawled"), date )
self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"),
self.p.config.get("Directories", "crawled"), date )
@@ -89,7 +95,7 @@
self.start_urls,
self.parse,
endpoint='render.json',
meta={'parent': self.original_paste},
meta={'father': self.original_paste},
args={ 'html': 1,
'wait': 10,
'render_all': 1,
@@ -106,44 +112,47 @@ class TorSplashCrawler():
UUID = self.domains[0]+str(uuid.uuid4())
filename_paste = os.path.join(self.crawled_paste_filemame, UUID)
relative_filename_paste = os.path.join(self.crawler_path, UUID)
filename_screenshot = os.path.join(self.crawled_screenshot, UUID +'.png')
# save new paste on disk
if self.save_crawled_paste(filename_paste, response.data['html']):
self.r_serv_onion.sadd('onion_up:'+self.full_date , self.domains[0])
self.r_serv_onion.sadd('full_onion_up', self.domains[0])
# create onion metadata
if not self.r_serv_onion.exists('onion_metadata:{}'.format(self.domain[0])):
self.r_serv_onion.hset('onion_metadata:{}'.format(self.domain[0]), 'first_seen', self.full_date)
self.r_serv_onion.hset('onion_metadata:{}'.format(self.domain[0]), 'last_seen', self.full_date)
if not self.r_serv_onion.exists('onion_metadata:{}'.format(self.domains[0])):
self.r_serv_onion.hset('onion_metadata:{}'.format(self.domains[0]), 'first_seen', self.full_date)
self.r_serv_onion.hset('onion_metadata:{}'.format(self.domains[0]), 'last_seen', self.full_date)
# add onion screenshot history
self.r_serv_onion.sadd('onion_history:{}'.format(self.domain[0]), self.full_date)
self.r_serv_onion.sadd('onion_history:{}'.format(self.domains[0]), self.full_date)
#create paste metadata
self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father)
self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['parent'])
self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['father'])
self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'domain', self.domains[0])
self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'real_link', response.url)
self.r_serv_metadata.sadd('paste_children:'+response.meta['parent'], filename_paste)
self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], filename_paste)
dirname = os.path.dirname(filename_screenshot)
if not os.path.exists(dirname):
os.makedirs(dirname)
print(sys.getsizeof(response.data['png']))
print(sys.getsizeof(response.data['html']))
print(self.domains[0])
size_screenshot = (len(response.data['png'])*3) /4
print(size_screenshot)
if size_screenshot < 5000000: #bytes
with open(filename_screenshot, 'wb') as f:
f.write(base64.standard_b64decode(response.data['png'].encode()))
# save external links in set
lext = LinkExtractor(deny_domains=self.domains, unique=True)
for link in lext.extract_links(response):
self.r_serv_metadata.sadd('paste_crawler:filename_paste', link)
self.r_serv_onion.sadd('domain_onion_external_links:{}'.format(self.domains[0]), link.url)
self.r_serv_metadata.sadd('paste_onion_external_links:{}'.format(filename_paste), link.url)
#le = LinkExtractor(unique=True)
le = LinkExtractor(allow_domains=self.domains, unique=True)
@@ -154,12 +163,38 @@ class TorSplashCrawler():
link.url,
self.parse,
endpoint='render.json',
meta={'parent': UUID},
meta={'father': relative_filename_paste},
args={ 'html': 1,
'png': 1,
'render_all': 1,
'wait': 10}
#errback=self.errback_catcher
)
'''
def errback_catcher(self, failure):
# catch all errback failures,
self.logger.error(repr(failure))
#if isinstance(failure.value, HttpError):
if failure.check(HttpError):
# you can get the response
response = failure.value.response
print('HttpError')
self.logger.error('HttpError on %s', response.url)
#elif isinstance(failure.value, DNSLookupError):
elif failure.check(DNSLookupError):
# this is the original request
request = failure.request
print(DNSLookupError)
self.logger.error('DNSLookupError on %s', request.url)
#elif isinstance(failure.value, TimeoutError):
elif failure.check(TimeoutError):
request = failure.request
print(TimeoutError)
self.logger.error('TimeoutError on %s', request.url)
'''
def save_crawled_paste(self, filename, content):

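Splash's render.json response carries the screenshot as a base64 string, so its decoded size can be estimated as len(b64) * 3 / 4 before anything touches the disk; the parser above only writes screenshots below roughly 5 MB. A standalone sketch of that check (function name, size limit and paths are illustrative only):

import base64
import os

def save_screenshot(b64_png, filename_screenshot, max_bytes=5000000):
    """Decode a base64 PNG returned by Splash and write it, skipping oversized captures."""
    size_screenshot = (len(b64_png) * 3) / 4        # approximate decoded size of the base64 payload
    if size_screenshot >= max_bytes:
        return False
    dirname = os.path.dirname(filename_screenshot)
    if dirname:
        os.makedirs(dirname, exist_ok=True)
    with open(filename_screenshot, 'wb') as f:
        f.write(base64.standard_b64decode(b64_png.encode()))
    return True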

@@ -96,6 +96,12 @@ r_serv_statistics = redis.StrictRedis(
db=cfg.getint("ARDB_Statistics", "db"),
decode_responses=True)
r_serv_onion = redis.StrictRedis(
host=cfg.get("ARDB_Onion", "host"),
port=cfg.getint("ARDB_Onion", "port"),
db=cfg.getint("ARDB_Onion", "db"),
decode_responses=True)
sys.path.append('../../configs/keys')
# MISP #
@@ -144,4 +150,6 @@ bootstrap_label = ['primary', 'success', 'danger', 'warning', 'info']
UPLOAD_FOLDER = os.path.join(os.environ['AIL_FLASK'], 'submitted')
SCREENSHOT_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot"))
max_dashboard_logs = int(cfg.get("Flask", "max_dashboard_logs"))

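The web application keeps one shared connection per ARDB database: Flask_config builds them from config.cfg at import time and every blueprint simply imports the object it needs, which is exactly what the showsavedpastes diff below does for r_serv_onion. A minimal sketch of that pattern, assuming the standard AIL config file with an [ARDB_Onion] section (paths and environment variables as in the rest of the project):

import configparser
import os

import redis

# assumption: AIL_BIN points at the bin/ directory that contains packages/config.cfg
cfg = configparser.ConfigParser()
cfg.read(os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg'))

r_serv_onion = redis.StrictRedis(
    host=cfg.get("ARDB_Onion", "host"),
    port=cfg.getint("ARDB_Onion", "port"),
    db=cfg.getint("ARDB_Onion", "db"),
    decode_responses=True)

SCREENSHOT_FOLDER = os.path.join(os.environ['AIL_HOME'],
                                 cfg.get("Directories", "crawled_screenshot"))

# a blueprint module then reuses the shared client, e.g.:
#   import Flask_config
#   r_serv_onion = Flask_config.r_serv_onion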

@@ -5,9 +5,10 @@
Flask functions and routes for the trending modules page
'''
import redis
import os
import json
import flask
from flask import Flask, render_template, jsonify, request, Blueprint, make_response, Response
from flask import Flask, render_template, jsonify, request, Blueprint, make_response, Response, send_from_directory
import difflib
import ssdeep
@@ -22,12 +23,14 @@ r_serv_pasteName = Flask_config.r_serv_pasteName
r_serv_metadata = Flask_config.r_serv_metadata
r_serv_tags = Flask_config.r_serv_tags
r_serv_statistics = Flask_config.r_serv_statistics
r_serv_onion = Flask_config.r_serv_onion
max_preview_char = Flask_config.max_preview_char
max_preview_modal = Flask_config.max_preview_modal
DiffMaxLineLength = Flask_config.DiffMaxLineLength
bootstrap_label = Flask_config.bootstrap_label
misp_event_url = Flask_config.misp_event_url
hive_case_url = Flask_config.hive_case_url
SCREENSHOT_FOLDER = Flask_config.SCREENSHOT_FOLDER
showsavedpastes = Blueprint('showsavedpastes', __name__, template_folder='templates')
@@ -130,6 +133,16 @@ def showpaste(content_range):
list_tags.append( (tag, automatic, tag_status_tp, tag_status_fp) )
crawler_metadata = {}
if 'infoleak:submission="crawler"' in l_tags:
crawler_metadata['get_metadata'] = True
crawler_metadata['paste_father'] = r_serv_metadata.hget('paste_metadata:'+requested_path, 'father')
crawler_metadata['real_link'] = r_serv_metadata.hget('paste_metadata:'+requested_path,'real_link')
crawler_metadata['external_links'] =r_serv_metadata.scard('paste_onion_external_links:'+requested_path)
crawler_metadata['screenshot'] = paste.get_p_rel_path()
else:
crawler_metadata['get_metadata'] = False
if Flask_config.pymisp is False:
misp = False
else:
@@ -157,6 +170,7 @@ def showpaste(content_range):
hive_url = hive_case_url.replace('id_here', hive_case)
return render_template("show_saved_paste.html", date=p_date, bootstrap_label=bootstrap_label, active_taxonomies=active_taxonomies, active_galaxies=active_galaxies, list_tags=list_tags, source=p_source, encoding=p_encoding, language=p_language, size=p_size, mime=p_mime, lineinfo=p_lineinfo, content=p_content, initsize=len(p_content), duplicate_list = p_duplicate_list, simil_list = p_simil_list, hashtype_list = p_hashtype_list, date_list=p_date_list,
crawler_metadata=crawler_metadata,
misp=misp, hive=hive, misp_eventid=misp_eventid, misp_url=misp_url, hive_caseid=hive_caseid, hive_url=hive_url)
# ============ ROUTES ============
@@ -202,5 +216,9 @@ def showDiff():
the_html = htmlD.make_file(lines1, lines2)
return the_html
@showsavedpastes.route('/screenshot/<path:filename>')
def screenshot(filename):
return send_from_directory(SCREENSHOT_FOLDER, filename+'.png', as_attachment=True)
# ========= REGISTRATION =========
app.register_blueprint(showsavedpastes)

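The screenshot itself is served by the new route above: the template calls url_for('showsavedpastes.screenshot', filename=...) with the paste's relative path, and the route appends .png and streams the file from SCREENSHOT_FOLDER via Flask's send_from_directory. A self-contained sketch of the same idea (app setup and folder path are placeholders; in AIL both come from Flask_config):

from flask import Flask, send_from_directory

app = Flask(__name__)
SCREENSHOT_FOLDER = '/opt/AIL/CRAWLED_SCREENSHOT'   # placeholder; AIL derives it from crawled_screenshot in config.cfg

@app.route('/screenshot/<path:filename>')
def screenshot(filename):
    # `filename` is the paste's relative path as passed by the template; the stored capture adds a .png suffix
    return send_from_directory(SCREENSHOT_FOLDER, filename + '.png', as_attachment=True)

# a template would then build the image URL with:
#   url_for('screenshot', filename=crawler_metadata['screenshot'])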

@@ -373,6 +373,42 @@
</tbody>
</table>
{% endif %}
{% if crawler_metadata['get_metadata'] %}
<div class="row">
<div class="col-md-7">
<img src="{{ url_for('showsavedpastes.screenshot', filename=crawler_metadata['screenshot']) }}" onError="this.onerror=null;this.src='{{ url_for('static', filename='image/AIL.png') }}';" style="width:100%;" />
</div>
<div class="col-md-5">
<div class="row">
<div class="panel panel-default">
<div class="panel-heading">
<i id="flash-tld" class="glyphicon glyphicon-flash " flash-tld=""></i> Graph
</div>
<table class="table table-hover table-striped">
<tbody>
<tr>
<td>Father</td>
<td>{{ crawler_metadata['paste_father'] }}</td>
</tr>
<tr>
<td>Source link</td>
<td>{{ crawler_metadata['real_link'] }}</td>
</tr>
<tr>
<td>External links</td>
<td>{{ crawler_metadata['external_links'] }}</td>
</tr>
</tbody>
</table>
</div>
</div>
</div>
</div>
{% endif %}
<h3> Content: </h3>
<a href="{{ url_for('showsavedpastes.showsavedrawpaste') }}?paste={{ request.args.get('paste') }}" id='raw_paste' > [Raw content] </a>
<p data-initsize="{{ initsize }}"> <pre id="paste-holder">{{ content }}</pre></p>