diff --git a/bin/lib/crawler_splash.py b/bin/lib/crawler_splash.py
deleted file mode 100755
index 1ff28524..00000000
--- a/bin/lib/crawler_splash.py
+++ /dev/null
@@ -1,205 +0,0 @@
-#!/usr/bin/python3
-
-"""
-API Helper
-===================
-
-
-"""
-import base64
-import gzip
-import json
-import os
-import re
-import redis
-import sys
-import uuid
-
-from datetime import datetime, timedelta
-from urllib.parse import urlparse
-
-sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
-import ConfigLoader
-
-
-config_loader = ConfigLoader.ConfigLoader()
-r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
-r_serv_onion = config_loader.get_redis_conn("ARDB_Onion")
-config_loader = None
-
-# # # # # # # #
-#             #
-#   COOKIES   #
-#             #
-# # # # # # # #
-
-# # # #
-# Cookies Fields:
-#   - name
-#   - value
-#   - path (optional)
-#   - domain (optional)
-#   - secure (optional)
-#   - httpOnly (optional)
-# # # #
-def create_cookie_dict(browser_cookie=[], cookie_name=None, cookie_value=None, domain=None, crawler_type='regular'):
-    # UI created
-    if cookie_name and cookie_value and domain:
-        dict_cookie = create_cookie_dict_from_input(cookie_name, cookie_value, domain)
-    # Cookies imported from the browser
-    else:
-        dict_cookie = create_cookie_dict_from_browser(browser_cookie)
-
-    # tor browser: disable secure cookie
-    if crawler_type=='onion':
-        dict_cookie['secure'] = False
-
-    dict_cookie['expires'] = (datetime.now() + timedelta(days=10)).strftime('%Y-%m-%dT%H:%M:%S') + 'Z'
-    return dict_cookie
-
-def create_cookie_dict_from_input(cookie_name, cookie_value, cookie_domain):
-    # WebKit use domain for cookie validation
-    return {'name': cookie_name, 'value': cookie_value, 'domain': '.{}'.format(cookie_domain)}
-
-# # TODO: handle prefix cookies
-# # TODO: fill empty fields
-def create_cookie_dict_from_browser(browser_cookie):
-    url = urlparse(browser_cookie['Host raw'])
-    domain = url.netloc.split(':', 1)[0]
-    dict_cookie = {'path': browser_cookie['Path raw'],
-                   'name': browser_cookie['Name raw'],
-                   'httpOnly': browser_cookie['HTTP only raw'] == 'true',
-                   'secure': browser_cookie['Send for'] == 'Encrypted connections only',
-                   'domain': domain,
-                   'value': browser_cookie['Content raw']
-                   }
-    return dict_cookie
-
-def load_cookies(cookies_uuid, domain=None, crawler_type='regular'):
-    cookies_json, l_cookies = get_cookies(cookies_uuid)
-    all_cookies = []
-    for cookie_dict in cookies_json:
-        all_cookies.append(create_cookie_dict(browser_cookie=cookie_dict, crawler_type=crawler_type))
-    for cookie_name, cookie_value in l_cookies:
-        all_cookies.append(create_cookie_dict( cookie_name=cookie_name, cookie_value=cookie_value, domain=domain, crawler_type=crawler_type))
-    return all_cookies
-
-def get_all_cookies():
-    r_serv_onion.smembers('cookies:all')
-
-def get_all_global_cookies():
-    r_serv_onion.smembers('cookies:global')
-
-def get_user_cookies(user_id):
-    r_serv_onion.smembers('cookies:user:{}'.format(user_id))
-
-def exist_cookies_uuid(cookies_uuid):
-    return r_serv_onion.exists('cookie_metadata:{}'.format(cookies_uuid))
-
-def get_manual_cookies_keys(cookies_uuid):
-    return r_serv_onion.hgetall('cookies:manual_cookies:{}'.format(cookies_uuid))
-
-def get_manual_cookie_val(cookies_uuid, cookie_name):
-    return r_serv_onion.hget('cookies:manual_cookies:{}'.format(cookies_uuid), cookie_name)
-
-def get_cookies(cookies_uuid):
-    cookies_json = r_serv_onion.get('cookies:json_cookies:{}'.format(cookies_uuid))
-    if cookies_json:
-        cookies_json = json.loads(cookies_json)
-    else:
-        cookies_json = []
-    l_cookies = [ ( cookie_name, get_manual_cookie_val(cookies_uuid, cookie_name)) for cookie_name in get_manual_cookies_keys(cookies_uuid) ]
-    return (cookies_json, l_cookies)
-
-# # TODO: handle errors + add api handler
-def save_cookies(user_id, json_cookies=None, l_cookies=[], cookies_uuid=None, level=1, description=None):
-    if cookies_uuid is None or not exist_cookies_uuid(cookies_uuid):
-        cookies_uuid = str(uuid.uuid4())
-
-    if json_cookies:
-        json_cookies = json.loads(json_cookies) # # TODO: catch Exception
-        r_serv_onion.set('cookies:json_cookies:{}'.format(cookies_uuid), json.dumps(json_cookies))
-
-    for cookie_dict in l_cookies:
-        r_serv_onion.hset('cookies:manual_cookies:{}'.format(cookies_uuid), cookie_dict['name'], cookie_dict['value'])
-
-    # cookies level # # TODO: edit level set on edit
-    r_serv_onion.sadd('cookies:all', cookies_uuid)
-    if level==0:
-        r_serv_onion.sadd('cookies:user:{}'.format(user_id), cookies_uuid)
-    else:
-        r_serv_onion.sadd('cookies:global', cookies_uuid)
-
-    # metadata
-    r_serv_onion.hset('cookie_metadata:{}'.format(id), 'user_id', user_id)
-    r_serv_onion.hset('cookie_metadata:{}'.format(id), 'level', level)
-    r_serv_onion.hset('cookie_metadata:{}'.format(id), 'description', description)
-    r_serv_onion.hset('cookie_metadata:{}'.format(id), 'date', datetime.date.today().strftime("%Y%m%d"))
-    return cookies_uuid
-
-#### ####
-
-def is_redirection(domain, last_url):
-    url = urlparse(last_url)
-    last_domain = url.netloc
-    last_domain = last_domain.split('.')
-    last_domain = '{}.{}'.format(last_domain[-2], last_domain[-1])
-    return domain != last_domain
-
-# domain up
-def create_domain_metadata(domain_type, domain, current_port, date, date_month):
-    # Add to global set
-    r_serv_onion.sadd('{}_up:{}'.format(domain_type, date), domain)
-    r_serv_onion.sadd('full_{}_up'.format(domain_type), domain)
-    r_serv_onion.sadd('month_{}_up:{}'.format(domain_type, date_month), domain)
-
-    # create onion metadata
-    if not r_serv_onion.exists('{}_metadata:{}'.format(domain_type, domain)):
-        r_serv_onion.hset('{}_metadata:{}'.format(domain_type, domain), 'first_seen', date)
-    r_serv_onion.hset('{}_metadata:{}'.format(domain_type, domain), 'last_check', date)
-
-    # Update domain port number
-    all_domain_ports = r_serv_onion.hget('{}_metadata:{}'.format(domain_type, domain), 'ports')
-    if all_domain_ports:
-        all_domain_ports = all_domain_ports.split(';')
-    else:
-        all_domain_ports = []
-    if current_port not in all_domain_ports:
-        all_domain_ports.append(current_port)
-    r_serv_onion.hset('{}_metadata:{}'.format(domain_type, domain), 'ports', ';'.join(all_domain_ports))
-
-# add root_item to history
-def add_domain_root_item(root_item, domain_type, domain, epoch_date, port):
-    # Create/Update crawler history
-    r_serv_onion.zadd('crawler_history_{}:{}:{}'.format(domain_type, domain, port), epoch_date, root_item)
-
-def create_item_metadata(item_id, domain, url, port, item_father):
-    r_serv_metadata.hset('paste_metadata:{}'.format(item_id), 'father', item_father)
-    r_serv_metadata.hset('paste_metadata:{}'.format(item_id), 'domain', '{}:{}'.format(domain, port))
-    r_serv_metadata.hset('paste_metadata:{}'.format(item_id), 'real_link', url)
-    # add this item_id to his father
-    r_serv_metadata.sadd('paste_children:{}'.format(item_father), item_id)
-
-def create_item_id(item_dir, domain):
-    if len(domain) > 215:
-        UUID = domain[-215:]+str(uuid.uuid4())
-    else:
-        UUID = domain+str(uuid.uuid4())
-    return os.path.join(item_dir, UUID)
-
-def save_crawled_item(item_id, item_content):
-    try:
-        gzipencoded = gzip.compress(item_content.encode())
-        gzip64encoded = base64.standard_b64encode(gzipencoded).decode()
-        return gzip64encoded
-    except:
-        print("file error: {}".format(item_id))
-        return False
-
-def save_har(har_dir, item_id, har_content):
-    if not os.path.exists(har_dir):
-        os.makedirs(har_dir)
-    item_id = item_id.split('/')[-1]
-    filename = os.path.join(har_dir, item_id + '.json')
-    with open(filename, 'w') as f:
-        f.write(json.dumps(har_content))
diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py
index 7df5e5de..96eb8bf6 100644
--- a/bin/torcrawler/TorSplashCrawler.py
+++ b/bin/torcrawler/TorSplashCrawler.py
@@ -28,7 +28,7 @@ from Helper import Process
 sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib'))
 #import ConfigLoader
 import Screenshot
-import crawler_splash
+import crawlers
 
 script_cookie = """
 function main(splash, args)
@@ -176,18 +176,18 @@ class TorSplashCrawler():
                 # detect connection to proxy refused
                 error_log = (json.loads(response.body.decode()))
                 print(error_log)
-            elif crawler_splash.is_redirection(self.domains[0], response.data['last_url']):
+            elif crawlers.is_redirection(self.domains[0], response.data['last_url']):
                 pass # ignore response
             else:
-                item_id = crawler_splash.create_item_id(self.item_dir, self.domains[0])
+                item_id = crawlers.create_item_id(self.item_dir, self.domains[0])
 
                 self.save_crawled_item(item_id, response.data['html'])
-                crawler_splash.create_item_metadata(item_id, self.domains[0], response.data['last_url'], self.port, response.meta['father'])
+                crawlers.create_item_metadata(item_id, self.domains[0], response.data['last_url'], self.port, response.meta['father'])
 
                 if self.root_key is None:
                     self.root_key = item_id
-                    crawler_splash.add_domain_root_item(item_id, self.domain_type, self.domains[0], self.date_epoch, self.port)
-                    crawler_splash.create_domain_metadata(self.domain_type, self.domains[0], self.port, self.full_date, self.date_month)
+                    crawlers.add_domain_root_item(item_id, self.domain_type, self.domains[0], self.date_epoch, self.port)
+                    crawlers.create_domain_metadata(self.domain_type, self.domains[0], self.port, self.full_date, self.date_month)
 
                 if 'cookies' in response.data:
                     all_cookies = response.data['cookies']
@@ -202,7 +202,7 @@ class TorSplashCrawler():
                         Screenshot.save_domain_relationship(sha256_string, self.domains[0])
                 # HAR
                 if 'har' in response.data:
-                    crawler_splash.save_har(self.har_dir, item_id, response.data['har'])
+                    crawlers.save_har(self.har_dir, item_id, response.data['har'])
 
                 le = LinkExtractor(allow_domains=self.domains, unique=True)
                 for link in le.extract_links(response):
@@ -247,7 +247,7 @@ class TorSplashCrawler():
            print(failure.type)
 
    def save_crawled_item(self, item_id, item_content):
-        gzip64encoded = crawler_splash.save_crawled_item(item_id, item_content)
+        gzip64encoded = crawlers.save_crawled_item(item_id, item_content)
 
        # Send item to queue
        # send paste to Global
diff --git a/bin/torcrawler/tor_crawler.py b/bin/torcrawler/tor_crawler.py
index 77fb9385..b74bfcc6 100755
--- a/bin/torcrawler/tor_crawler.py
+++ b/bin/torcrawler/tor_crawler.py
@@ -9,7 +9,7 @@ from TorSplashCrawler import TorSplashCrawler
 
 sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
 import ConfigLoader
-import crawler_splash
+import crawlers
 
 if __name__ == '__main__':
 
@@ -37,7 +37,7 @@ if __name__ == '__main__':
        crawler_options = crawler_json['crawler_options']
        date = crawler_json['date']
        requested_mode = crawler_json['requested']
-        cookies = crawler_splash.load_cookies('ccad0090-bdcb-4ba5-875b-3dae8f936216', domain, crawler_type=service_type)
+        cookies = crawlers.load_cookies('ccad0090-bdcb-4ba5-875b-3dae8f936216', domain, crawler_type=service_type)
 
        redis_cache.delete('crawler_request:{}'.format(uuid))
 
diff --git a/var/www/blueprints/crawler_splash.py b/var/www/blueprints/crawler_splash.py
index 6569ba47..6ea4e40f 100644
--- a/var/www/blueprints/crawler_splash.py
+++ b/var/www/blueprints/crawler_splash.py
@@ -10,7 +10,7 @@ import sys
 import json
 import random
 
-from flask import Flask, render_template, jsonify, request, Blueprint, redirect, url_for, Response
+from flask import Flask, render_template, jsonify, request, Blueprint, redirect, url_for, Response, make_response
 from flask_login import login_required, current_user, login_user, logout_user
 
 sys.path.append('modules')
@@ -25,7 +25,7 @@ import Tag
 
 sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib'))
 import Domain
-import crawler_splash
+import crawlers
 
 r_cache = Flask_config.r_cache
 r_serv_db = Flask_config.r_serv_db
@@ -44,7 +44,19 @@ def api_validator(api_response):
    if api_response:
        return Response(json.dumps(api_response[0], indent=2, sort_keys=True), mimetype='application/json'), api_response[1]
 
+def create_json_response(data, status_code):
+    return Response(json.dumps(data, indent=2, sort_keys=True), mimetype='application/json'), status_code
+
 # ============= ROUTES ==============
+@crawler_splash.route("/crawlers/manual", methods=['GET'])
+#@login_required
+#@login_read_only
+def manual():
+    user_id = current_user.get_id()
+    l_cookies = crawlers.api_get_cookies_list(user_id)
+    return render_template("crawler_manual.html", crawler_enabled=True, l_cookies=l_cookies)
+
+
 # add route : /crawlers/show_domain
 @crawler_splash.route('/crawlers/showDomain', methods=['GET', 'POST'])
 @login_required
@@ -194,18 +206,30 @@ def crawler_cookies_add_post():
            l_manual_cookie.append(cookie_dict)
        elif l_input[1]: # cookie_value
            l_invalid_cookie.append({'name': '', 'value': l_input[1]})
-        else:
-            #print(l_input)
-            pass
+    if l_invalid_cookie:
+        return create_json_response({'error': 'invalid cookie', 'invalid fileds': l_invalid_cookie}, 400)
 
-    cookie_uuid = crawler_splash.save_cookies(user_id, json_cookies=json_file, l_cookies=l_manual_cookie, level=level, description=description)
-    return render_template("add_cookies.html")
+    cookies_uuid = crawler_splash.save_cookies(user_id, json_cookies=json_file, l_cookies=l_manual_cookie, level=level, description=description)
+    return redirect(url_for('crawler_splash.crawler_cookies_all', cookies_uuid=cookies_uuid))
 
 @crawler_splash.route('/crawler/cookies/all', methods=['GET'])
 #@login_required
 #@login_read_only
 def crawler_cookies_all():
-    user_id = current_user.get_id(user_id)
-    user_cookies = crawler_splash.get_user_cookies(user_id)
-    global_cookies = crawler_splash.get_all_global_cookies()
-    return render_template("add_cookies.html", user_cookies=user_cookies, global_cookies=global_cookies)
+    user_id = current_user.get_id()
+    user_cookies = crawlers.get_all_user_cookies_metadata(user_id)
+    global_cookies = crawlers.get_all_global_cookies_metadata()
+    return render_template("all_cookies.html", user_cookies=user_cookies, global_cookies=global_cookies)
+
+@crawler_splash.route('/crawler/cookies/show', methods=['GET'])
+#@login_required
+#@login_read_only
+def crawler_cookies_show():
+    user_id = current_user.get_id()
+    cookies_uuid = request.args.get('cookies_uuid')
+    res = crawlers.api_get_cookies(cookies_uuid, user_id)
+    if res[1] !=200:
+        return create_json_response(res[0], res[1])
+    cookies_json = json.dumps(res[0]['json_cookies'], indent=4, sort_keys=True)
+    cookie_metadata = crawlers.get_cookies_metadata(cookies_uuid)
+    return render_template("edit_cookies.html", cookie_metadata=cookie_metadata, cookies_json=cookies_json, manual_cookies=res[0]['manual_cookies'])
diff --git a/var/www/modules/hiddenServices/Flask_hiddenServices.py b/var/www/modules/hiddenServices/Flask_hiddenServices.py
index 0db6bbe6..0b4bb2be 100644
--- a/var/www/modules/hiddenServices/Flask_hiddenServices.py
+++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py
@@ -30,6 +30,9 @@ r_serv_metadata = Flask_config.r_serv_metadata
 crawler_enabled = Flask_config.crawler_enabled
 bootstrap_label = Flask_config.bootstrap_label
 
+sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib'))
+import crawlers
+
 hiddenServices = Blueprint('hiddenServices', __name__, template_folder='templates')
 
 faup = Faup()
@@ -257,12 +260,6 @@ def dashboard():
                            crawler_metadata_regular=crawler_metadata_regular,
                            statDomains_onion=statDomains_onion, statDomains_regular=statDomains_regular)
 
-@hiddenServices.route("/crawlers/manual", methods=['GET'])
-@login_required
-@login_read_only
-def manual():
-    return render_template("Crawler_Splash_manual.html", crawler_enabled=crawler_enabled)
-
 @hiddenServices.route("/crawlers/crawler_splash_onion", methods=['GET'])
 @login_required
 @login_read_only
@@ -475,7 +472,7 @@ def create_spider_splash():
        create_crawler_config(mode, service_type, crawler_config, domain, url=url)
        send_url_to_crawl_in_queue(mode, service_type, url)
 
-    return redirect(url_for('hiddenServices.manual'))
+    return redirect(url_for('crawler_splash.manual'))
 
 @hiddenServices.route("/crawlers/auto_crawler", methods=['GET'])
 @login_required
diff --git a/var/www/templates/crawler/crawler_splash/add_cookies.html b/var/www/templates/crawler/crawler_splash/add_cookies.html
index d2820a69..801212a3 100644
--- a/var/www/templates/crawler/crawler_splash/add_cookies.html
+++ b/var/www/templates/crawler/crawler_splash/add_cookies.html
@@ -2,7 +2,7 @@
-
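
Not part of the diff above: a minimal, self-contained sketch of the cookie-translation step that this change moves from the deleted bin/lib/crawler_splash.py into the new crawlers module, i.e. how a browser cookie export is turned into the dict handed to Splash. The function name and sample values below are illustrative only; the export field names ('Host raw', 'Name raw', ...) and the 10-day expiry mirror the deleted helper, and no Redis or AIL setup is required to run it.

```python
#!/usr/bin/env python3
# Standalone illustration (hypothetical helper, not part of this commit).
from datetime import datetime, timedelta
from urllib.parse import urlparse

def cookie_dict_from_browser_export(browser_cookie, crawler_type='regular'):
    # 'Host raw' looks like 'http://example.onion/': keep only the hostname
    domain = urlparse(browser_cookie['Host raw']).netloc.split(':', 1)[0]
    cookie = {
        'name': browser_cookie['Name raw'],
        'value': browser_cookie['Content raw'],
        'path': browser_cookie['Path raw'],
        'domain': domain,
        'httpOnly': browser_cookie['HTTP only raw'] == 'true',
        'secure': browser_cookie['Send for'] == 'Encrypted connections only',
    }
    # Onion crawls go over plain HTTP behind the Tor proxy: drop the secure flag
    if crawler_type == 'onion':
        cookie['secure'] = False
    # Same 10-day expiry the deleted helper applied before handing cookies to Splash
    cookie['expires'] = (datetime.now() + timedelta(days=10)).strftime('%Y-%m-%dT%H:%M:%S') + 'Z'
    return cookie

if __name__ == '__main__':
    exported = {
        'Host raw': 'http://example.onion/',
        'Name raw': 'session',
        'Content raw': 'deadbeef',
        'Path raw': '/',
        'HTTP only raw': 'true',
        'Send for': 'Encrypted connections only',
    }
    print(cookie_dict_from_browser_export(exported, crawler_type='onion'))
```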