From 9868833c77b308170378ca63a5bcb089b92e4867 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Wed, 24 Apr 2019 14:09:04 +0200
Subject: [PATCH] chg: [crawled screenshot] use sha256 as filepath

---
 bin/packages/HiddenServices.py               |  7 +--
 bin/torcrawler/TorSplashCrawler.py           | 45 ++++++++++++++------
 var/www/modules/Flask_config.py              |  2 +-
 var/www/modules/showpaste/Flask_showpaste.py | 10 ++++-
 4 files changed, 45 insertions(+), 19 deletions(-)

diff --git a/bin/packages/HiddenServices.py b/bin/packages/HiddenServices.py
index 39fd4427..79876c4c 100755
--- a/bin/packages/HiddenServices.py
+++ b/bin/packages/HiddenServices.py
@@ -233,9 +233,10 @@ class HiddenServices(object):
             origin_paste = paste
             paste= paste.replace(self.paste_directory+'/', '')
 
-            paste = paste.replace(self.paste_crawled_directory_name, '')
-            if os.path.isfile( '{}{}.png'.format(self.screenshot_directory, paste) ):
-                l_screenshot_paste.append({'screenshot': paste[1:], 'item': origin_paste})
+            screenshot = self.r_serv_metadata.hget('paste_metadata:{}'.format(paste), 'screenshot')
+            if screenshot:
+                screenshot = os.path.join(screenshot[0:2], screenshot[2:4], screenshot[4:6], screenshot[6:8], screenshot[8:10], screenshot[10:12], screenshot[12:])
+                l_screenshot_paste.append({'screenshot': screenshot, 'item': origin_paste})
 
         if len(l_screenshot_paste) > num_screenshot:
             l_random_screenshot = []
diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py
index a83ff185..ac42fdf8 100644
--- a/bin/torcrawler/TorSplashCrawler.py
+++ b/bin/torcrawler/TorSplashCrawler.py
@@ -12,6 +12,8 @@ import redis
 import json
 import time
 
+from hashlib import sha256
+
 from scrapy.spidermiddlewares.httperror import HttpError
 from twisted.internet.error import DNSLookupError
 from twisted.internet.error import TimeoutError
@@ -103,7 +105,8 @@ class TorSplashCrawler():
             self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"), self.p.config.get("Directories", "crawled"), date_str )
 
-            self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date_str )
+            self.crawled_har = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date_str )
+            self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot") )
 
         def start_requests(self):
             yield SplashRequest(
@@ -135,13 +138,13 @@ class TorSplashCrawler():
                 UUID = self.domains[0][-215:]+str(uuid.uuid4())
             else:
                 UUID = self.domains[0]+str(uuid.uuid4())
-            filename_paste = os.path.join(self.crawled_paste_filemame, UUID)
+            filename_paste_full = os.path.join(self.crawled_paste_filemame, UUID)
             relative_filename_paste = os.path.join(self.crawler_path, UUID)
-            filename_screenshot = os.path.join(self.crawled_screenshot, UUID +'.png')
+            filename_har = os.path.join(self.crawled_har, UUID +'.png')
 
             # # TODO: modify me
             # save new paste on disk
-            if self.save_crawled_paste(filename_paste, response.data['html']):
+            if self.save_crawled_paste(relative_filename_paste, response.data['html']):
 
                 # add this paste to the domain crawled set # TODO: # FIXME: put this on cache ?
                 #self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste)
@@ -170,14 +173,14 @@ class TorSplashCrawler():
                     self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'ports', ';'.join(all_domain_ports))
 
                 #create paste metadata
-                self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'super_father', self.root_key)
-                self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'father', response.meta['father'])
-                self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'domain', '{}:{}'.format(self.domains[0], self.port))
-                self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'real_link', response.url)
+                self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'super_father', self.root_key)
+                self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'father', response.meta['father'])
+                self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'domain', '{}:{}'.format(self.domains[0], self.port))
+                self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'real_link', response.url)
 
-                self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], filename_paste)
+                self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], relative_filename_paste)
 
-                dirname = os.path.dirname(filename_screenshot)
+                dirname = os.path.dirname(filename_har)
                 if not os.path.exists(dirname):
                     os.makedirs(dirname)
 
@@ -185,11 +188,27 @@
                 size_screenshot = (len(response.data['png'])*3) /4
                 if size_screenshot < 5000000: #bytes
-                    with open(filename_screenshot, 'wb') as f:
-                        f.write(base64.standard_b64decode(response.data['png'].encode()))
+                    image_content = base64.standard_b64decode(response.data['png'].encode())
+                    hash = sha256(image_content).hexdigest()
+                    print(hash)
+                    img_dir_path = os.path.join(hash[0:2], hash[2:4], hash[4:6], hash[6:8], hash[8:10], hash[10:12])
+                    filename_img = os.path.join(self.crawled_screenshot, 'screenshot', img_dir_path, hash[12:] +'.png')
+                    dirname = os.path.dirname(filename_img)
+                    if not os.path.exists(dirname):
+                        os.makedirs(dirname)
+                    if not os.path.exists(filename_img):
+                        with open(filename_img, 'wb') as f:
+                            f.write(image_content)
+                    # add item metadata
+                    self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'screenshot', hash)
+                    # add sha256 metadata
+                    self.r_serv_onion.zincrby('screenshot:{}'.format(hash), relative_filename_paste, 1)
 
                 if 'har' in response.data:
-                    with open(filename_screenshot+'har.txt', 'wb') as f:
+                    dirname = os.path.dirname(filename_har)
+                    if not os.path.exists(dirname):
+                        os.makedirs(dirname)
+                    with open(filename_har+'har.txt', 'wb') as f:
                         f.write(json.dumps(response.data['har']).encode())
 
             # save external links in set
diff --git a/var/www/modules/Flask_config.py b/var/www/modules/Flask_config.py
index fc972c25..311eaaff 100644
--- a/var/www/modules/Flask_config.py
+++ b/var/www/modules/Flask_config.py
@@ -167,7 +167,7 @@ dict_update_description = {'v1.5':{'nb_background_update': 4, 'update_warning_me
 UPLOAD_FOLDER = os.path.join(os.environ['AIL_FLASK'], 'submitted')
 
 PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes")) + '/'
-SCREENSHOT_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot"))
+SCREENSHOT_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot"), 'screenshot')
 
 REPO_ORIGIN = 'https://github.com/CIRCL/AIL-framework.git'
diff --git a/var/www/modules/showpaste/Flask_showpaste.py b/var/www/modules/showpaste/Flask_showpaste.py
index becf86d0..474280b5 100644
--- a/var/www/modules/showpaste/Flask_showpaste.py
+++ b/var/www/modules/showpaste/Flask_showpaste.py
@@ -40,6 +40,12 @@ showsavedpastes = Blueprint('showsavedpastes', __name__, template_folder='templa
 
 # ============ FUNCTIONS ============
 
+def get_item_screenshot_path(item):
+    screenshot = r_serv_metadata.hget('paste_metadata:{}'.format(item), 'screenshot')
+    if screenshot:
+        screenshot = os.path.join(screenshot[0:2], screenshot[2:4], screenshot[4:6], screenshot[6:8], screenshot[8:10], screenshot[10:12], screenshot[12:])
+    return screenshot
+
 def showpaste(content_range, requested_path):
     if PASTES_FOLDER not in requested_path:
         # remove full path
@@ -200,7 +206,7 @@ def showpaste(content_range, requested_path):
             crawler_metadata['domain'] = r_serv_metadata.hget('paste_metadata:'+requested_path, 'domain')
             crawler_metadata['paste_father'] = r_serv_metadata.hget('paste_metadata:'+requested_path, 'father')
             crawler_metadata['real_link'] = r_serv_metadata.hget('paste_metadata:'+requested_path,'real_link')
-            crawler_metadata['screenshot'] = paste.get_p_date_path()
+            crawler_metadata['screenshot'] = get_item_screenshot_path(requested_path)
         else:
             crawler_metadata['get_metadata'] = False
@@ -342,7 +348,7 @@ def show_item_min(requested_path , content_range=0):
             crawler_metadata['domain'] = r_serv_metadata.hget('paste_metadata:'+relative_path, 'domain')
             crawler_metadata['paste_father'] = r_serv_metadata.hget('paste_metadata:'+relative_path, 'father')
             crawler_metadata['real_link'] = r_serv_metadata.hget('paste_metadata:'+relative_path,'real_link')
-            crawler_metadata['screenshot'] = paste.get_p_rel_path()
+            crawler_metadata['screenshot'] = get_item_screenshot_path(relative_path)
         else:
             crawler_metadata['get_metadata'] = False
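
Note on the storage layout introduced above: the screenshot's sha256 hex digest is split into six two-character directory levels plus a filename remainder, the same os.path.join(...) split that appears in HiddenServices.py, TorSplashCrawler.py and Flask_showpaste.py. Below is a minimal standalone sketch of that scheme; the helper name screenshot_path_from_sha256 is illustrative and not part of the patch (the crawler appends '.png', while the Flask helper returns the extension-less relative path).

import os
from hashlib import sha256

def screenshot_path_from_sha256(digest):
    # Six 2-character directory levels from the first 12 hex characters,
    # then the remaining 52 characters as the filename stem.
    levels = [digest[i:i+2] for i in range(0, 12, 2)]
    return os.path.join(*levels, digest[12:] + '.png')

image_content = b'not-a-real-png'  # placeholder bytes for illustration
digest = sha256(image_content).hexdigest()
print(screenshot_path_from_sha256(digest))
# Prints a path of the form xx/xx/xx/xx/xx/xx/<remaining hex>.png,
# relative to the 'screenshot' directory under Directories/crawled_screenshot.

Because the path is derived from the image content, identical screenshots from different crawls map to the same file: the `if not os.path.exists(filename_img)` guard skips the duplicate write, and the screenshot:<hash> sorted set records every item that produced that image.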
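The unchanged size guard, size_screenshot = (len(response.data['png'])*3) /4, estimates the decoded image size from the base64 string alone: base64 encodes 3 bytes of binary data in 4 ASCII characters, so the 5 MB cap can be enforced without decoding first. A small sketch of the arithmetic, assuming standard '=' padding:

import base64

def estimated_decoded_size(b64_string):
    # 4 base64 characters carry 3 bytes; '=' padding makes this an
    # overestimate by at most 2 bytes.
    return (len(b64_string) * 3) // 4

encoded = base64.standard_b64encode(b'x' * 1000).decode()
assert abs(estimated_decoded_size(encoded) - 1000) <= 2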
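One caveat on the new reverse index: zincrby('screenshot:{}'.format(hash), relative_filename_paste, 1) uses the redis-py 2.x argument order (name, value, amount); redis-py 3.0 changed the signature to (name, amount, value), so the pinned client version matters here. Reading the index back is version-independent; a sketch with assumed connection settings (AIL normally takes these from its configuration file):

import redis

# Assumed host/port/db, for illustration only.
r_serv_onion = redis.StrictRedis(host='localhost', port=6379, db=0, decode_responses=True)

digest = 'aa' * 32  # placeholder sha256 hex digest
# All crawled items whose screenshot hashed to `digest`, most re-crawled first.
items = r_serv_onion.zrevrange('screenshot:{}'.format(digest), 0, -1, withscores=True)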