chg: [crawled screenshot] use sha256 as filepath

pull/342/head
Terrtia 2019-04-24 14:09:04 +02:00
parent 99b9c95638
commit 9868833c77
GPG Key ID: 1E1B1F50D84613D0
4 changed files with 45 additions and 19 deletions


@@ -233,9 +233,10 @@ class HiddenServices(object):
             origin_paste = paste
             paste= paste.replace(self.paste_directory+'/', '')
-            paste = paste.replace(self.paste_crawled_directory_name, '')
-            if os.path.isfile( '{}{}.png'.format(self.screenshot_directory, paste) ):
-                l_screenshot_paste.append({'screenshot': paste[1:], 'item': origin_paste})
+            screenshot = self.r_serv_metadata.hget('paste_metadata:{}'.format(paste), 'screenshot')
+            if screenshot:
+                screenshot = os.path.join(screenshot[0:2], screenshot[2:4], screenshot[4:6], screenshot[6:8], screenshot[8:10], screenshot[10:12], screenshot[12:])
+                l_screenshot_paste.append({'screenshot': screenshot, 'item': origin_paste})

         if len(l_screenshot_paste) > num_screenshot:
             l_random_screenshot = []
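
Note: the lookup above rebuilds the screenshot path from the sha256 hex digest stored in paste_metadata. A minimal standalone sketch of that scheme (the helper name is illustrative, not part of the patch):

# Sketch (not part of the patch): how a sha256 hex digest maps to the nested
# screenshot path used above. The helper name is illustrative.
import os
from hashlib import sha256

def screenshot_path_from_hash(hash_hex):
    # six 2-character directory levels, remaining 52 hex characters as the filename
    return os.path.join(hash_hex[0:2], hash_hex[2:4], hash_hex[4:6],
                        hash_hex[6:8], hash_hex[8:10], hash_hex[10:12],
                        hash_hex[12:])

digest = sha256(b'example screenshot bytes').hexdigest()   # 64 hex characters
print(screenshot_path_from_hash(digest))                   # e.g. 'aa/bb/cc/dd/ee/ff/<52 remaining characters>'

Splitting the digest into 2-character segments keeps any single directory from accumulating an unbounded number of screenshot files.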


@@ -12,6 +12,8 @@ import redis
 import json
 import time
+from hashlib import sha256
 from scrapy.spidermiddlewares.httperror import HttpError
 from twisted.internet.error import DNSLookupError
 from twisted.internet.error import TimeoutError
@@ -103,7 +105,8 @@ class TorSplashCrawler():
             self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"),
                                             self.p.config.get("Directories", "crawled"), date_str )
-            self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date_str )
+            self.crawled_har = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date_str )
+            self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot") )

         def start_requests(self):
             yield SplashRequest(
@@ -135,13 +138,13 @@ class TorSplashCrawler():
                 UUID = self.domains[0][-215:]+str(uuid.uuid4())
             else:
                 UUID = self.domains[0]+str(uuid.uuid4())
-            filename_paste = os.path.join(self.crawled_paste_filemame, UUID)
+            filename_paste_full = os.path.join(self.crawled_paste_filemame, UUID)
             relative_filename_paste = os.path.join(self.crawler_path, UUID)
-            filename_screenshot = os.path.join(self.crawled_screenshot, UUID +'.png')
+            filename_har = os.path.join(self.crawled_har, UUID +'.png')

             # # TODO: modify me
             # save new paste on disk
-            if self.save_crawled_paste(filename_paste, response.data['html']):
+            if self.save_crawled_paste(relative_filename_paste, response.data['html']):

                 # add this paste to the domain crawled set # TODO: # FIXME: put this on cache ?
                 #self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste)
@@ -170,14 +173,14 @@ class TorSplashCrawler():
                     self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'ports', ';'.join(all_domain_ports))

                 #create paste metadata
-                self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'super_father', self.root_key)
-                self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'father', response.meta['father'])
-                self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'domain', '{}:{}'.format(self.domains[0], self.port))
-                self.r_serv_metadata.hset('paste_metadata:{}'.format(filename_paste), 'real_link', response.url)
+                self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'super_father', self.root_key)
+                self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'father', response.meta['father'])
+                self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'domain', '{}:{}'.format(self.domains[0], self.port))
+                self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'real_link', response.url)

-                self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], filename_paste)
+                self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], relative_filename_paste)

-                dirname = os.path.dirname(filename_screenshot)
+                dirname = os.path.dirname(filename_har)
                 if not os.path.exists(dirname):
                     os.makedirs(dirname)
@@ -185,11 +188,27 @@ class TorSplashCrawler():

                 size_screenshot = (len(response.data['png'])*3) /4
                 if size_screenshot < 5000000: #bytes
-                    with open(filename_screenshot, 'wb') as f:
-                        f.write(base64.standard_b64decode(response.data['png'].encode()))
+                    image_content = base64.standard_b64decode(response.data['png'].encode())
+                    hash = sha256(image_content).hexdigest()
+                    print(hash)
+                    img_dir_path = os.path.join(hash[0:2], hash[2:4], hash[4:6], hash[6:8], hash[8:10], hash[10:12])
+                    filename_img = os.path.join(self.crawled_screenshot, 'screenshot', img_dir_path, hash[12:] +'.png')
+                    dirname = os.path.dirname(filename_img)
+                    if not os.path.exists(dirname):
+                        os.makedirs(dirname)
+                    if not os.path.exists(filename_img):
+                        with open(filename_img, 'wb') as f:
+                            f.write(image_content)
+                    # add item metadata
+                    self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'screenshot', hash)
+                    # add sha256 metadata
+                    self.r_serv_onion.zincrby('screenshot:{}'.format(hash), relative_filename_paste, 1)

                 if 'har' in response.data:
-                    with open(filename_screenshot+'har.txt', 'wb') as f:
+                    dirname = os.path.dirname(filename_har)
+                    if not os.path.exists(dirname):
+                        os.makedirs(dirname)
+                    with open(filename_har+'har.txt', 'wb') as f:
                         f.write(json.dumps(response.data['har']).encode())

                 # save external links in set
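
The size check in this hunk estimates the decoded size straight from the base64 string (every 4 base64 characters decode to 3 bytes, hence len*3/4), and the image is only written when no file with the same digest already exists. A self-contained sketch of that decode/hash/dedup sequence, with placeholder data and an illustrative base directory:

# Illustrative sketch of the screenshot save path. The 5 MB limit and the
# hash-based layout follow the patch; the input bytes and base directory are placeholders.
import base64
import os
from hashlib import sha256

png_b64 = base64.standard_b64encode(b'fake screenshot bytes').decode()   # placeholder Splash 'png' payload

size_estimate = (len(png_b64) * 3) / 4            # decoded size: 4 base64 chars -> 3 bytes
if size_estimate < 5000000:
    image_content = base64.standard_b64decode(png_b64.encode())
    digest = sha256(image_content).hexdigest()
    img_dir_path = os.path.join(digest[0:2], digest[2:4], digest[4:6],
                                digest[6:8], digest[8:10], digest[10:12])
    filename_img = os.path.join('/tmp/ail_screenshots', 'screenshot',
                                img_dir_path, digest[12:] + '.png')       # base dir is illustrative
    os.makedirs(os.path.dirname(filename_img), exist_ok=True)
    if not os.path.exists(filename_img):          # identical screenshots are stored only once
        with open(filename_img, 'wb') as f:
            f.write(image_content)

Storing screenshots by content hash deduplicates identical captures across crawls; the screenshot:{hash} sorted set updated above keeps the reverse mapping from a digest back to the items that referenced it.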


@@ -167,7 +167,7 @@ dict_update_description = {'v1.5':{'nb_background_update': 4, 'update_warning_me
 UPLOAD_FOLDER = os.path.join(os.environ['AIL_FLASK'], 'submitted')
 PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes")) + '/'
-SCREENSHOT_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot"))
+SCREENSHOT_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot"), 'screenshot')
 REPO_ORIGIN = 'https://github.com/CIRCL/AIL-framework.git'


@@ -40,6 +40,12 @@ showsavedpastes = Blueprint('showsavedpastes', __name__, template_folder='templa
 # ============ FUNCTIONS ============

+def get_item_screenshot_path(item):
+    screenshot = r_serv_metadata.hget('paste_metadata:{}'.format(item), 'screenshot')
+    if screenshot:
+        screenshot = os.path.join(screenshot[0:2], screenshot[2:4], screenshot[4:6], screenshot[6:8], screenshot[8:10], screenshot[10:12], screenshot[12:])
+    return screenshot
+
 def showpaste(content_range, requested_path):
     if PASTES_FOLDER not in requested_path:
         # remove full path
@@ -200,7 +206,7 @@ def showpaste(content_range, requested_path):
         crawler_metadata['domain'] = r_serv_metadata.hget('paste_metadata:'+requested_path, 'domain')
         crawler_metadata['paste_father'] = r_serv_metadata.hget('paste_metadata:'+requested_path, 'father')
         crawler_metadata['real_link'] = r_serv_metadata.hget('paste_metadata:'+requested_path,'real_link')
-        crawler_metadata['screenshot'] = paste.get_p_date_path()
+        crawler_metadata['screenshot'] = get_item_screenshot_path(requested_path)
     else:
         crawler_metadata['get_metadata'] = False
@@ -342,7 +348,7 @@ def show_item_min(requested_path , content_range=0):
         crawler_metadata['domain'] = r_serv_metadata.hget('paste_metadata:'+relative_path, 'domain')
         crawler_metadata['paste_father'] = r_serv_metadata.hget('paste_metadata:'+relative_path, 'father')
         crawler_metadata['real_link'] = r_serv_metadata.hget('paste_metadata:'+relative_path,'real_link')
-        crawler_metadata['screenshot'] = paste.get_p_rel_path()
+        crawler_metadata['screenshot'] = get_item_screenshot_path(relative_path)
     else:
         crawler_metadata['get_metadata'] = False
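
With SCREENSHOT_FOLDER now pointing at the 'screenshot' subdirectory, the relative path returned by get_item_screenshot_path can be joined to it to reach the file the crawler wrote (the serving template presumably appends the '.png' extension). A small sketch of that resolution, using placeholder values for AIL_HOME and the configured directory:

# Sketch (not part of the patch): resolving a stored screenshot hash to the file on disk.
# The AIL_HOME default and directory name stand in for the real config values.
import os

AIL_HOME = os.environ.get('AIL_HOME', '/opt/AIL-framework')
CRAWLED_SCREENSHOT_DIR = os.path.join(AIL_HOME, 'crawled_screenshot')   # stands in for cfg.get("Directories", "crawled_screenshot")
SCREENSHOT_FOLDER = os.path.join(CRAWLED_SCREENSHOT_DIR, 'screenshot')

screenshot_hash = 'ab' * 32                        # placeholder 64-character sha256 hex digest
relative_path = os.path.join(screenshot_hash[0:2], screenshot_hash[2:4], screenshot_hash[4:6],
                             screenshot_hash[6:8], screenshot_hash[8:10], screenshot_hash[10:12],
                             screenshot_hash[12:])
full_path = os.path.join(SCREENSHOT_FOLDER, relative_path + '.png')
print(full_path)   # .../crawled_screenshot/screenshot/ab/ab/ab/ab/ab/ab/abab...ab.png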